diff --git a/.clang-format b/.clang-format
new file mode 100644
index 00000000..e61dfc9c
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,59 @@
+---
+BasedOnStyle: Mozilla
+AlignTrailingComments: true
+AllowAllParametersOfDeclarationOnNextLine: true
+AllowShortBlocksOnASingleLine: true
+AllowShortCaseLabelsOnASingleLine: true
+AllowShortFunctionsOnASingleLine: All
+AllowShortIfStatementsOnASingleLine: false
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakAfterReturnType: None
+AlwaysBreakAfterDefinitionReturnType: None
+AlwaysBreakTemplateDeclarations: true
+BinPackArguments: false
+BinPackParameters: false
+BreakBeforeBraces: Custom
+BraceWrapping:
+  AfterClass: true
+  AfterControlStatement: true
+  AfterEnum: true
+  AfterFunction: true
+  AfterNamespace: false
+  AfterObjCDeclaration: false
+  AfterStruct: true
+  AfterUnion: true
+  AfterExternBlock: true
+  BeforeCatch: true
+  BeforeElse: true
+  IndentBraces: false
+  SplitEmptyFunction: false
+  SplitEmptyRecord: false
+  SplitEmptyNamespace: false
+BreakBeforeInheritanceComma: false
+ColumnLimit: 120
+CompactNamespaces: false
+ConstructorInitializerAllOnOneLineOrOnePerLine: false
+Cpp11BracedListStyle: true
+DerivePointerAlignment: false
+IndentCaseLabels: true
+IndentWidth: 4
+KeepEmptyLinesAtTheStartOfBlocks: false
+Language: Cpp
+PointerAlignment: Left
+ReflowComments: true
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeParens: Never
+SpaceInEmptyParentheses: false
+SpacesInContainerLiterals: false
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+Standard: "c++11"
+IncludeCategories:
+  - Regex:           '^".*"'
+    Priority:        1
+  - Regex:           '^<popsift/.*>'
+    Priority:        2
+  - Regex:           '^<.*\..*>'
+    Priority:        3
+SortIncludes: true
+IncludeBlocks: Regroup
diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md
new file mode 100644
index 00000000..a3b071e4
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/bug_report.md
@@ -0,0 +1,36 @@
+---
+name: Bug report
+about: Create a report to help us improve
+title: "[bug]"
+labels: bug
+assignees: ''
+
+---
+
+**Describe the bug**
+A clear and concise description of what the bug is.
+
+**To Reproduce**
+Steps to reproduce the behavior:
+1. Go to '...'
+2. Click on '....'
+3. Scroll down to '....'
+4. See error
+
+**Expected behavior**
+A clear and concise description of what you expected to happen.
+
+**Screenshots**
+If applicable, add screenshots to help explain your problem.
+
+**Log**
+If applicable, copy and paste the relevant log output (please embed the text in a markdown code block "\`\`\`")
+
+**Desktop (please complete the following and other pertinent information):**
+ - OS: [e.g. win 10, osx, ]
+ - PopSift version: please specify if you are using a release version or your own build
+   - Binary version (if applicable) [e.g. 2019.1]
+   - Commit reference (if applicable) [e.g. 08ddbe2]
+
+**Additional context**
+Add any other context about the problem here.
diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md
new file mode 100644
index 00000000..52683c44
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/feature_request.md
@@ -0,0 +1,20 @@
+---
+name: Feature request
+about: Suggest an idea for this project
+title: "[request]"
+labels: feature request
+assignees: ''
+
+---
+
+**Is your feature request related to a problem? Please describe.**
+A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
+
+**Describe the solution you'd like**
+A clear and concise description of what you want to happen.
+
+**Describe alternatives you've considered**
+A clear and concise description of any alternative solutions or features you've considered.
+
+**Additional context**
+Add any other context or screenshots about the feature request here.
diff --git a/.github/ISSUE_TEMPLATE/question_help.md b/.github/ISSUE_TEMPLATE/question_help.md
new file mode 100644
index 00000000..71035cde
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/question_help.md
@@ -0,0 +1,29 @@
+---
+name: Question or help needed
+about: Ask a question or request help for issues not related to program failures (e.g. "where can I find this feature", "my image is not recognized", "which parameter setting shall I use" etc...)
+title: "[question]"
+labels: type:question
+assignees: ''
+
+---
+
+**Describe the problem**
+A clear and concise description of what the problem is.
+
+**Screenshots**
+If applicable, add screenshots to help explain your problem.
+
+**Dataset**
+If applicable, add a link or a *few* images to help better understand where the problem may come from.
+
+**Log**
+If applicable, copy and paste the relevant log output (please embed the text in a markdown code block "\`\`\`")
+
+**Desktop (please complete the following and other pertinent information):**
+ - OS: [e.g. win 10, osx, ]
+ - PopSift version: please specify if you are using a release version or your own build
+   - Binary version (if applicable) [e.g. 2019.1]
+   - Commit reference (if applicable) [e.g. 08ddbe2]
+
+**Additional context**
+Add any other context about the problem here.
diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
new file mode 100644
index 00000000..de3799ad
--- /dev/null
+++ b/.github/pull_request_template.md
@@ -0,0 +1,29 @@
+<!-- Checklist before submission:
+
+ - I have read the [contribution guidelines](../CONTRIBUTING.md).
+ - I have updated the documentation, if applicable.
+ - I have ensured that the change is tested somewhere.
+ - I have followed the prevailing code style (for history readability and to limit conflicts during maintenance).
+
+-->
+## Description
+
+
+
+## Features list
+
+<!--
+- [ ] Feature one. Fix #XXX
+- [ ] Improve something else
+- [ ] Connect to #3 (to declare a link to an issue without closing it when the PR is merged).
+- [X] Add "X" when it is done.
+-->
+
+
+## Implementation remarks
+
+
+<!--
+Explain main implementation choices.
+It is also the right place to ask for feedback and help when you hesitate on the implementation.
+-->
diff --git a/.github/stale.yml b/.github/stale.yml
new file mode 100644
index 00000000..006179d6
--- /dev/null
+++ b/.github/stale.yml
@@ -0,0 +1,28 @@
+# Number of days of inactivity before an issue becomes stale
+daysUntilStale: 120
+# Number of days of inactivity before a stale issue is closed
+daysUntilClose: 7
+# Issues or Pull Requests with these labels will never be considered stale. Set to `[]` to disable
+exemptLabels:
+  - "do not close"
+
+# Set to true to ignore issues in a project (defaults to false)
+exemptProjects: true
+
+# Set to true to ignore issues in a milestone (defaults to false)
+exemptMilestones: true
+
+# Set to true to ignore issues with an assignee (defaults to false)
+exemptAssignees: true
+
+# Label to use when marking an issue as stale
+staleLabel: stale
+# Comment to post when marking an issue as stale. Set to `false` to disable
+markComment: >
+  This issue has been automatically marked as stale because it has not had
+  recent activity. It will be closed if no further activity occurs. Thank you
+  for your contributions.
+# Comment to post when closing a stale issue. Set to `false` to disable
+closeComment: >
+  This issue is closed due to inactivity. Feel free to re-open if new information
+  is available.
diff --git a/.github/workflows/continuous-integration.yml b/.github/workflows/continuous-integration.yml
new file mode 100644
index 00000000..3f01875e
--- /dev/null
+++ b/.github/workflows/continuous-integration.yml
@@ -0,0 +1,69 @@
+name: Continuous Integration
+
+on:
+  push:
+    branches:
+      - master
+      - develop
+    # Skip jobs when only documentation files (Markdown, reST, the doc/ tree) are changed
+    paths-ignore:
+      - '**.md'
+      - '**.rst'
+      - 'doc/**'
+  pull_request:
+    paths-ignore:
+      - '**.md'
+      - '**.rst'
+      - 'doc/**'
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        container: ["alicevision/popsift-deps:cuda11.8.0-ubuntu20.04", "alicevision/popsift-deps:cuda12.1.0-ubuntu22.04"]
+        build_tpe: ["Release", "Debug"]
+        exclude:
+        # excludes debug on this one as it has a segmentation fault during the compilation (!)
+        - container: "alicevision/popsift-deps:cuda11.8.0-ubuntu20.04"
+          build_tpe: "Debug"
+
+    container:
+      image: ${{ matrix.container }}
+
+    env:
+      DEPS_INSTALL_DIR: /opt/
+      BUILD_TYPE: ${{ matrix.build_tpe }}
+      CTEST_OUTPUT_ON_FAILURE: 1
+    steps:
+      - uses: actions/checkout@v2
+
+      - name: Prepare File Tree
+        run: |
+          mkdir ./build
+          mkdir ./build_as_3rdparty
+          mkdir ../popsift_install
+
+      - name: Configure CMake
+        working-directory: ./build
+        run: |
+          cmake .. \
+           -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
+           -DBUILD_SHARED_LIBS:BOOL=ON \
+           -DCMAKE_PREFIX_PATH="${DEPS_INSTALL_DIR}" \
+           -DPopSift_BUILD_DOCS:BOOL=OFF \
+           -DCMAKE_INSTALL_PREFIX:PATH=$PWD/../../popsift_install
+
+      - name: Build
+        working-directory: ./build
+        run: |
+          make -j$(nproc) install
+
+      - name: Build As Third Party
+        working-directory: ./build_as_3rdparty
+        run: |
+          cmake ../src/application \
+           -DBUILD_SHARED_LIBS:BOOL=ON \
+           -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
+           -DCMAKE_PREFIX_PATH:PATH="$PWD/../../popsift_install;${DEPS_INSTALL_DIR}"
+          make -j$(nproc)
diff --git a/.gitignore b/.gitignore
index d93abdc6..c44a8393 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,6 +4,7 @@ popsift/playground/try-gauss
 popsift/playground/try-gauss-param
 popsift/playground/try-gauss-interpolate
 popsift/playground/try-libav
+oxford
 
 # Prerequisites
 *.d
@@ -44,3 +45,5 @@ popsift/playground/try-libav
 # Temporary files
 .DS_Store
 
+# Downloaded archives for tests.
+*.tgz
diff --git a/.readthedocs.yml b/.readthedocs.yml
new file mode 100644
index 00000000..79484e63
--- /dev/null
+++ b/.readthedocs.yml
@@ -0,0 +1,19 @@
+# .readthedocs.yml
+# Read the Docs configuration file
+# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
+
+# Required
+version: 2
+
+# Build documentation in the doc/ directory with Sphinx
+sphinx:
+  configuration: doc/sphinx/source/conf.py
+
+# Optionally build your docs in additional formats such as PDF and ePub
+formats: all
+
+# Optionally set the version of Python and requirements required to build your docs
+python:
+  version: 3.7
+  install:
+    - requirements: doc/sphinx/requirements.txt
diff --git a/.travis.yml b/.travis.yml
index 45e77f20..030cb775 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,40 +1,38 @@
-dist: trusty
-sudo: required
+dist: jammy
 
 language: cpp
 compiler: gcc
 
-cache:
-  apt: true
-
 addons:
   apt:
     packages:
-      - libboost-filesystem1.55-dev 
-      - libboost-system1.55-dev
-      - libboost-program-options1.55-dev
-      - libboost-thread1.55-dev
+      - libboost-filesystem-dev
+      - libboost-system-dev
+      - libboost-program-options-dev
+      - libboost-thread-dev
 
 env:
   matrix:
-    - CUDA_VERSION_MAJOR="7" CUDA_VERSION_MINOR="0" CUDA_PKG_LONGVERSION="${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR}-28" CUDA_PKG_VERSION="${CUDA_VERSION_MAJOR}-${CUDA_VERSION_MINOR}"
-    - CUDA_VERSION_MAJOR="7" CUDA_VERSION_MINOR="5" CUDA_PKG_LONGVERSION="${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR}-18" CUDA_PKG_VERSION="${CUDA_VERSION_MAJOR}-${CUDA_VERSION_MINOR}"
-    - CUDA_VERSION_MAJOR="8" CUDA_VERSION_MINOR="0" CUDA_PKG_LONGVERSION="${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR}.61-1" CUDA_PKG_VERSION="${CUDA_VERSION_MAJOR}-${CUDA_VERSION_MINOR}" 
-  
+    - CUDA_VERSION_MAJOR="11" CUDA_VERSION_MINOR="8" CUDA_PKG_LONGVERSION="${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR}.0-1" CUDA_PKG_VERSION="${CUDA_VERSION_MAJOR}-${CUDA_VERSION_MINOR}"
+    - CUDA_VERSION_MAJOR="12" CUDA_VERSION_MINOR="5" CUDA_PKG_LONGVERSION="${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR}.1-1" CUDA_PKG_VERSION="${CUDA_VERSION_MAJOR}-${CUDA_VERSION_MINOR}"
+
+
   global:
     - NUM_CPU="`grep processor /proc/cpuinfo | wc -l`"; echo $NUM_CPU
-    - BUILD_TYPE="RELEASE"
     - BUILD_SYSTEM="`uname -s`"
     - BUILD_PROCESSOR="`uname -p`"
     - POPSIFT_SOURCE=${TRAVIS_BUILD_DIR}
-    - POPSIFT_BUILD=${TRAVIS_BUILD_DIR}/build
-    - POPSIFT_INSTALL=${POPSIFT_BUILD}/install
+    - POPSIFT_BUILD_RELEASE=${TRAVIS_BUILD_DIR}/build_release
+    - POPSIFT_BUILD_DEBUG=${TRAVIS_BUILD_DIR}/build_debug
+    - POPSIFT_INSTALL_RELEASE=${POPSIFT_BUILD_RELEASE}/install
+    - POPSIFT_INSTALL_DEBUG=${POPSIFT_BUILD_DEBUG}/install
     - POPSIFT_APP_SRC=${POPSIFT_SOURCE}/src/application
-    - POPSIFT_APP_BUILD=${POPSIFT_APP_SRC}/build
-    - POPSIFT_APP_INSTALL=${POPSIFT_APP_BUILD}/install
+    - POPSIFT_APP_BUILD_RELEASE=${POPSIFT_APP_SRC}/build_release
+    - POPSIFT_APP_BUILD_DEBUG=${POPSIFT_APP_SRC}/build_debug
+    - POPSIFT_APP_INSTALL_RELEASE=${POPSIFT_APP_BUILD_RELEASE}/install
+    - POPSIFT_APP_INSTALL_DEBUG=${POPSIFT_APP_BUILD_DEBUG}/install
     # CMAKE
-    # - CMAKE_URL="https://cmake.org/files/v3.6/cmake-3.6.1-Linux-x86_64.tar.gz"
-    - CMAKE_URL="https://cmake.org/files/v3.4/cmake-3.4.1-Linux-x86_64.tar.gz"
+    - CMAKE_URL="https://github.com/Kitware/CMake/releases/download/v3.30.1/cmake-3.30.1-linux-x86_64.tar.gz"
     - CMAKE_ROOT=${TRAVIS_BUILD_DIR}/cmake
     - CMAKE_SOURCE=${CMAKE_ROOT}/source
     - CMAKE_INSTALL=${CMAKE_ROOT}/install
@@ -57,34 +55,60 @@ before_install:
     fi
 
 install:
-  - CUDA_REPO_PKG=cuda-repo-ubuntu1404_${CUDA_PKG_LONGVERSION}_amd64.deb
-  - wget http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1404/x86_64/$CUDA_REPO_PKG
-  - sudo dpkg -i $CUDA_REPO_PKG
-  - rm ${CUDA_REPO_PKG}
-  - sudo apt-get -y update
-  - sudo apt-get install -y --no-install-recommends  cuda-core-$CUDA_PKG_VERSION  cuda-cudart-dev-$CUDA_PKG_VERSION  cuda-cublas-dev-$CUDA_PKG_VERSION cuda-curand-dev-$CUDA_PKG_VERSION
-  - sudo ln -s /usr/local/cuda-${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR} /usr/local/cuda
+  - UBUNTU_VERSION=ubuntu2204
+  - wget https://developer.download.nvidia.com/compute/cuda/repos/${UBUNTU_VERSION}/x86_64/cuda-${UBUNTU_VERSION}.pin
+  - travis_retry sudo mv cuda-${UBUNTU_VERSION}.pin /etc/apt/preferences.d/cuda-repository-pin-600
+  - travis_retry sudo apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos/${UBUNTU_VERSION}/x86_64/3bf863cc.pub
+  - travis_retry sudo add-apt-repository "deb http://developer.download.nvidia.com/compute/cuda/repos/${UBUNTU_VERSION}/x86_64/ /"
+  - sudo apt-get update && sudo apt-get -y install cuda
+#  - CUDA_REPO_PKG=cuda-repo-${UBUNTU_VERSION}_${CUDA_PKG_LONGVERSION}_amd64.deb
+#  - wget http://developer.download.nvidia.com/compute/cuda/repos/${UBUNTU_VERSION}/x86_64/$CUDA_REPO_PKG
+
+
 
 before_script:
-  # Create build folder
-  - mkdir -p ${POPSIFT_BUILD}
-  - cd ${POPSIFT_BUILD}
   # Classic release build
+  # Create build folder
+  - mkdir -p ${POPSIFT_BUILD_RELEASE}
+  - cd ${POPSIFT_BUILD_RELEASE}
   - >
-     cmake . ${POPSIFT_SOURCE} -DCMAKE_INSTALL_PREFIX=${POPSIFT_INSTALL}
+     cmake . ${POPSIFT_SOURCE} -DCMAKE_INSTALL_PREFIX=${POPSIFT_INSTALL_RELEASE} -DCMAKE_BUILD_TYPE=Release -DPopSift_BUILD_DOCS:BOOL=OFF
+
+  # Classic debug build
+  # Create build folder
+  - mkdir -p ${POPSIFT_BUILD_DEBUG}
+  - cd ${POPSIFT_BUILD_DEBUG}
+  - >
+   cmake . ${POPSIFT_SOURCE} -DCMAKE_INSTALL_PREFIX=${POPSIFT_INSTALL_DEBUG} -DCMAKE_BUILD_TYPE=Debug -DPopSift_BUILD_DOCS:BOOL=OFF
 
 script:
+  - cd ${POPSIFT_BUILD_RELEASE}
 # limit GCC builds to a reduced number of thread for the virtual machine
   - make install -j 2 VERBOSE=1
 # Perform unit tests
   # - make test
 # Perform tests building application with PopSift as 3rd party
   - cd ${POPSIFT_APP_SRC}
-  - mkdir -p ${POPSIFT_APP_BUILD}
-  - cd ${POPSIFT_APP_BUILD}
-  - cmake .. -DPopSift_DIR=${POPSIFT_INSTALL}/lib/cmake/PopSift/ -DCMAKE_INSTALL_PREFIX=${POPSIFT_APP_INSTALL}
+  - mkdir -p ${POPSIFT_APP_BUILD_RELEASE}
+  - cd ${POPSIFT_APP_BUILD_RELEASE}
+  - cmake .. -DPopSift_DIR=${POPSIFT_INSTALL_RELEASE}/lib/cmake/PopSift/ -DCMAKE_INSTALL_PREFIX=${POPSIFT_APP_INSTALL_RELEASE} -DCMAKE_BUILD_TYPE=Release -DPopSift_BUILD_DOCS:BOOL=OFF
+  - make install -j 2 VERBOSE=1
+
+# same for debug
+  - cd ${POPSIFT_BUILD_DEBUG}
+  # limit GCC builds to a reduced number of threads for the virtual machine
+  - make install -j 2 VERBOSE=1
+  # Perform unit tests
+  # - make test
+  # Perform tests building application with PopSift as 3rd party
+  - cd ${POPSIFT_APP_SRC}
+  - mkdir -p ${POPSIFT_APP_BUILD_DEBUG}
+  - cd ${POPSIFT_APP_BUILD_DEBUG}
+  - cmake .. -DPopSift_DIR=${POPSIFT_INSTALL_DEBUG}/lib/cmake/PopSift/ -DCMAKE_INSTALL_PREFIX=${POPSIFT_APP_INSTALL_DEBUG} -DCMAKE_BUILD_TYPE=Debug -DPopSift_BUILD_DOCS:BOOL=OFF
   - make install -j 2 VERBOSE=1
 
 cache:
+  apt: true
   directories:
     - ${CMAKE_INSTALL}
+
diff --git a/CHANGES.md b/CHANGES.md
new file mode 100644
index 00000000..1cfc0b51
--- /dev/null
+++ b/CHANGES.md
@@ -0,0 +1,71 @@
+# PopSift Changelog
+
+All notable changes to this project are documented in this file.
+
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+## [Unreleased]
+
+### Added
+
+### Changed
+
+### Fixed
+
+### Removed
+
+## [1.0.0] - YYYY-MM-DD
+
+## 2024
+
+- CMake: CUDA as first-order language, different CC selection
+
+### Added
+- Improved checks for CUDA textures [PR](https://github.com/alicevision/popsift/pull/89)
+- CMake: Improved support for all Cuda CC [PR](https://github.com/alicevision/popsift/pull/75)
+- CMake: support for cuda 11 [PR](https://github.com/alicevision/popsift/pull/103)
+- Support for Cuda CC 7 cards (RTX 2080) [PR](https://github.com/alicevision/popsift/pull/67)
+- Support for Boost 1.70 [PR](https://github.com/alicevision/popsift/pull/65)
+- Support for device selection and multiple GPUs [PR](https://github.com/alicevision/popsift/pull/121)
+
+### Fixed
+- CMake: fixes to allow building on Windows using vcpkg [PR](https://github.com/alicevision/popsift/pull/92)
+- Fix race condition [PR](https://github.com/alicevision/popsift/pull/82)
+
+### Changed
+- Improved resource releasing [PR](https://github.com/alicevision/popsift/pull/71)
+
+### Removed
+- Remove boost dependency from the main library [PR](https://github.com/alicevision/popsift/pull/81)
+
+
+## 2019
+
+- Bugfix: Support for images with different resolutions [PR](https://github.com/alicevision/popsift/pull/58)
+
+
+## 2018
+
+- CMake: Auto-build export symbols for shared libs on Windows [PR](https://github.com/alicevision/popsift/pull/54)
+- Bugfix: freeing page-aligned memory on Win32 [PR](https://github.com/alicevision/popsift/pull/53)
+- Paper published @MMSys18 (https://dl.acm.org/doi/10.1145/3204949.3208136)
+- Docker support [PR](https://github.com/alicevision/popsift/pull/46)
+- Appveyor CI windows [PR](https://github.com/alicevision/popsift/pull/41)
+- Support for Cuda 9 [PR](https://github.com/alicevision/popsift/pull/38)
+- Thrust with Cuda 7 [PR](https://github.com/alicevision/popsift/pull/35)
+
+
+## 2017
+
+- Grid filtering [PR](https://github.com/alicevision/popsift/pull/30)
+- Improved Gauss filtering [PR](https://github.com/alicevision/popsift/pull/24)
+- Support asynchronous SIFT extraction [PR](https://github.com/alicevision/popsift/pull/22)
+- Windows port [PR](https://github.com/alicevision/popsift/pull/18)
+
+
+## 2016
+
+- Switch to modern CMake [PR](https://github.com/alicevision/popsift/pull/14)
+- Travis CI Linux [PR](https://github.com/alicevision/popsift/pull/8)
+- First open-source release
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6ae39085..3e9138a8 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,146 +1,216 @@
-# CMake below 3.4 does not work with CUDA separable compilation at all
-cmake_minimum_required(VERSION 3.4)
+# CUDA as a first-class language needs CMake 3.18; the "all-major"/"native" CMAKE_CUDA_ARCHITECTURES values used below need 3.23/3.24
+cmake_minimum_required(VERSION 3.24)
+
+# Default selection of CUDA Compute Capabilities.
+# This must be called before project() or cmake sets it to the oldest non-deprecated CC
+# "all" and "all-major" work for Intel and perhaps for ARM with discrete GPUs, but not Tegra and Jetson.
+if(EXISTS "/etc/nv_tegra_release")
+  # The CC list for Tegras and Jetson will require manual updates
+  set(CMAKE_CUDA_ARCHITECTURES "53;62;72;87"
+      CACHE
+      STRING "Which CUDA CCs to support: native, all, all-major or an explicit list delimited by semicolons")
+else()
+  # The CC list for discrete GPUs will require CMake updates
+  set(CMAKE_CUDA_ARCHITECTURES "all-major"
+      CACHE
+      STRING "Which CUDA CCs to support: native, all, all-major or an explicit list delimited by semicolons")
+endif()
+
+project(PopSift VERSION 1.0.0 LANGUAGES CXX CUDA)
 
-project(PopSift VERSION 1.0.0)
+# Policy to support CUDA as a first-order language for CMake.
+# Since CMake 3.18. See https://cmake.org/cmake/help/latest/policy/CMP0104.html
+cmake_policy(SET CMP0104 NEW)
 
-OPTION(PopSift_BUILD_EXAMPLES "Build PopSift applications."  ON)
-OPTION(PopSift_USE_NVTX_PROFILING     "Use CUDA NVTX for profiling." OFF)
-OPTION(PopSift_ERRCHK_AFTER_KERNEL     "Synchronize and check CUDA error after every kernel." OFF)
-OPTION(PopSift_USE_POSITION_INDEPENDENT_CODE "Generate position independent code." ON)
-OPTION(PopSift_USE_GRID_FILTER "Switch off grid filtering to massively reduce compile time while debugging other things." ON)
+# Set build path as a folder named as the platform (linux, windows, darwin...) plus the processor type
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/${CMAKE_SYSTEM_NAME}-${CMAKE_SYSTEM_PROCESSOR}")
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/${CMAKE_SYSTEM_NAME}-${CMAKE_SYSTEM_PROCESSOR}")
 
-if(PopSift_USE_POSITION_INDEPENDENT_CODE)
+option(PopSift_BUILD_EXAMPLES "Build PopSift applications."  ON)
+option(PopSift_BUILD_DOCS "Build PopSift documentation."  OFF)
+option(PopSift_USE_NVTX_PROFILING     "Use CUDA NVTX for profiling." OFF)
+option(PopSift_ERRCHK_AFTER_KERNEL     "Synchronize and check CUDA error after every kernel." OFF)
+option(PopSift_USE_POSITION_INDEPENDENT_CODE "Generate position independent code." ON)
+option(PopSift_USE_GRID_FILTER "Switch off grid filtering to massively reduce compile time while debugging other things." ON)
+option(PopSift_NVCC_WARNINGS "Switch on several additional warning for CUDA nvcc" OFF)
+option(PopSift_USE_TEST_CMD "Add testing step for functional verification" OFF)
+option(BUILD_SHARED_LIBS "Build shared libraries" ON)
+
+if(PopSift_USE_POSITION_INDEPENDENT_CODE AND NOT MSVC)
   set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 endif()
 
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${PROJECT_SOURCE_DIR}/cmake")
 
 # set(CMAKE_BUILD_TYPE Debug)
-if(NOT CMAKE_BUILD_TYPE)
+if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
   set(CMAKE_BUILD_TYPE Release)
   message(STATUS "Build type not set, building in Release configuration")
+  # Set the possible values of build type for cmake-gui
+  set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
 else()
   message(STATUS "Building in ${CMAKE_BUILD_TYPE} configuration")
 endif()
 
-set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3")
-set(CMAKE_C_FLAGS_RELEASE   "${CMAKE_C_FLAGS_RELEASE}   -O3")
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
-set(CMAKE_CXX_STANDARD 11)
-set(CMAKE_CXX_STANDARD_REQUIRED ON)
+# ensure the proper linker flags when building the static version on MSVC
+if(MSVC AND NOT BUILD_SHARED_LIBS)
+  foreach(config "DEBUG" "RELEASE" "MINSIZEREL" "RELWITHDEBINFO")
+    string(REPLACE /MD /MT CMAKE_C_FLAGS_${config} "${CMAKE_C_FLAGS_${config}}")
+    string(REPLACE /MD /MT CMAKE_CXX_FLAGS_${config} "${CMAKE_CXX_FLAGS_${config}}")
+    message(STATUS "CMAKE_C_FLAGS_${config} ${CMAKE_C_FLAGS_${config}}")
+    message(STATUS "CMAKE_CXX_FLAGS_${config} ${CMAKE_CXX_FLAGS_${config}}")
+  endforeach()
+endif()
+
+# ==============================================================================
+# GNUInstallDirs CMake module
+# - Define GNU standard installation directories
+# - Provides install directory variables as defined by the GNU Coding Standards.
+# ==============================================================================
+include(GNUInstallDirs)
 
-# set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -G")
-# set(CMAKE_C_FLAGS_DEBUG   "${CMAKE_C_FLAGS_DEBUG}   -G")
 
-find_package(Boost 1.53.0 REQUIRED COMPONENTS system thread)
-if(WIN32)
-  add_definitions("-DBOOST_ALL_NO_LIB")
-  link_directories(Boost_LIBRARRY_DIR_DEBUG)
-  link_directories(Boost_LIBRARRY_DIR_RELEASE)
-endif(WIN32)
+if(BUILD_SHARED_LIBS)
+  message(STATUS "BUILD_SHARED_LIBS ON")
 
-find_package(CUDA 7.0 REQUIRED)
+  # Auto-build dll exports on Windows
+  set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
 
-if(NOT CUDA_FOUND)
-  message(FATAL_ERROR "Could not find CUDA >= 7.0")
+  set(CMAKE_CUDA_RUNTIME_LIBRARY Shared)
+else()
+  message(STATUS "BUILD_SHARED_LIBS OFF")
+
+  set(CMAKE_CUDA_RUNTIME_LIBRARY Static)
 endif()
 
-#
-# Default setting of the CUDA CC versions to compile.
-# Shortening the lists saves a lot of compile time.
-#
-if(CUDA_VERSION_MAJOR GREATER 7)
-  set(PopSift_CUDA_CC_LIST_BASIC 30 35 50 52 60 61 62)
-else(CUDA_VERSION_MAJOR GREATER 7)
-  set(PopSift_CUDA_CC_LIST_BASIC 30 35 50 52 )
-endif(CUDA_VERSION_MAJOR GREATER 7)
-set(PopSift_CUDA_CC_LIST ${PopSift_CUDA_CC_LIST_BASIC} CACHE STRING "CUDA CC versions to compile")
+# Require threads because of std::thread.
+find_package(Threads REQUIRED)
+
+###################
+#  CUDA
+###################
+include(CheckLanguage)
+check_language(CUDA)
+
+# Use this if necessary: "cmake -DCUDAToolkit_ROOT=/some/path"
+# target_link_libraries(binary_linking_to_cudart PRIVATE CUDA::cudart)
+find_package(CUDAToolkit)
+
+message(STATUS "CUDA Version is ${CUDAToolkit_VERSION}")
+set(CUDA_VERSION ${CUDAToolkit_VERSION})
 
 if(PopSift_USE_NVTX_PROFILING)
   message(STATUS "PROFILING CPU CODE: NVTX is in use")
-endif(PopSift_USE_NVTX_PROFILING)
+endif()
 
 if(PopSift_ERRCHK_AFTER_KERNEL)
   message(STATUS "Synchronizing and checking errors after every kernel call")
-  set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-DERRCHK_AFTER_KERNEL")
-endif(PopSift_ERRCHK_AFTER_KERNEL)
-
-set(CUDA_SEPARABLE_COMPILATION ON)
-
-if(UNIX AND NOT APPLE)
-  set(CUDA_NVCC_FLAGS         "${CUDA_NVCC_FLAGS};-Xcompiler;-rdynamic;-lineinfo")
-  # set(CUDA_NVCC_FLAGS         "${CUDA_NVCC_FLAGS};-Xptxas;-v")
-  # set(CUDA_NVCC_FLAGS         "${CUDA_NVCC_FLAGS};-Xptxas;-warn-double-usage")
-  set(CUDA_NVCC_FLAGS         "${CUDA_NVCC_FLAGS};--keep")
-  set(CUDA_NVCC_FLAGS         "${CUDA_NVCC_FLAGS};--source-in-ptx")
-endif(UNIX AND NOT APPLE)
-
-# The following if should not be necessary, but apparently there is a bug in FindCUDA.cmake that
-# generate an empty string in the nvcc command line causing the compilation to fail.
-# see https://gitlab.kitware.com/cmake/cmake/issues/16411
-if(CMAKE_BUILD_TYPE STREQUAL "Debug")
-  message(STATUS "Building in debug mode")
-  set(CUDA_NVCC_FLAGS_DEBUG   "${CUDA_NVCC_FLAGS_DEBUG};-G")
-endif()
-set(CUDA_NVCC_FLAGS_RELEASE "${CUDA_NVCC_FLAGS_RELEASE};-O3")
-
-if(PopSift_USE_POSITION_INDEPENDENT_CODE)
-  set(CUDA_NVCC_FLAGS         "${CUDA_NVCC_FLAGS};-Xcompiler;-fPIC")
-endif()
-
-#
-# Add all requested CUDA CCs to the command line for offline compilation
-#
-list(SORT PopSift_CUDA_CC_LIST)
-foreach(PopSift_CC_VERSION ${PopSift_CUDA_CC_LIST})
-  set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-gencode;arch=compute_${PopSift_CC_VERSION},code=sm_${PopSift_CC_VERSION}")
-endforeach(PopSift_CC_VERSION)
-
-#
-# Use the highest request CUDA CC for CUDA JIT compilation
-#
-list(LENGTH PopSift_CUDA_CC_LIST PopSift_CC_LIST_LEN)
-MATH(EXPR PopSift_CC_LIST_LEN "${PopSift_CC_LIST_LEN}-1")
-list(GET PopSift_CUDA_CC_LIST ${PopSift_CC_LIST_LEN} PopSift_CUDA_CC_LIST_LAST)
-set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-gencode;arch=compute_${PopSift_CUDA_CC_LIST_LAST},code=compute_${PopSift_CUDA_CC_LIST_LAST}")
-
-# default stream legacy implies that the 0 stream synchronizes all streams
+  list(APPEND CUDA_NVCC_FLAGS "-DERRCHK_AFTER_KERNEL")
+endif()
+
+# This may not be required any more.
+set(CMAKE_CUDA_SEPARABLE_COMPILATION ON)
+
 # default stream per-thread implies that each host thread has one non-synchronizing 0-stream
 # currently, the code requires legacy mode
-set(CUDA_NVCC_FLAGS         "${CUDA_NVCC_FLAGS};--default-stream;legacy")
+list(APPEND CUDA_NVCC_FLAGS "--default-stream;legacy")
 # set(CUDA_NVCC_FLAGS         "${CUDA_NVCC_FLAGS};--default-stream;per-thread")
 
-message(STATUS "CUDA Version is ${CUDA_VERSION}")
-message(STATUS "Compiling for CUDA CCs: ${PopSift_CUDA_CC_LIST}")
-if(CUDA_VERSION>=7.5)
-  set(CUDA_NVCC_FLAGS_RELEASE "${CUDA_NVCC_FLAGS_RELEASE};-Xptxas;-warn-lmem-usage")
-  set(CUDA_NVCC_FLAGS_RELEASE "${CUDA_NVCC_FLAGS_RELEASE};-Xptxas;-warn-spills")
-  set(CUDA_NVCC_FLAGS_RELEASE "${CUDA_NVCC_FLAGS_RELEASE};-Xptxas;--warn-on-local-memory-usage")
-  set(CUDA_NVCC_FLAGS_RELEASE "${CUDA_NVCC_FLAGS_RELEASE};-Xptxas;--warn-on-spills")
-else(CUDA_VERSION>=7.5)
-endif(CUDA_VERSION>=7.5)
+if(CUDA_VERSION VERSION_GREATER_EQUAL "7.5")
+  if(PopSift_NVCC_WARNINGS)
+    list(APPEND CUDA_NVCC_FLAGS_RELEASE "-Xptxas;-warn-lmem-usage")
+    list(APPEND CUDA_NVCC_FLAGS_RELEASE "-Xptxas;-warn-spills")
+    list(APPEND CUDA_NVCC_FLAGS_RELEASE "-Xptxas;--warn-on-local-memory-usage")
+    list(APPEND CUDA_NVCC_FLAGS_RELEASE "-Xptxas;--warn-on-spills")
+  endif()
+endif()
 
-# library required for CUDA dynamic parallelism, forgotten by CMake 3.4
-cuda_find_library_local_first(CUDA_CUDADEVRT_LIBRARY cudadevrt "\"cudadevrt\" library")
+set(PopSift_CXX_STANDARD 17) # Thrust/CUB requires C++14 starting with CUDA SDK 11
+if(CUDAToolkit_VERSION_MAJOR LESS_EQUAL 8) # find_package(CUDAToolkit) defines CUDAToolkit_VERSION_MAJOR; CUDA_VERSION_MAJOR is never set here
+  set(PopSift_CXX_STANDARD 11) # fall back to C++11 for CUDA SDK 8 and older
+endif()
 
-if(PopSift_USE_NVTX_PROFILING)
-  # library required for NVTX profiling of the CPU
-  cuda_find_library_local_first(CUDA_NVTX_LIBRARY nvToolsExt "NVTX library")
-  add_definitions(-DUSE_NVTX)
-endif(PopSift_USE_NVTX_PROFILING)
+if(NOT MSVC)
+      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++${PopSift_CXX_STANDARD}")
+      list(APPEND CUDA_NVCC_FLAGS "-std=c++${PopSift_CXX_STANDARD}")
+endif()
+set(CMAKE_CXX_STANDARD ${PopSift_CXX_STANDARD})
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CUDA_STANDARD ${PopSift_CXX_STANDARD})
+set(CMAKE_CUDA_STANDARD_REQUIRED ON)
+
+
+if(CUDA_VERSION VERSION_GREATER_EQUAL "9.0")
+  set(PopSift_HAVE_SHFL_DOWN_SYNC   1)
+else()
+  set(PopSift_HAVE_SHFL_DOWN_SYNC   0)
+endif()
 
 if(NOT PopSift_USE_GRID_FILTER)
   message(STATUS "Disabling grid filter compilation")
-  add_definitions(-DDISABLE_GRID_FILTER)
-endif(NOT PopSift_USE_GRID_FILTER)
+  set(DISABLE_GRID_FILTER   1)
+else()
+  set(DISABLE_GRID_FILTER   0)
+endif()
+
+if(PopSift_USE_NVTX_PROFILING)
+  # NVTX profiling of the CPU code was requested: record it in PopSift_USE_NVTX
+  set(PopSift_USE_NVTX 1)
+else()
+  set(PopSift_USE_NVTX 0)
+endif()
 
 add_subdirectory(src)
 
+if(PopSift_BUILD_DOCS)
+  add_subdirectory(doc)
+endif()
+
+set(PopSift_TESTFILE_PATH "popsift-samples/datasets/sample/big_set/" CACHE STRING "Base directory where your test files are stored")
+if(PopSift_USE_TEST_CMD)
+  if(NOT IS_ABSOLUTE("${PopSift_TESTFILE_PATH}"))
+    get_filename_component(PopSift_TESTFILES "${PopSift_TESTFILE_PATH}" ABSOLUTE)
+    set(PopSift_TESTFILE_PATH "${PopSift_TESTFILES}")
+  endif()
+
+  add_subdirectory(testScripts)
+endif()
+
 ########### Add uninstall target ###############
 CONFIGURE_FILE(
   "${CMAKE_CURRENT_SOURCE_DIR}/cmake/cmake_uninstall.cmake.in"
   "${CMAKE_CURRENT_BINARY_DIR}/cmake/cmake_uninstall.cmake"
   IMMEDIATE @ONLY)
 ADD_CUSTOM_TARGET(uninstall
-  "${CMAKE_COMMAND}" -P "${CMAKE_CURRENT_BINARY_DIR}/cmake/cmake_uninstall.cmake") 
-
+  "${CMAKE_COMMAND}" -P "${CMAKE_CURRENT_BINARY_DIR}/cmake/cmake_uninstall.cmake")
+
+
+######################################
+# SUMMARY
+######################################
+message("\n")
+message("******************************************")
+message("Building configuration:\n")
+message(STATUS "PopSift version: " ${PROJECT_VERSION})
+message(STATUS "Build type: " ${CMAKE_BUILD_TYPE})
+message(STATUS "Build Shared libs: " ${BUILD_SHARED_LIBS})
+message(STATUS "Build examples: " ${PopSift_BUILD_EXAMPLES})
+message(STATUS "Build documentation: " ${PopSift_BUILD_DOCS})
+message(STATUS "Generate position independent code: " ${CMAKE_POSITION_INDEPENDENT_CODE})
+message(STATUS "Use CUDA NVTX for profiling: " ${PopSift_USE_NVTX_PROFILING})
+message(STATUS "Synchronize and check CUDA error after every kernel: " ${PopSift_ERRCHK_AFTER_KERNEL})
+message(STATUS "Grid filtering: " ${PopSift_USE_GRID_FILTER})
+message(STATUS "Additional warning for CUDA nvcc: " ${PopSift_NVCC_WARNINGS})
+message(STATUS "Install path: " ${CMAKE_INSTALL_PREFIX})
+message(STATUS "Testing step: " ${PopSift_USE_TEST_CMD})
+
+message(STATUS "CMAKE_CUDA_COMPILER = ${CMAKE_CUDA_COMPILER}")
+message(STATUS "CMAKE_CUDA_COMPILER_ID = ${CMAKE_CUDA_COMPILER_ID}")
+message(STATUS "CMAKE_CUDA_COMPILER_VERSION = ${CMAKE_CUDA_COMPILER_VERSION}")
+message(STATUS "CMAKE_CUDA_ARCHITECTURES = ${CMAKE_CUDA_ARCHITECTURES}")
+
+if(PopSift_USE_TEST_CMD)
+  message(STATUS "Path for test input: " ${PopSift_TESTFILE_PATH})
+endif()
+message("\n******************************************")
+message("\n")
diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md
new file mode 100644
index 00000000..e2233aee
--- /dev/null
+++ b/CODE_OF_CONDUCT.md
@@ -0,0 +1,74 @@
+# Contributor Covenant Code of Conduct
+
+## Our Pledge
+
+In the interest of fostering an open and welcoming environment, we as
+contributors and maintainers pledge to making participation in our project and
+our community a harassment-free experience for everyone, regardless of age, body
+size, disability, ethnicity, gender identity and expression, level of experience,
+nationality, personal appearance, race, religion, or sexual identity and
+orientation.
+
+## Our Standards
+
+Examples of behavior that contributes to creating a positive environment
+include:
+
+* Using welcoming and inclusive language
+* Being respectful of differing viewpoints and experiences
+* Gracefully accepting constructive criticism
+* Focusing on what is best for the community
+* Showing empathy towards other community members
+
+Examples of unacceptable behavior by participants include:
+
+* The use of sexualized language or imagery and unwelcome sexual attention or
+advances
+* Trolling, insulting/derogatory comments, and personal or political attacks
+* Public or private harassment
+* Publishing others' private information, such as a physical or electronic
+  address, without explicit permission
+* Other conduct which could reasonably be considered inappropriate in a
+  professional setting
+
+## Our Responsibilities
+
+Project maintainers are responsible for clarifying the standards of acceptable
+behavior and are expected to take appropriate and fair corrective action in
+response to any instances of unacceptable behavior.
+
+Project maintainers have the right and responsibility to remove, edit, or
+reject comments, commits, code, wiki edits, issues, and other contributions
+that are not aligned to this Code of Conduct, or to ban temporarily or
+permanently any contributor for other behaviors that they deem inappropriate,
+threatening, offensive, or harmful.
+
+## Scope
+
+This Code of Conduct applies both within project spaces and in public spaces
+when an individual is representing the project or its community. Examples of
+representing a project or community include using an official project e-mail
+address, posting via an official social media account, or acting as an appointed
+representative at an online or offline event. Representation of a project may be
+further defined and clarified by project maintainers.
+
+## Enforcement
+
+Instances of abusive, harassing, or otherwise unacceptable behavior may be
+reported by contacting the project team privately at alicevision-team@googlegroups.com. All
+complaints will be reviewed and investigated and will result in a response that
+is deemed necessary and appropriate to the circumstances. The project team is
+obligated to maintain confidentiality with regard to the reporter of an incident.
+Further details of specific enforcement policies may be posted separately.
+
+Project maintainers who do not follow or enforce the Code of Conduct in good
+faith may face temporary or permanent repercussions as determined by other
+members of the project's leadership.
+
+## Attribution
+
+This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
+available at https://www.contributor-covenant.org/version/1/4/code-of-conduct/
+
+[homepage]: https://www.contributor-covenant.org
+
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 00000000..384a2781
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,35 @@
+Contributing to PopSift
+===========================
+
+PopSift relies on a friendly and community-driven effort to create an open source photogrammetry solution.
+In order to foster a friendly atmosphere where technical collaboration can flourish,
+we recommend you to read the [code of conduct](CODE_OF_CONDUCT.md).
+
+
+Contributing Workflow
+---------------------
+
+The contributing workflow relies on [Github Pull Requests](https://help.github.com/articles/using-pull-requests/).
+
+1. If it is an important change, we recommend that you discuss it on the mailing list before starting the implementation.
+  This ensures that the development is aligned with other
+developments already started and will be efficiently integrated.
+
+2. Create the corresponding issues.
+
+3. Create a branch and start a PR starting by [WIP], like "[WIP] My new feature" so everyone can follow the development.
+  Explain the implementation in the PR description with links to issues.
+
+4. Implement the new feature(s). Add unit test if needed.
+One feature per PR is ideal for review, but linked features can be part of the same PR.
+
+5. When it is ready for review, remove "[WIP]" from the PR name.
+
+6. The reviewers will look over the code and ask for changes, explain problems they found,
+congratulate the author, etc. using the github comments.
+
+7. After approval, one of the developers with commit approval to the official main repository
+will merge your fixes into the "develop" branch.
+
+8. If not already the case, your name will be added to the [contributors list](CONTRIBUTORS.md).
+
diff --git a/LICENSE.md b/COPYING.md
similarity index 100%
rename from LICENSE.md
rename to COPYING.md
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 00000000..5b691c45
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,25 @@
+ARG CUDA_TAG=10.2
+ARG OS_TAG=18.04
+FROM alicevision/popsift-deps:cuda${CUDA_TAG}-ubuntu${OS_TAG}
+LABEL maintainer="AliceVision Team alicevision@googlegroups.com"
+
+# use CUDA_TAG to select the image version to use
+# see https://hub.docker.com/r/nvidia/cuda/
+#
+# For example, to create a ubuntu 16.04 with cuda 8.0 for development, use
+# docker build --build-arg CUDA_TAG=8.0 --build-arg OS_TAG=16.04 --tag popsift .
+#
+# then execute with nvidia docker (https://github.com/nvidia/nvidia-docker/wiki/Installation-(version-2.0))
+# docker run -it --runtime=nvidia popsift
+
+
+# OS/Version (FILE): cat /etc/issue.net
+# Cuda version (ENV): $CUDA_VERSION
+
+# Build and install PopSift, then remove the source tree
+COPY . /opt/popsift
+WORKDIR /opt/popsift/build
+RUN cmake .. -DCMAKE_BUILD_TYPE=Release && \
+    make install -j $(nproc) && \
+    cd /opt && \
+    rm -rf popsift
diff --git a/Dockerfile_deps b/Dockerfile_deps
new file mode 100644
index 00000000..d5560d4e
--- /dev/null
+++ b/Dockerfile_deps
@@ -0,0 +1,45 @@
+ARG CUDA_TAG=10.2
+ARG OS_TAG=18.04
+FROM nvidia/cuda:${CUDA_TAG}-devel-ubuntu${OS_TAG}
+LABEL maintainer="AliceVision Team alicevision@googlegroups.com"
+
+# use CUDA_TAG to select the image version to use
+# see https://hub.docker.com/r/nvidia/cuda/
+#
+# For example, to create a ubuntu 16.04 with cuda 8.0 for development, use
+# docker build --build-arg CUDA_TAG=8.0 --build-arg OS_TAG=16.04 --tag alicevision/popsift-deps:cuda8.0-ubuntu16.04 -f Dockerfile_deps .
+#
+# then execute with nvidia docker (https://github.com/nvidia/nvidia-docker/wiki/Installation-(version-2.0))
+# docker run -it --runtime=nvidia alicevision/popsift-deps:cuda8.0-ubuntu16.04
+
+
+# OS/Version (FILE): cat /etc/issue.net
+# Cuda version (ENV): $CUDA_VERSION
+
+# System update
+RUN apt-get clean && apt-get update && apt-get install -y --no-install-recommends\
+        build-essential \
+        wget \
+        unzip \
+        libtool \
+        automake \
+        libssl-dev \
+        libjpeg-turbo8-dev \
+        libdevil-dev \
+        libboost-filesystem-dev \
+        libboost-system-dev \
+        libboost-program-options-dev \
+        libboost-thread-dev \
+ && rm -rf /var/lib/apt/lists/*
+ 
+# Manually install cmake
+WORKDIR /tmp/cmake
+ENV CMAKE_VERSION=3.24
+ENV CMAKE_VERSION_FULL=${CMAKE_VERSION}.2
+RUN wget https://cmake.org/files/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION_FULL}.tar.gz && \
+    tar zxf cmake-${CMAKE_VERSION_FULL}.tar.gz && \
+    cd cmake-${CMAKE_VERSION_FULL} && \
+    ./bootstrap --prefix=/usr/local  -- -DCMAKE_BUILD_TYPE:STRING=Release -DCMAKE_USE_OPENSSL:BOOL=ON && \
+    make -j$(nproc) install && \
+    cd /tmp && \
+    rm -rf cmake
diff --git a/README.md b/README.md
index 125a8697..738794c9 100644
--- a/README.md
+++ b/README.md
@@ -1,17 +1,35 @@
 
-PopSift
-=======
+# PopSift
 
-PopSift is an implementation of the SIFT algorithm in CUDA.
-PopSift tries to stick as closely as possible to David Lowe's famous paper (Lowe, D. G. (2004). Distinctive Image Features from Scale-Invariant Keypoints. International Journal of Computer Vision, 60(2), 91–110. doi:10.1023/B:VISI.0000029664.99615.94), while extracting features from an image in real-time at least on an NVidia GTX 980 Ti GPU.
+[![CII Best Practices](https://bestpractices.coreinfrastructure.org/projects/3728/badge)](https://bestpractices.coreinfrastructure.org/projects/3728) 
+[![Codacy Badge](https://app.codacy.com/project/badge/Grade/64f9192b53df46b483e7cf5be7e2dddd)](https://app.codacy.com/gh/alicevision/popsift/dashboard?utm_source=gh&utm_medium=referral&utm_content=&utm_campaign=Badge_grade)
 
+PopSift is an open-source implementation of the SIFT algorithm in CUDA.
+PopSift tries to stick as closely as possible to David Lowe's famous paper [1], while extracting features from an image in real-time at least on an NVidia GTX 980 Ti GPU.
 
-Build
------
+Check out the [documentation](https://popsift.readthedocs.io/) for more info.
 
-PopSift has been developed and tested on Linux machines, mostly a variant of Ubuntu, but compiles on MacOSX as well. It comes as a CMake project and requires at least CUDA 7.0 and Boost >= 1.55. It is known to compile and work with NVidia cards of compute capability 3.0 (including the GT 650M), but the code is developed with the compute capability 5.2 card GTX 980 Ti in mind.
+## HW requirements
 
-If you want to avoid building the application you can run cmake with the option `-DPopSift_BUILD_EXAMPLES:BOOL=OFF`.
+PopSift compiles and works with NVidia cards of compute capability >= 3.0 (including the GT 650M), but the code is developed with the compute capability 5.2 card GTX 980 Ti in mind.
+
+CUDA SDK 11 no longer supports compute capability 3.0. Compute capability 3.5 is still supported, with a deprecation warning.
+
+## Dependencies
+
+PopSift depends on:
+
+* Host compiler that supports C++14 for CUDA SDK >= 9.0 and C++11 for CUDA SDK 8
+
+* CUDA >= 8.0
+
+Optionally, for the provided applications:
+
+* Boost >= 1.71 (required components {atomic, chrono, date-time, system, thread}-dev)
+
+* DevIL (libdevil-dev) can be used to load a broader range of image formats, otherwise only pgm is supported.
+
+## Build
 
 In order to build the library you can run:
 
@@ -22,23 +40,26 @@ make
 make install
 ```
 
-Continuous integration: 
-- [![Build Status](https://travis-ci.org/alicevision/popsift.svg?branch=master)](https://travis-ci.org/alicevision/popsift) master branch.
-- [![Build Status](https://travis-ci.org/alicevision/popsift.svg?branch=develop)](https://travis-ci.org/alicevision/popsift) develop branch.
+Some build options are available:
 
+* `PopSift_BUILD_EXAMPLES` (default: `ON`) enable building the applications that showcase the use of the library.
 
+* `BUILD_SHARED_LIBS` (default: `ON`) controls the type of library to build (`ON` for shared libraries, `OFF` for static)
 
-Usage
------
+## Usage
 
-Two artifacts are made: `libpopsift` and the test application `popsift-demo`. Calling popsift-demo without parameters shows the options.
+The main artifact created is `libpopsift`.
+If enabled, the test application `popsift-demo` is created as well.
+Calling `popsift-demo` without parameters shows the options.
 
 ### Using PopSift as third party
 
-To integrate PopSift into other software, link with `libpopsift`.  If your are using CMake for building your project you can easily add PopSift to your project. Once you have built and installed PopSift in a directory (say, `<prefix>`), in your `CMakeLists.txt` file just add the dependency
+To integrate PopSift into other software, link with `libpopsift`.
+If you are using CMake for building your project you can easily add PopSift to your project.
+Once you have built and installed PopSift in a directory (say, `<prefix>`), in your `CMakeLists.txt` file just add the dependency
 
 ```cmake
-# Find the package from the PopSiftConfig.cmake 
+# Find the package from the PopSiftConfig.cmake
 # in <prefix>/lib/cmake/PopSift/. Under the namespace PopSift::
 # it exposes the target popsift that allows you to compile
 # and link with the library
@@ -56,29 +77,70 @@ Then, in order to build just pass the location of `PopSiftConfig.cmake` from the
 cmake .. -DPopSift_DIR=<prefix>/lib/cmake/PopSift/
 ```
 
-
-
 ### Calling the API
 
 The caller must create a `popart::Config` struct (documented in `src/sift/sift_conf.h`) to control the behaviour of the PopSift, and instantiate an object of class `PopSift` (found in `src/sift/popsift.h`).
 
-After this, images can be enqueued for SIFT extraction using (`enqueue()`).  The only valid input format is a single plane of grayscale unsigned characters. Only host memory limits the number of images that can be enqueued. The `enqueue` function returns a pointer to a `SiftJob` immediately and performs the feature extraction asynchronously. The memory of the image passed to enqueue remains the caller's responsibility. Calling `SiftJob::get` on the returned job blocks until features are extracted, and returns them.
+After this, images can be enqueued for SIFT extraction using (`enqueue()`).
+A valid input is a single plane of grayscale values located in host memory.
+They can be passed as a pointer to unsigned char, with a value range from 0 to 255, or as a pointer to float, with a value range from 0.0f to 1.0f.
 
-Features offer iterators that iterate over objects of type `Feature`. Both classes are documented in `sift_extremum.h`. Each feature represents a feature point in the coordinate system of the input image, providing X and Y coordinates and scale (sigma), as well as several alternative descriptors for the feature point (according to Lowe, 15% of the feature points should be expected to have 2 or more descriptors).
+Only host memory limits the number of images that can be enqueued.
+The `enqueue` function returns a pointer to a `SiftJob` immediately and performs the feature extraction asynchronously.
+The memory of the image passed to enqueue remains the caller's responsibility. Calling `SiftJob::get` on the returned job blocks until features are extracted, and returns them.
+
+Features offer iterators that iterate over objects of type `Feature`.
+Both classes are documented in `sift_extremum.h`.
+Each feature represents a feature point in the coordinate system of the input image, providing X and Y coordinates and scale (sigma), as well as several alternative descriptors for the feature point (according to Lowe, 15% of the feature points should be expected to have 2 or more descriptors).
 
 In an alternate, deprecated, blocking API, `init()` must be called to pass image width and height to PopSift, followed by a call to `executed()` that takes image data and returns the extracted features. `execute()` is synchronous and blocking.
 
-As far as we know, no implementation that is faster than PopSift at the time of PopSift's release comes under a license that allows commercial use and sticks close to the original paper at the same time as well. PopSift can be configured at runtime to use constants that affect it behaviours. In particular, users can choose to generate results very similar to VLFeat or results that are closer (but not as close) to the SIFT implementation of the OpenCV extras. We acknowledge that there is at least one SIFT implementation that is vastly faster, but it makes considerable sacifices in terms of accuracy and compatibility.
+As far as we know, no implementation that is faster than PopSift at the time of PopSift's release comes under a license that allows commercial use and sticks close to the original paper at the same time as well.
+PopSift can be configured at runtime to use constants that affect its behaviour.
+In particular, users can choose to generate results very similar to VLFeat or results that are closer (but not as close) to the SIFT implementation of the OpenCV extras.
+We acknowledge that there is at least one SIFT implementation that is vastly faster, but it makes considerable sacrifices in terms of accuracy and compatibility.
+
+## Continuous integration:
+
+* ![Continuous Integration](https://github.com/alicevision/popsift/workflows/Continuous%20Integration/badge.svg?branch=master) master branch on Linux.
+
+* ![Continuous Integration](https://github.com/alicevision/popsift/workflows/Continuous%20Integration/badge.svg?branch=develop) develop branch on Linux.
+
+* [![Build status](https://ci.appveyor.com/api/projects/status/rsm5269hs288c2ji/branch/develop?svg=true)](https://ci.appveyor.com/project/AliceVision/popsift/branch/develop) develop branch on Windows.
 
+## License
+
+PopSift is licensed under [MPL v2 license](COPYING.md).
+SIFT was patented in the United States from 1999-03-08 to 2020-03-28. See the [patent link](https://patents.google.com/patent/US6711293B1/en) for more information.
+PopSift license only concerns the PopSift source code and does not release users of this code from any requirements that may arise from patents.
+
+
+## Cite Us
+
+If you use PopSift for your publication, please cite us as:
+```bibtex
+@inproceedings{Griwodz2018Popsift,
+	 author = {Griwodz, Carsten and Calvet, Lilian and Halvorsen, P{\aa}l},
+	 title = {Popsift: A Faithful SIFT Implementation for Real-time Applications},
+	 booktitle = {Proceedings of the 9th {ACM} Multimedia Systems Conference},
+	 series = {MMSys '18},
+	 year = {2018},
+	 isbn = {978-1-4503-5192-8},
+	 location = {Amsterdam, Netherlands},
+	 pages = {415--420},
+	 numpages = {6},
+	 doi = {10.1145/3204949.3208136},
+	 acmid = {3208136},
+	 publisher = {ACM},
+	 address = {New York, NY, USA},
+}
+```
 
-License
--------
 
-PopSift is licensed under [MPL v2 license](LICENSE.md).
-However, SIFT is patented in the US and perhaps other countries, and this license does not release users of this code from any requirements that may arise from such patents.
+## Acknowledgements
 
+PopSift was developed within the project [POPART](https://alicevision.org/popart), which has been funded by the [European Commission in the Horizon 2020](https://cordis.europa.eu/project/id/644874) framework.
 
-Authors
--------
+___
 
-It was developed within the project [POPART](http://www.popartproject.eu), which has been funded by the European Commission in the Horizon 2020 framework.
+[1]: Lowe, D. G. (2004). Distinctive Image Features from Scale-Invariant Keypoints. International Journal of Computer Vision, 60(2), 91–110. doi:10.1023/B:VISI.0000029664.99615.94
diff --git a/appveyor.yml b/appveyor.yml
new file mode 100644
index 00000000..679ac5b5
--- /dev/null
+++ b/appveyor.yml
@@ -0,0 +1,56 @@
+#
+# Build system for the PopSift library, including its demo programs.
+#
+version: '1.0.{build}'
+
+image: Visual Studio 2022
+
+platform:
+  - x64
+
+configuration:
+  - Release
+  - Debug
+
+#environment:
+#  matrix:
+#    - DBUILD_SHARED_LIBS: 0
+#    - DBUILD_SHARED_LIBS: 1
+
+#
+# Check the separate file cudaInstallAppveyor for the installation of CUDA
+#
+install:
+    - cmd: >-
+          call cudaInstallAppveyor.cmd
+    - vcpkg upgrade --no-dry-run
+    - vcpkg install
+          boost-system boost-program-options boost-thread boost-filesystem
+          --triplet %PLATFORM%-windows
+          # devil
+
+#
+# When updating to a new version of visual studio, change the generation string after
+# -G and find the suitable toolkit version that is listed after -T (v143 in this case).
+# The CUDA Toolkit and the VS version must match. The matches are found in the CUDA
+# documentation.
+# The platform in this case is x64. Apparently, you need it after -T for VS and after -A
+# for CUDA.
+# You can only have one -T parameter, but you can separate several options with a comma.
+#
+# PopSift_USE_GRID_FILTER is off in this build because the installation of CUDA Thrust
+# in cudaInstallAppveyor is not happening yet.
+#
+before_build:
+    - md build
+    - cd build
+    - cmake -G "Visual Studio 17 2022" -A x64 -T v143,host=x64,cuda="%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.5" -DBUILD_SHARED_LIBS:BOOL=ON -DPopSift_USE_NVTX_PROFILING:BOOL=OFF -DPopSift_USE_GRID_FILTER:BOOL=OFF -DPopSift_BUILD_DOCS:BOOL=OFF -DPopSift_USE_POSITION_INDEPENDENT_CODE:BOOL=ON -DPopSift_BUILD_EXAMPLES:BOOL=ON -DCMAKE_BUILD_TYPE=%configuration% -DCMAKE_TOOLCHAIN_FILE=c:/tools/vcpkg/scripts/buildsystems/vcpkg.cmake ..
+    - ls -l
+
+build:
+  verbosity: detailed
+  project: $(APPVEYOR_BUILD_FOLDER)\build\PopSift.sln
+  parallel: true
+
+cache:
+  - c:\tools\vcpkg\installed\
diff --git a/src/cmake/Config.cmake.in b/cmake/Config.cmake.in
similarity index 88%
rename from src/cmake/Config.cmake.in
rename to cmake/Config.cmake.in
index 5aaa2d8e..b8fce13a 100644
--- a/src/cmake/Config.cmake.in
+++ b/cmake/Config.cmake.in
@@ -36,8 +36,11 @@
 #
 ################################################################################
 
-
 @PACKAGE_INIT@
 
-include("${CMAKE_CURRENT_LIST_DIR}/@targets_export_name@.cmake")
+include(CMakeFindDependencyMacro)
+find_dependency(Threads)      # REQUIRED/QUIET are forwarded from the caller's find_package() call,
+find_dependency(CUDAToolkit)  # so an optional find_package(PopSift) is not turned into a hard error
+
+include("${CMAKE_CURRENT_LIST_DIR}/@popsift_targets_export_name@.cmake")
 check_required_components("@PROJECT_NAME@")
diff --git a/cmake/FindPopsift.cmake b/cmake/FindPopsift.cmake
deleted file mode 100644
index a9a48119..00000000
--- a/cmake/FindPopsift.cmake
+++ /dev/null
@@ -1,60 +0,0 @@
-# Locate the Popsift libraries.
-#
-# Defines the following variables:
-#
-#   POPSIFT_FOUND        - TRUE if the popsift headers and libs are found
-#   POPSIFT_INCLUDE_DIRS - The path to popsift headers
-#
-#   POPSIFT_LIBRARIES    - Libraries to link against to use popsift.
-#   POPSIFT_LIBRARY_DIR  - The base directory to search for popsift.
-#
-# Accepts the following variables as input:
-#
-#   POPSIFT_DIR - (as a CMake or environment variable)
-#                The root directory of the popsift install prefix
-
-MESSAGE(STATUS "Looking for popsift.")
-
-FIND_PATH(POPSIFT_INCLUDE_DIR popsift/popsift.h
-  HINTS
-    $ENV{POPSIFT_DIR}/include
-    ${POPSIFT_DIR}/include
-  PATH_SUFFIXES
-    popsift
-)
-
-find_package(CUDA 7.0 REQUIRED)
-find_package(Boost 1.53.0 REQUIRED COMPONENTS system filesystem)
-
-IF(POPSIFT_INCLUDE_DIR)
-  MESSAGE(STATUS "popsift headers found in ${POPSIFT_INCLUDE_DIR}")
-ELSE()
-  MESSAGE(STATUS "POPSIFT_INCLUDE_DIR NOT FOUND")
-ENDIF (POPSIFT_INCLUDE_DIR)
-
-FIND_LIBRARY(POPSIFT_LIBRARY NAMES popsift
-  HINTS
-    $ENV{POPSIFT_DIR}
-    ${POPSIFT_DIR}
-  PATH_SUFFIXES
-    lib
-    lib/popsift
-)
-GET_FILENAME_COMPONENT(POPSIFT_LIBRARY_DIR "${POPSIFT_LIBRARY}" PATH)
-
-SET(POPSIFT_LIBRARIES ${POPSIFT_LIBRARY})
-SET(POPSIFT_INCLUDE_DIRS ${POPSIFT_INCLUDE_DIR})
-
-IF(POPSIFT_LIBRARY)
-  MESSAGE(STATUS "popsift libraries found: ${POPSIFT_LIBRARY}")
-  MESSAGE(STATUS "popsift libraries directories: ${POPSIFT_LIBRARY_DIR}")
-ENDIF (POPSIFT_LIBRARY)
-
-include(FindPackageHandleStandardArgs)
-# handle the QUIETLY and REQUIRED arguments and set POPSIFT_FOUND to TRUE
-# if all listed variables are TRUE
-find_package_handle_standard_args(popsift  DEFAULT_MSG
-                                  POPSIFT_LIBRARY POPSIFT_INCLUDE_DIR)
-
-MARK_AS_ADVANCED(POPSIFT_INCLUDE_DIR POPSIFT_LIBRARY)
-
diff --git a/cmake/FindSphinx.cmake b/cmake/FindSphinx.cmake
new file mode 100644
index 00000000..9b9d2b9e
--- /dev/null
+++ b/cmake/FindSphinx.cmake
@@ -0,0 +1,12 @@
+#Look for an executable called sphinx-build
+find_program(SPHINX_EXECUTABLE
+        NAMES sphinx-build
+        HINTS ${SPHINX_ROOT}
+        DOC "Path to sphinx-build executable")
+
+include(FindPackageHandleStandardArgs)
+
+#Handle standard arguments to find_package like REQUIRED and QUIET
+find_package_handle_standard_args(Sphinx
+        "Failed to find sphinx-build executable"
+        SPHINX_EXECUTABLE)
\ No newline at end of file
diff --git a/cmake/sift_config.h.in b/cmake/sift_config.h.in
new file mode 100644
index 00000000..b6807983
--- /dev/null
+++ b/cmake/sift_config.h.in
@@ -0,0 +1,17 @@
+/*
+ * Copyright 2016, Simula Research Laboratory
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#define POPSIFT_IS_DEFINED(F)   (F() == 1) /* true when feature macro F() expands to 1; parenthesized so !POPSIFT_IS_DEFINED(X) negates the whole test */
+#define POPSIFT_IS_UNDEFINED(F) (F() == 0) /* true when feature macro F() expands to 0 */
+
+#define POPSIFT_HAVE_SHFL_DOWN_SYNC()     @PopSift_HAVE_SHFL_DOWN_SYNC@
+#define POPSIFT_DISABLE_GRID_FILTER()     @DISABLE_GRID_FILTER@
+#define POPSIFT_USE_NVTX()                @PopSift_USE_NVTX@
+
diff --git a/cmake/version.hpp.in b/cmake/version.hpp.in
new file mode 100644
index 00000000..4c01ec43
--- /dev/null
+++ b/cmake/version.hpp.in
@@ -0,0 +1,15 @@
+/*
+ * Copyright 2016, Simula Research Laboratory
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#define POPSIFT_VERSION_MAJOR @PROJECT_VERSION_MAJOR@
+#define POPSIFT_VERSION_MINOR @PROJECT_VERSION_MINOR@
+#define POPSIFT_VERSION_PATCH @PROJECT_VERSION_PATCH@
+
+#define POPSIFT_VERSION_STRING "@PROJECT_VERSION@"
\ No newline at end of file
diff --git a/cudaInstallAppveyor.cmd b/cudaInstallAppveyor.cmd
new file mode 100644
index 00000000..9d43f7fa
--- /dev/null
+++ b/cudaInstallAppveyor.cmd
@@ -0,0 +1,44 @@
+@echo off
+echo Downloading CUDA toolkit 12 for Windows 10
+REM appveyor DownloadFile https://developer.download.nvidia.com/compute/cuda/12.5.1/network_installers/cuda_12.5.1_windows_network.exe -Filename cuda_12.5.1_windows.exe
+REM Download the individual redistributable archives (nvcc, cudart, nvtx, VS integration) instead of the network installer above.
+appveyor DownloadFile https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-12.5.82-archive.zip -Filename cuda_nvcc.zip
+appveyor DownloadFile https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-12.5.82-archive.zip -Filename cuda_cudart.zip
+appveyor DownloadFile https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-12.5.82-archive.zip -Filename cuda_nvtx.zip
+appveyor DownloadFile https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-12.5.82-archive.zip -Filename vs_integration.zip
+dir
+
+echo Unzipping CUDA toolkit 12
+tar -xf cuda_nvcc.zip
+tar -xf cuda_cudart.zip
+tar -xf cuda_nvtx.zip
+tar -xf vs_integration.zip
+dir
+
+echo Making CUDA install dir(s)
+mkdir "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.5"
+mkdir "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.5\extras"
+
+echo Copying toolkit files to install dir(s)
+xcopy cuda_cudart-windows-x86_64-12.5.82-archive "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.5" /s /e /i /y
+xcopy cuda_nvcc-windows-x86_64-12.5.82-archive "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.5" /s /e /i /y
+xcopy cuda_nvtx-windows-x86_64-12.5.82-archive "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.5" /s /e /i /y
+xcopy visual_studio_integration-windows-x86_64-12.5.82-archive "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.5\extras" /s /e /i /y
+
+REM Disabled alternative: run the full installer instead of copying the archives (REM is the batch comment keyword; a leading # would be executed as a command).
+REM echo Installing CUDA toolkit 12
+REM cuda_12.5.1_windows.exe
+REM cuda_9.1.85_windows.exe -s nvcc_12.5 cudart_12.5
+
+
+echo CUDA toolkit 12 installed
+
+dir "%ProgramFiles%"
+REM Put the CUDA bin and libnvvp directories first on PATH.
+set PATH=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.5\bin;%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.5\libnvvp;%PATH%
+
+dir "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA"
+dir "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.5"
+dir "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.5\bin"
+
+nvcc -V
diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt
new file mode 100644
index 00000000..5468184f
--- /dev/null
+++ b/doc/CMakeLists.txt
@@ -0,0 +1,39 @@
+find_package(Doxygen REQUIRED)
+# Inputs for Doxygen: the public headers under src/popsift plus the top-level Markdown pages.
+file(GLOB_RECURSE ALL_PUBLIC_HEADERS ${PROJECT_SOURCE_DIR}/src/popsift/*.h*)
+set(POPSIFT_OTHER_DOC_SOURCES ${PROJECT_SOURCE_DIR}/README.md ${PROJECT_SOURCE_DIR}/INSTALL.md)
+set(DOXYGEN_USE_MDFILE_AS_MAINPAGE ${PROJECT_SOURCE_DIR}/README.md)
+set(DOXYGEN_PROJECT_BRIEF "A faithful implementation of the SIFT algorithm in CUDA.")
+set(DOXYGEN_GENERATE_XML YES)
+set(DOXYGEN_GENERATE_TREEVIEW YES)
+set(DOXYGEN_GENERATE_DEPRECATEDLIST YES)
+set(DOXYGEN_SORT_BRIEF_DOCS YES)
+set(DOXYGEN_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/doxygen)
+set(DOXYGEN_INDEX_FILE ${DOXYGEN_OUTPUT_DIRECTORY}/xml/index.xml)
+# Create the 'doxygen' target from the DOXYGEN_* settings above.
+doxygen_add_docs(doxygen
+        ${ALL_PUBLIC_HEADERS} ${POPSIFT_OTHER_DOC_SOURCES}
+        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+        COMMENT "Generate the doc")
+
+
+
+find_package(Sphinx REQUIRED)
+
+set(SPHINX_SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/sphinx/source)
+set(SPHINX_BUILD ${CMAKE_CURRENT_BINARY_DIR}/sphinx)
+
+add_custom_target(sphinx ALL
+        COMMAND
+        ${SPHINX_EXECUTABLE} -b html
+        # Tell Breathe where to find the Doxygen output
+        -Dbreathe_projects.PopSift=${DOXYGEN_OUTPUT_DIRECTORY}/xml
+        ${SPHINX_SOURCE} ${SPHINX_BUILD}
+        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+        DEPENDS
+        doxygen
+        # Other docs files you want to track should go here (or in some variable)
+        ${CMAKE_CURRENT_SOURCE_DIR}/sphinx/source/index.rst
+        ${DOXYGEN_INDEX_FILE}
+        #        MAIN_DEPENDENCY ${SPHINX_SOURCE}/conf.py
+        COMMENT "Generating documentation with Sphinx")
\ No newline at end of file
diff --git a/doc/sphinx/Makefile b/doc/sphinx/Makefile
new file mode 100644
index 00000000..d0c3cbf1
--- /dev/null
+++ b/doc/sphinx/Makefile
@@ -0,0 +1,20 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line, and also
+# from the environment for the first two.
+SPHINXOPTS    ?=
+SPHINXBUILD   ?= sphinx-build
+SOURCEDIR     = source
+BUILDDIR      = build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
diff --git a/doc/sphinx/make.bat b/doc/sphinx/make.bat
new file mode 100644
index 00000000..6247f7e2
--- /dev/null
+++ b/doc/sphinx/make.bat
@@ -0,0 +1,35 @@
+@ECHO OFF
+
+pushd %~dp0
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+	set SPHINXBUILD=sphinx-build
+)
+set SOURCEDIR=source
+set BUILDDIR=build
+
+if "%1" == "" goto help
+
+%SPHINXBUILD% >NUL 2>NUL
+if errorlevel 9009 (
+	echo.
+	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+	echo.installed, then set the SPHINXBUILD environment variable to point
+	echo.to the full path of the 'sphinx-build' executable. Alternatively you
+	echo.may add the Sphinx directory to PATH.
+	echo.
+	echo.If you don't have Sphinx installed, grab it from
+	echo.http://sphinx-doc.org/
+	exit /b 1
+)
+
+%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+goto end
+
+:help
+%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+
+:end
+popd
diff --git a/doc/sphinx/requirements.txt b/doc/sphinx/requirements.txt
new file mode 100644
index 00000000..45a2e179
--- /dev/null
+++ b/doc/sphinx/requirements.txt
@@ -0,0 +1,4 @@
+sphinx>=1.9.0
+sphinx_rtd_theme
+sphinxcontrib-bibtex
+breathe
diff --git a/doc/sphinx/source/Doxyfile b/doc/sphinx/source/Doxyfile
new file mode 100644
index 00000000..3a42393f
--- /dev/null
+++ b/doc/sphinx/source/Doxyfile
@@ -0,0 +1,267 @@
+DOXYFILE_ENCODING      = UTF-8
+PROJECT_NAME           = PopSift
+PROJECT_NUMBER         = 1.0.0
+PROJECT_BRIEF          = "A faithful implementation of the SIFT algorithm in CUDA."
+PROJECT_LOGO           = 
+OUTPUT_DIRECTORY       = ../build
+CREATE_SUBDIRS         = NO
+ALLOW_UNICODE_NAMES    = NO
+OUTPUT_LANGUAGE        = English
+BRIEF_MEMBER_DESC      = YES
+REPEAT_BRIEF           = YES
+ABBREVIATE_BRIEF       = "The $name class" "The $name widget" "The $name file" is provides specifies contains represents a an the
+ALWAYS_DETAILED_SEC    = NO
+INLINE_INHERITED_MEMB  = NO
+FULL_PATH_NAMES        = YES
+STRIP_FROM_PATH        = 
+STRIP_FROM_INC_PATH    = 
+SHORT_NAMES            = NO
+JAVADOC_AUTOBRIEF      = NO
+QT_AUTOBRIEF           = NO
+MULTILINE_CPP_IS_BRIEF = NO
+INHERIT_DOCS           = YES
+SEPARATE_MEMBER_PAGES  = NO
+TAB_SIZE               = 4
+ALIASES                = 
+TCL_SUBST              = 
+OPTIMIZE_OUTPUT_FOR_C  = NO
+OPTIMIZE_OUTPUT_JAVA   = NO
+OPTIMIZE_FOR_FORTRAN   = NO
+OPTIMIZE_OUTPUT_VHDL   = NO
+EXTENSION_MAPPING      = 
+MARKDOWN_SUPPORT       = YES
+TOC_INCLUDE_HEADINGS   = 0
+AUTOLINK_SUPPORT       = YES
+BUILTIN_STL_SUPPORT    = NO
+CPP_CLI_SUPPORT        = NO
+SIP_SUPPORT            = NO
+IDL_PROPERTY_SUPPORT   = YES
+DISTRIBUTE_GROUP_DOC   = NO
+GROUP_NESTED_COMPOUNDS = NO
+SUBGROUPING            = YES
+INLINE_GROUPED_CLASSES = NO
+INLINE_SIMPLE_STRUCTS  = NO
+TYPEDEF_HIDES_STRUCT   = NO
+LOOKUP_CACHE_SIZE      = 0
+EXTRACT_ALL            = NO
+EXTRACT_PRIVATE        = NO
+EXTRACT_PACKAGE        = NO
+EXTRACT_STATIC         = NO
+EXTRACT_LOCAL_CLASSES  = YES
+EXTRACT_LOCAL_METHODS  = NO
+EXTRACT_ANON_NSPACES   = NO
+HIDE_UNDOC_MEMBERS     = NO
+HIDE_UNDOC_CLASSES     = NO
+HIDE_FRIEND_COMPOUNDS  = NO
+HIDE_IN_BODY_DOCS      = NO
+INTERNAL_DOCS          = NO
+CASE_SENSE_NAMES       = YES
+HIDE_SCOPE_NAMES       = NO
+HIDE_COMPOUND_REFERENCE= NO
+SHOW_INCLUDE_FILES     = YES
+SHOW_GROUPED_MEMB_INC  = NO
+FORCE_LOCAL_INCLUDES   = NO
+INLINE_INFO            = YES
+SORT_MEMBER_DOCS       = YES
+SORT_BRIEF_DOCS        = NO
+SORT_MEMBERS_CTORS_1ST = NO
+SORT_GROUP_NAMES       = NO
+SORT_BY_SCOPE_NAME     = NO
+STRICT_PROTO_MATCHING  = NO
+GENERATE_TODOLIST      = YES
+GENERATE_TESTLIST      = YES
+GENERATE_BUGLIST       = YES
+GENERATE_DEPRECATEDLIST= YES
+ENABLED_SECTIONS       = 
+MAX_INITIALIZER_LINES  = 30
+SHOW_USED_FILES        = YES
+SHOW_FILES             = YES
+SHOW_NAMESPACES        = YES
+FILE_VERSION_FILTER    = 
+LAYOUT_FILE            = 
+CITE_BIB_FILES         = 
+QUIET                  = NO
+WARNINGS               = YES
+WARN_IF_UNDOCUMENTED   = YES
+WARN_IF_DOC_ERROR      = YES
+WARN_NO_PARAMDOC       = NO
+WARN_AS_ERROR          = NO
+WARN_FORMAT            = "$file:$line: $text"
+WARN_LOGFILE           = 
+INPUT                  = ../../../src
+INPUT_ENCODING         = UTF-8
+FILE_PATTERNS          = *.c *.cc *.cxx *.cpp *.c++ *.java *.ii *.ixx *.ipp *.i++ *.inl *.idl *.ddl *.odl *.h *.hh *.hxx *.hpp *.h++ *.cs *.d *.php *.php4 *.php5 *.phtml *.inc *.m *.markdown *.md *.mm *.dox *.py *.pyw *.f90 *.f95 *.f03 *.f08 *.f *.for *.tcl *.vhd *.vhdl *.ucf *.qsf
+RECURSIVE              = YES
+EXCLUDE                = 
+EXCLUDE_SYMLINKS       = NO
+EXCLUDE_PATTERNS       = */.git/* */.svn/* */.hg/* */CMakeFiles/* */_CPack_Packages/* DartConfiguration.tcl CMakeLists.txt CMakeCache.txt
+EXCLUDE_SYMBOLS        = 
+EXAMPLE_PATH           = 
+EXAMPLE_PATTERNS       = *
+EXAMPLE_RECURSIVE      = NO
+IMAGE_PATH             = 
+INPUT_FILTER           = 
+FILTER_PATTERNS        = 
+FILTER_SOURCE_FILES    = NO
+FILTER_SOURCE_PATTERNS = 
+SOURCE_BROWSER         = NO
+INLINE_SOURCES         = NO
+STRIP_CODE_COMMENTS    = YES
+REFERENCED_BY_RELATION = NO
+REFERENCES_RELATION    = NO
+REFERENCES_LINK_SOURCE = YES
+SOURCE_TOOLTIPS        = YES
+USE_HTAGS              = NO
+VERBATIM_HEADERS       = YES
+CLANG_ASSISTED_PARSING = NO
+CLANG_OPTIONS          = 
+ALPHABETICAL_INDEX     = YES
+COLS_IN_ALPHA_INDEX    = 5
+IGNORE_PREFIX          = 
+GENERATE_HTML          = NO
+HTML_OUTPUT            = html
+HTML_FILE_EXTENSION    = .html
+HTML_HEADER            = 
+HTML_FOOTER            = 
+HTML_STYLESHEET        = 
+HTML_EXTRA_STYLESHEET  = 
+HTML_EXTRA_FILES       = 
+HTML_COLORSTYLE_HUE    = 220
+HTML_COLORSTYLE_SAT    = 100
+HTML_COLORSTYLE_GAMMA  = 80
+HTML_TIMESTAMP         = NO
+HTML_DYNAMIC_SECTIONS  = NO
+HTML_INDEX_NUM_ENTRIES = 100
+GENERATE_DOCSET        = NO
+DOCSET_FEEDNAME        = "Doxygen generated docs"
+DOCSET_BUNDLE_ID       = org.doxygen.Project
+DOCSET_PUBLISHER_ID    = org.doxygen.Publisher
+DOCSET_PUBLISHER_NAME  = Publisher
+GENERATE_HTMLHELP      = NO
+CHM_FILE               = 
+HHC_LOCATION           = 
+GENERATE_CHI           = NO
+CHM_INDEX_ENCODING     = 
+BINARY_TOC             = NO
+TOC_EXPAND             = NO
+GENERATE_QHP           = NO
+QCH_FILE               = 
+QHP_NAMESPACE          = org.doxygen.Project
+QHP_VIRTUAL_FOLDER     = doc
+QHP_CUST_FILTER_NAME   = 
+QHP_CUST_FILTER_ATTRS  = 
+QHP_SECT_FILTER_ATTRS  = 
+QHG_LOCATION           = 
+GENERATE_ECLIPSEHELP   = NO
+ECLIPSE_DOC_ID         = org.doxygen.Project
+DISABLE_INDEX          = NO
+GENERATE_TREEVIEW      = YES
+ENUM_VALUES_PER_LINE   = 4
+TREEVIEW_WIDTH         = 250
+EXT_LINKS_IN_WINDOW    = NO
+FORMULA_FONTSIZE       = 10
+FORMULA_TRANSPARENT    = YES
+USE_MATHJAX            = NO
+MATHJAX_FORMAT         = HTML-CSS
+MATHJAX_RELPATH        = http://cdn.mathjax.org/mathjax/latest
+MATHJAX_EXTENSIONS     = 
+MATHJAX_CODEFILE       = 
+SEARCHENGINE           = YES
+SERVER_BASED_SEARCH    = NO
+EXTERNAL_SEARCH        = NO
+SEARCHENGINE_URL       = 
+SEARCHDATA_FILE        = searchdata.xml
+EXTERNAL_SEARCH_ID     = 
+EXTRA_SEARCH_MAPPINGS  = 
+GENERATE_LATEX         = NO
+LATEX_OUTPUT           = latex
+LATEX_CMD_NAME         = latex
+MAKEINDEX_CMD_NAME     = makeindex
+COMPACT_LATEX          = NO
+PAPER_TYPE             = a4
+EXTRA_PACKAGES         = 
+LATEX_HEADER           = 
+LATEX_FOOTER           = 
+LATEX_EXTRA_STYLESHEET = 
+LATEX_EXTRA_FILES      = 
+PDF_HYPERLINKS         = YES
+USE_PDFLATEX           = YES
+LATEX_BATCHMODE        = NO
+LATEX_HIDE_INDICES     = NO
+LATEX_SOURCE_CODE      = NO
+LATEX_BIB_STYLE        = plain
+LATEX_TIMESTAMP        = NO
+GENERATE_RTF           = NO
+RTF_OUTPUT             = rtf
+COMPACT_RTF            = NO
+RTF_HYPERLINKS         = NO
+RTF_STYLESHEET_FILE    = 
+RTF_EXTENSIONS_FILE    = 
+RTF_SOURCE_CODE        = NO
+GENERATE_MAN           = NO
+MAN_OUTPUT             = man
+MAN_EXTENSION          = .3
+MAN_SUBDIR             = 
+MAN_LINKS              = NO
+GENERATE_XML           = YES
+XML_OUTPUT             = xml
+XML_PROGRAMLISTING     = YES
+GENERATE_DOCBOOK       = NO
+DOCBOOK_OUTPUT         = docbook
+DOCBOOK_PROGRAMLISTING = NO
+GENERATE_AUTOGEN_DEF   = NO
+GENERATE_PERLMOD       = NO
+PERLMOD_LATEX          = NO
+PERLMOD_PRETTY         = YES
+PERLMOD_MAKEVAR_PREFIX = 
+ENABLE_PREPROCESSING   = YES
+MACRO_EXPANSION        = NO
+EXPAND_ONLY_PREDEF     = NO
+SEARCH_INCLUDES        = YES
+INCLUDE_PATH           = 
+INCLUDE_FILE_PATTERNS  = 
+PREDEFINED             = 
+EXPAND_AS_DEFINED      = 
+SKIP_FUNCTION_MACROS   = YES
+TAGFILES               = 
+GENERATE_TAGFILE       = 
+ALLEXTERNALS           = NO
+EXTERNAL_GROUPS        = YES
+EXTERNAL_PAGES         = YES
+PERL_PATH              = /usr/bin/perl
+CLASS_DIAGRAMS         = YES
+MSCGEN_PATH            = 
+DIA_PATH               = 
+HIDE_UNDOC_RELATIONS   = YES
+HAVE_DOT               = YES
+DOT_NUM_THREADS        = 0
+DOT_FONTNAME           = Helvetica
+DOT_FONTSIZE           = 10
+DOT_FONTPATH           = 
+CLASS_GRAPH            = YES
+COLLABORATION_GRAPH    = YES
+GROUP_GRAPHS           = YES
+UML_LOOK               = NO
+UML_LIMIT_NUM_FIELDS   = 10
+TEMPLATE_RELATIONS     = NO
+INCLUDE_GRAPH          = YES
+INCLUDED_BY_GRAPH      = YES
+CALL_GRAPH             = NO
+CALLER_GRAPH           = NO
+GRAPHICAL_HIERARCHY    = YES
+DIRECTORY_GRAPH        = YES
+DOT_IMAGE_FORMAT       = png
+INTERACTIVE_SVG        = NO
+DOT_PATH               = /usr/bin
+DOTFILE_DIRS           = 
+MSCFILE_DIRS           = 
+DIAFILE_DIRS           = 
+PLANTUML_JAR_PATH      = 
+PLANTUML_CFG_FILE      = 
+PLANTUML_INCLUDE_PATH  = 
+DOT_GRAPH_MAX_NODES    = 50
+MAX_DOT_GRAPH_DEPTH    = 0
+DOT_TRANSPARENT        = NO
+DOT_MULTI_TARGETS      = YES
+GENERATE_LEGEND        = YES
+DOT_CLEANUP            = YES
diff --git a/doc/sphinx/source/about/about.rst b/doc/sphinx/source/about/about.rst
new file mode 100644
index 00000000..52e76402
--- /dev/null
+++ b/doc/sphinx/source/about/about.rst
@@ -0,0 +1,53 @@
+About
+=====
+
+
+
+License
+~~~~~~~
+
+PopSift is licensed under `MPLv2 license <https://www.mozilla.org/en-US/MPL/2.0/>`_.
+
+More info about the license and what you can do with the code can be found at `tldrlegal website <https://tldrlegal.com/license/mozilla-public-license-2.0-(mpl-2)>`_
+
+SIFT was patented in the United States from 1999-03-08 to 2020-03-28.
+See the `patent link <https://patents.google.com/patent/US6711293B1/en>`_ for more information.
+The PopSift license only concerns the PopSift source code and does not release users of this code from any requirements that may arise from patents.
+
+Contact us
+~~~~~~~~~~
+
+You can contact us on the public mailing list at
+`alicevision@googlegroups.com <mailto:alicevision@googlegroups.com>`_
+
+You can also contact us privately at
+`alicevision-team@googlegroups.com <mailto:alicevision-team@googlegroups.com>`_
+
+
+Cite us
+~~~~~~~
+
+If you want to cite this work in your publication, please use the following
+
+.. code:: bibtex
+
+    @inproceedings{Griwodz2018Popsift,
+        author = {Griwodz, Carsten and Calvet, Lilian and Halvorsen, P{\aa}l},
+        title = {Popsift: A Faithful SIFT Implementation for Real-time Applications},
+        booktitle = {Proceedings of the 9th {ACM} Multimedia Systems Conference},
+        series = {MMSys '18},
+        year = {2018},
+        isbn = {978-1-4503-5192-8},
+        location = {Amsterdam, Netherlands},
+        pages = {415--420},
+        numpages = {6},
+        doi = {10.1145/3204949.3208136},
+        acmid = {3208136},
+        publisher = {ACM},
+        address = {New York, NY, USA},
+    }
+
+Acknowledgements
+~~~~~~~~~~~~~~~~
+
+This has been developed in the context of the `European project POPART <https://alicevision.org/popart/>`_ funded by the European Union’s Horizon 2020 research and innovation programme under `grant agreement No 644874 <https://cordis.europa.eu/project/id/644874>`_.
\ No newline at end of file
diff --git a/doc/sphinx/source/api/api.rst b/doc/sphinx/source/api/api.rst
new file mode 100644
index 00000000..5e9ef3cc
--- /dev/null
+++ b/doc/sphinx/source/api/api.rst
@@ -0,0 +1,25 @@
+API References
+==============
+
+
+Main Classes
+~~~~~~~~~~~~
+
+.. doxygenclass:: SiftJob
+   :members:
+
+.. doxygenclass:: PopSift
+   :members:
+
+.. doxygenstruct:: popsift::Config
+   :members:
+
+
+Functions
+~~~~~~~~~
+
+
+
+
+Utility Classes
+~~~~~~~~~~~~~~~
diff --git a/doc/sphinx/source/api/usage.rst b/doc/sphinx/source/api/usage.rst
new file mode 100644
index 00000000..b226d240
--- /dev/null
+++ b/doc/sphinx/source/api/usage.rst
@@ -0,0 +1,13 @@
+Library usage
+=============
+
+
+
+
+Detection
+~~~~~~~~~
+
+
+
+
+
diff --git a/doc/sphinx/source/biblio.bib b/doc/sphinx/source/biblio.bib
new file mode 100644
index 00000000..f57013dd
--- /dev/null
+++ b/doc/sphinx/source/biblio.bib
@@ -0,0 +1,25 @@
+@inproceedings{Griwodz2018Popsift,
+    author = {Griwodz, Carsten and Calvet, Lilian and Halvorsen, P{\aa}l},
+    title = {Popsift: A Faithful SIFT Implementation for Real-time Applications},
+    booktitle = {Proceedings of the 9th {ACM} Multimedia Systems Conference},
+    series = {MMSys '18},
+    year = {2018},
+    isbn = {978-1-4503-5192-8},
+    location = {Amsterdam, Netherlands},
+    pages = {415--420},
+    numpages = {6},
+    doi = {10.1145/3204949.3208136},
+    acmid = {3208136},
+    publisher = {ACM},
+    address = {New York, NY, USA},
+}
+
+@article{Lowe2003,
+    author = {Lowe, DG},
+    doi = {10.1023/B:VISI.0000029664.99615.94},
+    file = {:home/alcov/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Lowe - 2004 - Distinctive image features from scale-invariant keypoints.pdf:pdf},
+    journal = {International journal of computer vision},
+    pages = {1--29},
+    title = {{Distinctive image features from scale-invariant keypoints}},
+    year = {2004}
+}
diff --git a/doc/sphinx/source/bibliography.rst b/doc/sphinx/source/bibliography.rst
new file mode 100644
index 00000000..3e04dc7d
--- /dev/null
+++ b/doc/sphinx/source/bibliography.rst
@@ -0,0 +1,5 @@
+Bibliography
+============
+
+.. bibliography:: biblio.bib
+   :all:
\ No newline at end of file
diff --git a/doc/sphinx/source/conf.py b/doc/sphinx/source/conf.py
new file mode 100644
index 00000000..514bafe2
--- /dev/null
+++ b/doc/sphinx/source/conf.py
@@ -0,0 +1,85 @@
+# Configuration file for the Sphinx documentation builder.
+#
+# This file only contains a selection of the most common options. For a full
+# list see the documentation:
+# https://www.sphinx-doc.org/en/master/usage/configuration.html
+
+# -- Path setup --------------------------------------------------------------
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#
+# import os
+# import sys
+# sys.path.insert(0, os.path.abspath('.'))
+
+import subprocess, os
+
+def configure_doxyfile(input_dir, output_dir):
+    with open('Doxyfile.in', 'r') as file :
+        filedata = file.read()
+
+    filedata = filedata.replace('@DOXYGEN_INPUT_DIR@', input_dir)
+    filedata = filedata.replace('@DOXYGEN_OUTPUT_DIR@', output_dir)
+
+    with open('Doxyfile', 'w') as file:
+        file.write(filedata)
+
+# Check if we're running on Read the Docs' servers
+read_the_docs_build = os.environ.get('READTHEDOCS', None) == 'True'
+
+breathe_projects = {}
+
+if read_the_docs_build:
+    # run doxygen before to generate the xml
+    output_dir = '../build'
+    subprocess.call('doxygen', shell=True)
+    breathe_projects['PopSift'] = output_dir + '/xml'
+
+
+
+
+# -- Project information -----------------------------------------------------
+
+project = u'PopSift'
+copyright = '2020, AliceVision'
+author = 'AliceVision'
+
+
+# -- General configuration ---------------------------------------------------
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = ['breathe', 'sphinxcontrib.bibtex']
+bibtex_bibfiles = ['biblio.bib']
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# This pattern also affects html_static_path and html_extra_path.
+exclude_patterns = []
+
+source_suffix = ['.rst', '.md']
+
+# -- Options for HTML output -------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  See the documentation for
+# a list of builtin themes.
+#
+html_theme = 'sphinx_rtd_theme'
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+
+# breathe_projects = {
+#     "PopSift": "../../doxygen/xml/",
+# }
+
+# Breathe Configuration
+breathe_default_project = 'PopSift'
\ No newline at end of file
diff --git a/doc/sphinx/source/index.rst b/doc/sphinx/source/index.rst
new file mode 100644
index 00000000..457d54ca
--- /dev/null
+++ b/doc/sphinx/source/index.rst
@@ -0,0 +1,36 @@
+PopSift Library
+===============
+
+PopSift is an open-source implementation of the SIFT algorithm in CUDA :cite:`Griwodz2018Popsift`.
+PopSift tries to stick as closely as possible to David Lowe's famous paper :cite:`Lowe2003`, while extracting features from an image in real-time at least on an NVidia GTX 980 Ti GPU.
+
+
+.. toctree::
+   :maxdepth: 2
+   :hidden:
+   :caption: Install
+
+   install/install
+
+.. toctree::
+   :maxdepth: 2
+   :hidden:
+   :caption: API Documentation
+
+   api/usage
+   api/api
+
+
+.. toctree::
+   :maxdepth: 2
+   :hidden:
+   :caption: About
+
+   about/about
+
+.. toctree::
+   :maxdepth: 2
+   :hidden:
+   :caption: References
+
+   bibliography
diff --git a/doc/sphinx/source/install/install.rst b/doc/sphinx/source/install/install.rst
new file mode 100644
index 00000000..da6d1dde
--- /dev/null
+++ b/doc/sphinx/source/install/install.rst
@@ -0,0 +1,261 @@
+Requirements
+============
+
+Hardware
+~~~~~~~~
+
+PopSift is a GPU implementation that requires an NVIDIA GPU card with a CUDA compute capability >= 3.0 (including, e.g. the GT 650M).
+The code is originally developed with the compute capability 5.2 card GTX 980 Ti in mind.
+
+You can check your `NVIDIA GPU card CC support here <https://github.com/tpruvot/ccminer/wiki/Compatibility>`_ or on the `NVIDIA dev page <https://developer.nvidia.com/cuda-gpus>`_.
+If you do not have an NVIDIA card you will still be able to compile and use the CPU version.
+
+Here are the minimum hardware requirements for PopSift:
+
++--------------------------------------------------------------------------+
+| Minimum requirements                                                     |
++===================+======================================================+
+| Operating systems | Windows x64, Linux, macOS                            |
++-------------------+------------------------------------------------------+
+| CPU               | Recent Intel or AMD cpus                             |
++-------------------+------------------------------------------------------+
+| RAM Memory        | 8 GB                                                 |
++-------------------+------------------------------------------------------+
+| Hard Drive        | No particular requirements                           |
++-------------------+------------------------------------------------------+
+| GPU               | NVIDIA CUDA-enabled GPU (compute capability >= 3.5)  |
++-------------------+------------------------------------------------------+
+
+
+
+Software
+~~~~~~~~
+
+The core library depends only on Cuda >= 7.0
+
+The library includes a few sample applications that show how to use the library.
+They require
+
+* Boost >= 1.55 (required components atomic, chrono, date-time, system, thread)
+
+* [optionally] DevIL (libdevil-dev) can be used to load a broader range of image formats, otherwise only pgm is supported.
+
+
+
+------------
+
+
+vcpkg
+=====
+
+`vcpkg <https://github.com/microsoft/vcpkg>`_ is a cross-platform (Windows, Linux and MacOS), open-source package manager created by Microsoft.
+
+Starting from v0.9, PopSift package can be installed on each platform via vcpkg.
+To install the library:
+
+.. code:: shell
+
+  vcpkg install popsift --triplet <arch>
+
+where :code:`<arch>` is the architecture to build for (e.g. :code:`x64-windows`, :code:`x64-linux-dynamic` etc.)
+
+If you want to install the demo applications that come with the library you can add the option :code:`apps`:
+
+.. code:: shell
+
+  vcpkg install popsift[apps] --triplet <arch>
+
+------------
+
+Building the library
+====================
+
+Building tools
+~~~~~~~~~~~~~~
+
+Required tools:
+
+* CMake >= 3.14 to build the code
+* Git
+* C/C++ compiler supporting the C++11 standard (gcc >= 4.6 or visual studio or clang)
+* CUDA >= 7.0
+
+
+
+Dependencies
+~~~~~~~~~~~~
+
+vcpkg
++++++
+
+vcpkg can be used to install all the dependencies on all the supported platforms.
+This is particularly useful on Windows.
+To install the dependencies:
+
+.. code:: shell
+
+  vcpkg install cuda devil boost-system boost-program-options boost-thread boost-filesystem
+
+You can add the flag :code:`--triplet` to specify the architecture and the version you want to build.
+For example:
+
+* :code:`--triplet x64-windows` will build the dynamic version for Windows 64 bit
+
+* :code:`--triplet x64-windows-static` will build the static version for Windows 64 bit
+
+* :code:`--triplet x64-linux-dynamic` will build the dynamic version for Linux 64 bit
+
+and so on.
+More information can be found `here <https://vcpkg.readthedocs.io/en/latest/examples/overlay-triplets-linux-dynamic>`_
+
+Linux
++++++
+
+On Linux you can install from the package manager:
+
+For Ubuntu/Debian package system:
+
+.. code:: shell
+
+    sudo apt-get install g++ git-all libboost-all-dev libdevil-dev
+
+
+For CentOS package system:
+
+.. code:: shell
+
+    sudo yum install gcc-c++ git boost-devel devil
+
+
+MacOS
++++++
+
+On macOS using `Homebrew <https://brew.sh/>`_ install the following packages:
+
+.. code:: shell
+
+    brew install git boost devil
+
+
+Getting the sources
+~~~~~~~~~~~~~~~~~~~~
+
+.. code:: shell
+
+   git clone https://github.com/alicevision/PopSift.git
+
+
+CMake configuration
+~~~~~~~~~~~~~~~~~~~
+
+From PopSift root folder you can run cmake:
+
+.. code:: shell
+
+    mkdir build && cd build
+    cmake ..
+    make -j `nproc`
+
+On Windows add :code:`-G "Visual Studio 16 2019" -A x64` to generate the Visual Studio solution according to your VS version (`see CMake documentation <https://cmake.org/cmake/help/latest/manual/cmake-generators.7.html#ide-build-tool-generators>`_).
+
+If you are using the dependencies built with VCPKG you need to pass :code:`-DCMAKE_TOOLCHAIN_FILE=path/to/vcpkg/scripts/buildsystems/vcpkg.cmake` at cmake step to let it know where to find the dependencies.
+
+
+CMake options
++++++++++++++
+
+CMake configuration can be controlled by changing the values of the following variables (here with their default value)
+
+
+* :code:`BUILD_SHARED_LIBS:BOOL=ON` to enable/disable the building of shared libraries
+
+* :code:`PopSift_BUILD_EXAMPLES:BOOL=ON` to enable/disable the building of applications
+
+* :code:`PopSift_BUILD_DOC:BOOL=OFF` to enable/disable building this documentation and the Doxygen one.
+
+For example, if you do not want to build the applications, you have to pass :code:`-DPopSift_BUILD_EXAMPLES:BOOL=OFF` and so on.
+
+
+------------
+
+
+PopSift as third party
+======================
+
+When you install PopSift a file :code:`PopSiftConfig.cmake` is installed in :code:`<install_prefix>/lib/cmake/PopSift/` that allows you to import the library in your CMake project.
+In your :code:`CMakeLists.txt` file you can add the dependency in this way:
+
+.. code-block:: cmake
+  :linenos:
+
+  # Find the package from the PopSiftConfig.cmake
+  # in <prefix>/lib/cmake/PopSift/. Under the namespace PopSift::
+  # it exposes the target PopSift that allows you to compile
+  # and link with the library
+  find_package(PopSift CONFIG REQUIRED)
+  ...
+  # suppose you want to try it out in an executable
+  add_executable(popsiftTest yourfile.cpp)
+  # add link to the library
+  target_link_libraries(popsiftTest PUBLIC PopSift::PopSift)
+
+Then, in order to build just pass the location of :code:`PopSiftConfig.cmake` from the cmake command line:
+
+.. code:: shell
+
+    cmake .. -DPopSift_DIR=<install_prefix>/lib/cmake/PopSift/
+
+
+------------
+
+
+
+Docker image
+============
+
+A docker image can be built using the Ubuntu based :code:`Dockerfile`, which is based on nvidia/cuda image (https://hub.docker.com/r/nvidia/cuda/ )
+
+
+Building the dependency image
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+We provide a :code:`Dockerfile_deps` containing a cuda image with all the necessary PopSift dependencies installed.
+
+A parameter :code:`CUDA_TAG` can be passed when building the image to select the cuda version.
+Similarly, :code:`OS_TAG` can be passed to select the Ubuntu version.
+By default, :code:`CUDA_TAG=10.2` and :code:`OS_TAG=18.04`
+
+For example to create the dependency image based on ubuntu 18.04 with cuda 8.0 for development, use
+
+.. code:: shell
+
+    docker build --build-arg CUDA_TAG=8.0 --tag alicevision/popsift-deps:cuda8.0-ubuntu18.04 -f Dockerfile_deps .
+
+The complete list of available tags can be found on the nvidia `dockerhub page <https://hub.docker.com/r/nvidia/cuda/>`_
+
+
+Building the PopSift image
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Once you built the dependency image, you can build the popsift image in the same manner using :code:`Dockerfile`:
+
+.. code:: shell
+
+    docker build --tag alicevision/popsift:cuda8.0-ubuntu18.04 .
+
+
+Running the PopSift image
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In order to run the image nvidia docker is needed: see the `installation instruction <https://github.com/nvidia/nvidia-docker/wiki/Installation-(version-2.0)>`_.
+Once installed, the docker can be run, e.g., in interactive mode with
+
+.. code:: shell
+
+    docker run -it --runtime=nvidia alicevision/popsift:cuda8.0-ubuntu18.04
+
+
+Official images on DockerHub
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Check the docker hub `PopSift repository <https://hub.docker.com/repository/docker/alicevision/popsift>`_ for the available images.
\ No newline at end of file
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
old mode 100755
new mode 100644
index e12a32f9..ff3b3681
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,120 +1,157 @@
-set(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR})
-
-CUDA_INCLUDE_DIRECTORIES(${Boost_INCLUDE_DIRS})
-
-CUDA_ADD_LIBRARY(popsift STATIC
-	popsift/popsift.cu popsift/popsift.h
-	popsift/features.cu popsift/features.h
-	popsift/sift_constants.cu popsift/sift_constants.h
-	popsift/sift_conf.cu popsift/sift_conf.h
-	popsift/gauss_filter.cu popsift/gauss_filter.h
-	popsift/s_image.cu popsift/s_image.h
-	popsift/sift_pyramid.cu popsift/sift_pyramid.h
-	popsift/sift_octave.cu popsift/sift_octave.h
-	popsift/s_pyramid_build.cu
-	popsift/s_pyramid_build_aa.cu popsift/s_pyramid_build_aa.h
-	popsift/s_pyramid_build_ai.cu popsift/s_pyramid_build_ai.h
-	popsift/s_pyramid_build_ra.cu popsift/s_pyramid_build_ra.h
-	popsift/s_pyramid_fixed.cu
-	popsift/sift_extremum.h
-	popsift/sift_extremum.cu popsift/s_extrema.cu
-	popsift/s_orientation.cu
+# Do not specify SHARED or STATIC in add_library. Let the variable BUILD_SHARED_LIBS determine this.
+
+add_library(popsift
+        popsift/popsift.cu popsift/popsift.h
+        popsift/features.cu popsift/features.h
+        popsift/sift_constants.cu popsift/sift_constants.h
+        popsift/sift_conf.cu popsift/sift_conf.h
+        popsift/gauss_filter.cu popsift/gauss_filter.h
+        popsift/s_image.cu popsift/s_image.h
+        popsift/sift_pyramid.cu popsift/sift_pyramid.h
+        popsift/sift_octave.cu popsift/sift_octave.h
+        popsift/s_pyramid_build.cu
+        popsift/s_pyramid_build_aa.cu popsift/s_pyramid_build_aa.h
+        popsift/s_pyramid_build_ai.cu popsift/s_pyramid_build_ai.h
+        popsift/s_pyramid_build_ra.cu popsift/s_pyramid_build_ra.h
+        popsift/s_pyramid_fixed.cu
+        popsift/sift_extremum.h
+        popsift/sift_extremum.cu popsift/s_extrema.cu
+        popsift/s_orientation.cu
         popsift/s_filtergrid.cu
-	popsift/sift_desc.cu
-	popsift/s_desc_loop.cu popsift/s_desc_loop.h
-	popsift/s_desc_iloop.cu popsift/s_desc_iloop.h
-	popsift/s_desc_grid.cu popsift/s_desc_grid.h
-	popsift/s_desc_igrid.cu popsift/s_desc_igrid.h
-	popsift/s_desc_notile.cu popsift/s_desc_notile.h
-	popsift/s_desc_norm_rs.h
-	popsift/s_desc_norm_l2.h
-	popsift/s_desc_normalize.h
-	popsift/s_gradiant.h
-	popsift/s_solve.h
-	popsift/common/assist.cu popsift/common/assist.h
-	popsift/common/clamp.h
-	popsift/common/plane_2d.cu popsift/common/plane_2d.h
-	popsift/common/write_plane_2d.cu popsift/common/write_plane_2d.h
-	popsift/common/debug_macros.cu popsift/common/debug_macros.h
-	popsift/common/device_prop.cu popsift/common/device_prop.h
-	popsift/common/warp_bitonic_sort.h
-	popsift/common/excl_blk_prefix_sum.h
-	popsift/common/vec_macros.h
-	popsift/common/clamp.h )
-
-# BUILD_INTERFACE allows to include the directory with source only when target is
-# built in the building tree (ie, not from an install location)
-target_include_directories(popsift 
-            PUBLIC ${Boost_INCLUDE_DIRS} ${CUDA_INCLUDE_DIRS}
-            "$<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}>")
-
+        popsift/sift_desc.cu
+        popsift/s_desc_loop.cu popsift/s_desc_loop.h
+        popsift/s_desc_iloop.cu popsift/s_desc_iloop.h
+        popsift/s_desc_grid.cu popsift/s_desc_grid.h
+        popsift/s_desc_igrid.cu popsift/s_desc_igrid.h
+        popsift/s_desc_notile.cu popsift/s_desc_notile.h
+        popsift/s_desc_norm_rs.h
+        popsift/s_desc_norm_l2.h
+        popsift/s_desc_normalize.h
+        popsift/s_gradiant.h
+        popsift/s_solve.h
+        popsift/common/assist.cu popsift/common/assist.h
+        popsift/common/clamp.h
+        popsift/common/plane_2d.cu popsift/common/plane_2d.h
+        popsift/common/write_plane_2d.cu popsift/common/write_plane_2d.h
+        popsift/common/debug_macros.cu popsift/common/debug_macros.h
+        popsift/common/device_prop.cu popsift/common/device_prop.h
+        popsift/common/warp_bitonic_sort.h
+        popsift/common/excl_blk_prefix_sum.h
+        popsift/common/vec_macros.h
+        popsift/common/clamp.h)
+
+# The CUDA runtime and the threading library are linked PUBLIC so they propagate to consumers.
+target_link_libraries(popsift
+        PUBLIC
+        CUDA::cudart
+        Threads::Threads)
+
+# NVTX is linked only when PopSift_USE_NVTX_PROFILING is enabled.
+if(PopSift_USE_NVTX_PROFILING)
+    target_link_libraries(popsift PUBLIC CUDA::nvtx3)
+endif()
 
 set_target_properties(popsift PROPERTIES VERSION ${PROJECT_VERSION})
 set_target_properties(popsift PROPERTIES DEBUG_POSTFIX "d")
+set_target_properties(popsift PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
 
-# cannot use PRIVATE here as there is a bug in FindCUDA and CUDA_ADD_LIBRARY
-# https://gitlab.kitware.com/cmake/cmake/issues/16097
- target_link_libraries(popsift ${Boost_LIBRARIES} ${CUDA_CUDADEVRT_LIBRARY})
+# build directory containing the automatically generated files
+set(popsift_generated_dir "${CMAKE_CURRENT_BINARY_DIR}/generated")
 
+# BUILD_INTERFACE allows to include the directory with source only when target is
+# built in the building tree (ie, not from an install location)
+# The CUDA install dir variable has changed from the old CUDA_INCLUDE_DIRS to the new CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES
+target_include_directories(popsift
+            PUBLIC
+            $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/src>
+            $<BUILD_INTERFACE:${popsift_generated_dir}>
+            $<BUILD_INTERFACE:${popsift_generated_dir}/popsift>
+            $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/>
+	    ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}
+	    )
 
 # EXPORTING THE LIBRARY
 #
 # place to put the cmake-related files
-set(config_install_dir "lib/cmake/${PROJECT_NAME}")
+set(popsift_config_install_dir "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}")
 # include directory for install
-set(include_install_dir "include")
+set(popsift_include_install_dir "${CMAKE_INSTALL_INCLUDEDIR}")
+# the name for the generated header version file
+set(popsift_version_header_name "${popsift_generated_dir}/popsift/version.hpp")
+# the name for the generated config.hpp
+set(popsift_config_header_name "${popsift_generated_dir}/popsift/sift_config.h")
+# where to install the generated files
+set(popsift_install_dir_generated "${popsift_include_install_dir}/popsift")
 
-# build directory containing the generated files
-set(generated_dir "${CMAKE_CURRENT_BINARY_DIR}/generated")
 
 # Configuration
-set(version_config "${generated_dir}/${PROJECT_NAME}ConfigVersion.cmake")
-set(project_config "${generated_dir}/${PROJECT_NAME}Config.cmake")
-set(targets_export_name "${PROJECT_NAME}Targets")
-set(namespace "${PROJECT_NAME}::")
+set(popsift_cmake_version_config "${popsift_generated_dir}/${PROJECT_NAME}ConfigVersion.cmake")
+set(popsift_cmake_project_config "${popsift_generated_dir}/${PROJECT_NAME}Config.cmake")
+set(popsift_targets_export_name "${PROJECT_NAME}Targets")
+set(popsift_namespace "${PROJECT_NAME}::")
 
 # Include module with fuction 'write_basic_package_version_file'
 include(CMakePackageConfigHelpers)
 
 # Configure '<PROJECT-NAME>ConfigVersion.cmake'
 # Note: major version number must be the same as requested
-write_basic_package_version_file("${version_config}" COMPATIBILITY SameMajorVersion)
+write_basic_package_version_file("${popsift_cmake_version_config}" COMPATIBILITY SameMajorVersion)
 
 # Configure '<PROJECT-NAME>Config.cmake'
 # Use variables:
-#   * targets_export_name
+#   * popsift_targets_export_name
 #   * PROJECT_NAME
-configure_package_config_file("cmake/Config.cmake.in"
-                              "${project_config}"
-                              INSTALL_DESTINATION "${config_install_dir}")
+configure_package_config_file("${PROJECT_SOURCE_DIR}/cmake/Config.cmake.in"
+                              "${popsift_cmake_project_config}"
+                              INSTALL_DESTINATION "${popsift_config_install_dir}")
+
+# version file
+configure_file("${PROJECT_SOURCE_DIR}/cmake/version.hpp.in" ${popsift_version_header_name} @ONLY)
+# config file
+configure_file("${PROJECT_SOURCE_DIR}/cmake/sift_config.h.in" ${popsift_config_header_name} @ONLY)
+
 
 # Targets:
 #   * <prefix>/lib/libpopsift.a
 #   * header location after install: <prefix>/include/
 #   * headers can be included by C++ code `#include <popsift/popsift.h>`
 install(TARGETS popsift
-        EXPORT "${targets_export_name}"
-        LIBRARY DESTINATION "lib"
-        ARCHIVE DESTINATION "lib"
-        RUNTIME DESTINATION "bin"
-        INCLUDES DESTINATION "${include_install_dir}")
+        EXPORT "${popsift_targets_export_name}"
+        LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}"
+        ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}"
+        RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}"
+        INCLUDES DESTINATION "${popsift_include_install_dir}")
 
 # Headers:
 install(DIRECTORY "popsift"
-        DESTINATION "${include_install_dir}"
+        DESTINATION "${popsift_include_install_dir}"
+        FILES_MATCHING PATTERN "*.hpp" PATTERN "*.h")
+
+install(DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/popsift"
+        DESTINATION "${popsift_include_install_dir}"
         FILES_MATCHING PATTERN "*.hpp" PATTERN "*.h")
 
 # Config
 #   * <prefix>/lib/cmake/${PROJECT_NAME}/${PROJECT_NAME}Config.cmake
 #   * <prefix>/lib/cmake/${PROJECT_NAME}${PROJECT_NAME}ConfigVersion.cmake
-install(FILES "${project_config}" "${version_config}"
-        DESTINATION "${config_install_dir}")
+install(FILES "${popsift_cmake_project_config}" "${popsift_cmake_version_config}"
+        DESTINATION "${popsift_config_install_dir}")
 
 # Config
 #   * <prefix>/lib/cmake/${PROJECT_NAME}/${PROJECT_NAME}Targets.cmake
-install(EXPORT "${targets_export_name}"
-        NAMESPACE "${namespace}"
-        DESTINATION "${config_install_dir}")
+install(EXPORT "${popsift_targets_export_name}"
+        NAMESPACE "${popsift_namespace}"
+        DESTINATION "${popsift_config_install_dir}")
+
+# install the config and version generated files
+install( FILES "${popsift_config_header_name}"
+		DESTINATION "${popsift_install_dir_generated}")
+
+install( FILES "${popsift_version_header_name}"
+		DESTINATION "${popsift_install_dir_generated}")
+
+# Generate ${PROJECT_NAME}Targets.cmake in the build directory to use the library without installing it
+export(TARGETS popsift FILE "${popsift_generated_dir}/${popsift_targets_export_name}.cmake")
 
 if(PopSift_BUILD_EXAMPLES)
   add_subdirectory(application)
diff --git a/src/application/CMakeLists.txt b/src/application/CMakeLists.txt
index 468b6437..2379c57d 100755
--- a/src/application/CMakeLists.txt
+++ b/src/application/CMakeLists.txt
@@ -1,5 +1,29 @@
-cmake_minimum_required(VERSION 3.0)
-project(PopsiftDemo)
+if(NOT CMAKE_SOURCE_DIR STREQUAL PROJECT_SOURCE_DIR)
+  # I am the top-level project, i.e. I am not being included by another project
+  cmake_minimum_required(VERSION 3.12)
+  project(PopsiftDemo LANGUAGES CXX)
+
+  option(PopSift_BOOST_USE_STATIC_LIBS "Link examples with static Boost libraries" OFF)
+  option(BUILD_SHARED_LIBS "Build shared libraries" ON)
+
+  include(GNUInstallDirs)
+
+  set(CMAKE_POSITION_INDEPENDENT_CODE ${BUILD_SHARED_LIBS})
+
+  set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/${CMAKE_SYSTEM_NAME}-${CMAKE_SYSTEM_PROCESSOR}")
+  set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/${CMAKE_SYSTEM_NAME}-${CMAKE_SYSTEM_PROCESSOR}")
+endif()
+
+# Enable -fPIE / -pie for executables when position-independent code is requested.
+# https://cmake.org/cmake/help/v3.17/policy/CMP0083.html
+if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.14)
+  cmake_policy(SET CMP0083 NEW)
+  include(CheckPIESupported)
+  check_pie_supported()
+elseif(CMAKE_POSITION_INDEPENDENT_CODE AND CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+  # CMAKE_EXE_LINKER_FLAGS is a space-separated string, not a list: list(APPEND) would insert a ';'
+  set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -pie")
+endif()
 
 if(TARGET popsift)
   # when compiled in the repository the target is already defined
@@ -10,10 +34,18 @@ else()
   # See 5:
   #    * http://www.cmake.org/cmake/help/v3.0/command/find_package.html
   find_package(PopSift CONFIG REQUIRED)
+  message(STATUS "Found PopSift, version: ${PopSift_VERSION}")
 endif()
 
-find_package(Boost 1.53.0 REQUIRED COMPONENTS program_options system filesystem)
-find_package(DevIL REQUIRED COMPONENTS IL ILU) # yields IL_FOUND, IL_LIBRARIES, IL_INCLUDE_DIR
+find_package(DevIL COMPONENTS IL ILU) # yields IL_FOUND, IL_LIBRARIES, IL_INCLUDE_DIR
+
+if(PopSift_BOOST_USE_STATIC_LIBS)
+  set(Boost_USE_STATIC_LIBS ON)
+endif()
+find_package(Boost 1.71.0 REQUIRED COMPONENTS filesystem program_options system)
+if(WIN32)
+  add_definitions("-DBOOST_ALL_NO_LIB")
+endif(WIN32)
 
 set(PD_INCLUDE_DIRS    ${Boost_INCLUDE_DIRS})
 set(PD_LINK_LIBS       ${Boost_LIBRARIES} ${CUDA_CUDADEVRT_LIBRARY})
@@ -24,14 +56,10 @@ if(IL_FOUND OR DevIL_FOUND)
   list(APPEND PD_INCLUDE_DIRS ${IL_INCLUDE_DIR})
   list(APPEND PD_LINK_LIBS    ${IL_LIBRARIES} ${ILU_LIBRARIES})
 else()
-  message(STATUS "DevIL not found")
+  message(WARNING "DevIL not found -- Falling back to pgmread")
   set(PD_COMPILE_OPTIONS "" )
 endif()
 
-if(PopSift_USE_NVTX_PROFILING)
-  list(APPEND PD_LINK_LIBS ${CUDA_NVTX_LIBRARY})
-endif(PopSift_USE_NVTX_PROFILING)
-
 #############################################################
 # popsift-demo
 #############################################################
@@ -41,11 +69,10 @@ add_executable(popsift-demo  main.cpp pgmread.cpp pgmread.h)
 set_property(TARGET popsift-demo PROPERTY CXX_STANDARD 11)
 
 target_compile_options(popsift-demo PRIVATE ${PD_COMPILE_OPTIONS} )
-target_include_directories(popsift-demo PUBLIC ${PD_INCLUDE_DIRS})
-target_compile_definitions(popsift-demo PRIVATE ${Boost_DEFINITIONS} BOOST_ALL_DYN_LINK BOOST_ALL_NO_LIB)
+target_include_directories(popsift-demo PUBLIC ${PD_INCLUDE_DIRS}) # PopSift::popsift includes come via target_link_libraries
+target_compile_definitions(popsift-demo PRIVATE ${Boost_DEFINITIONS})
 target_link_libraries(popsift-demo PUBLIC PopSift::popsift ${PD_LINK_LIBS})
 
-set_target_properties(popsift-demo  PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}" )
 
 #############################################################
 # popsift-match
@@ -56,14 +83,12 @@ add_executable(popsift-match match.cpp pgmread.cpp pgmread.h)
 set_property(TARGET popsift-match PROPERTY CXX_STANDARD 11)
 
 target_compile_options(popsift-match PRIVATE ${PD_COMPILE_OPTIONS} )
-target_include_directories(popsift-match PUBLIC ${PD_INCLUDE_DIRS})
-target_compile_definitions(popsift-match PRIVATE ${Boost_DEFINITIONS} BOOST_ALL_DYN_LINK BOOST_ALL_NO_LIB)
+target_include_directories(popsift-match PUBLIC ${PD_INCLUDE_DIRS}) # PopSift::popsift includes come via target_link_libraries
+target_compile_definitions(popsift-match PRIVATE ${Boost_DEFINITIONS})
 target_link_libraries(popsift-match PUBLIC PopSift::popsift ${PD_LINK_LIBS})
 
-set_target_properties(popsift-match  PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}" )
-
 #############################################################
 # installation
 #############################################################
 
-install(TARGETS popsift-demo DESTINATION bin)
+install(TARGETS popsift-demo DESTINATION ${CMAKE_INSTALL_BINDIR})
diff --git a/src/application/main.cpp b/src/application/main.cpp
index 7f0e7861..bf1128ff 100755
--- a/src/application/main.cpp
+++ b/src/application/main.cpp
@@ -5,37 +5,31 @@
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
  */
-#include <iostream>
-#include <fstream>
-#include <sstream>
-#include <string>
+#include <popsift/common/device_prop.h>
+#include <popsift/features.h>
+#include <popsift/popsift.h>
+#include <popsift/sift_conf.h>
+#include <popsift/sift_config.h>
+#include <popsift/version.hpp>
+
+#include <boost/filesystem.hpp>
+#include <boost/program_options.hpp>
+
 #include <cmath>
+#include <cstdlib>
+#include <fstream>
 #include <iomanip>
-#include <stdlib.h>
-#include <stdexcept>
+#include <iostream>
 #include <list>
+#include <sstream>
+#include <stdexcept>
 #include <string>
 
-#include <boost/program_options.hpp>
-#include <boost/filesystem.hpp>
-
-#include <popsift/popsift.h>
-#include <popsift/features.h>
-#include <popsift/sift_conf.h>
-#include <popsift/common/device_prop.h>
-
 #ifdef USE_DEVIL
 #include <devil_cpp_wrapper.hpp>
 #endif
 #include "pgmread.h"
 
-#ifdef USE_NVTX
-#include <nvToolsExtCuda.h>
-#else
-#define nvtxRangePushA(a)
-#define nvtxRangePop()
-#endif
-
 using namespace std;
 
 static bool print_dev_info  = false;
@@ -135,7 +129,7 @@ static void parseargs(int argc, char** argv, popsift::Config& config, string& in
 
        if (vm.count("help")) {
            std::cout << all << '\n';
-           exit(1);
+           exit(EXIT_SUCCESS);
        }
 
         notify(vm); // Notify does processing (e.g., raise exceptions if required args are missing)
@@ -155,25 +149,26 @@ static void collectFilenames( list<string>& inputFiles, const boost::filesystem:
     std::copy( boost::filesystem::directory_iterator( inputFile ),
                boost::filesystem::directory_iterator(),
                std::back_inserter(vec) );
-    for( auto it = vec.begin(); it!=vec.end(); it++ ) {
-        if( boost::filesystem::is_regular_file( *it ) ) {
-            string s( it->c_str() );
-            inputFiles.push_back( s );
-        } else if( boost::filesystem::is_directory( *it ) ) {
-            collectFilenames( inputFiles, *it );
+    for (const auto& currPath : vec)
+    {
+        if( boost::filesystem::is_regular_file(currPath) )
+        {
+            inputFiles.push_back( currPath.string() );
+        }
+        else if( boost::filesystem::is_directory(currPath) )
+        {
+            collectFilenames( inputFiles, currPath);
         }
     }
 }
 
 SiftJob* process_image( const string& inputFile, PopSift& PopSift )
 {
-    int w;
-    int h;
     SiftJob* job;
     unsigned char* image_data;
 
 #ifdef USE_DEVIL
-    if( not pgmread_loading )
+    if( ! pgmread_loading )
     {
         if( float_mode )
         {
@@ -181,8 +176,6 @@ SiftJob* process_image( const string& inputFile, PopSift& PopSift )
             exit( -1 );
         }
 
-        nvtxRangePushA( "load and convert image - devil" );
-
         ilImage img;
         if( img.Load( inputFile.c_str() ) == false ) {
             cerr << "Could not load image " << inputFile << endl;
@@ -192,14 +185,12 @@ SiftJob* process_image( const string& inputFile, PopSift& PopSift )
             cerr << "Failed converting image " << inputFile << " to unsigned greyscale image" << endl;
             exit( -1 );
         }
-        w = img.Width();
-        h = img.Height();
+        const auto w = img.Width();
+        const auto h = img.Height();
         cout << "Loading " << w << " x " << h << " image " << inputFile << endl;
 
         image_data = img.GetData();
 
-        nvtxRangePop( ); // "load and convert image - devil"
-
         job = PopSift.enqueue( w, h, image_data );
 
         img.Clear();
@@ -207,16 +198,14 @@ SiftJob* process_image( const string& inputFile, PopSift& PopSift )
     else
 #endif
     {
-        nvtxRangePushA( "load and convert image - pgmread" );
-
+        int w{};
+        int h{};
         image_data = readPGMfile( inputFile, w, h );
-        if( image_data == 0 ) {
-            exit( -1 );
+        if( image_data == nullptr ) {
+            exit( EXIT_FAILURE );
         }
 
-        nvtxRangePop( ); // "load and convert image - pgmread"
-
-        if( not float_mode )
+        if( ! float_mode )
         {
             // PopSift.init( w, h );
             job = PopSift.enqueue( w, h, image_data );
@@ -225,7 +214,7 @@ SiftJob* process_image( const string& inputFile, PopSift& PopSift )
         }
         else
         {
-            float* f_image_data = new float [w * h];
+            auto f_image_data = new float [w * h];
             for( int i=0; i<w*h; i++ )
             {
                 f_image_data[i] = float( image_data[i] ) / 256.0f;
@@ -248,26 +237,21 @@ void read_job( SiftJob* job, bool really_write )
          << endl;
 
     if( really_write ) {
-        nvtxRangePushA( "Writing features to disk" );
-
         std::ofstream of( "output-features.txt" );
         feature_list->print( of, write_as_uchar );
     }
     delete feature_list;
-
-    if( really_write ) {
-        nvtxRangePop( ); // Writing features to disk
-    }
 }
 
 int main(int argc, char **argv)
 {
-    cudaDeviceReset();
+    popsift::cuda::reset();
 
     popsift::Config config;
     list<string>   inputFiles;
-    string         inputFile = "";
-    const char*    appName   = argv[0];
+    string         inputFile{};
+
+    std::cout << "PopSift version: " << POPSIFT_VERSION_STRING << std::endl;
 
     try {
         parseargs( argc, argv, config, inputFile ); // Parse command line
@@ -275,7 +259,7 @@ int main(int argc, char **argv)
     }
     catch (std::exception& e) {
         std::cout << e.what() << std::endl;
-        exit(1);
+        return EXIT_FAILURE;
     }
 
     if( boost::filesystem::exists( inputFile ) ) {
@@ -284,13 +268,13 @@ int main(int argc, char **argv)
             collectFilenames( inputFiles, inputFile );
             if( inputFiles.empty() ) {
                 cerr << "No files in directory, nothing to do" << endl;
-                exit( 0 );
+                return EXIT_SUCCESS;
             }
         } else if( boost::filesystem::is_regular_file( inputFile ) ) {
             inputFiles.push_back( inputFile );
         } else {
             cout << "Input file is neither regular file nor directory, nothing to do" << endl;
-            exit( -1 );
+            return EXIT_FAILURE;
         }
     }
 
@@ -303,10 +287,9 @@ int main(int argc, char **argv)
                      float_mode ? PopSift::FloatImages : PopSift::ByteImages );
 
     std::queue<SiftJob*> jobs;
-    for( auto it = inputFiles.begin(); it!=inputFiles.end(); it++ ) {
-        inputFile = it->c_str();
-
-        SiftJob* job = process_image( inputFile, PopSift );
+    for(const auto& currFile : inputFiles)
+    {
+        SiftJob* job = process_image( currFile, PopSift );
         jobs.push( job );
     }
 
@@ -315,11 +298,13 @@ int main(int argc, char **argv)
         SiftJob* job = jobs.front();
         jobs.pop();
         if( job ) {
-            read_job( job, not dont_write );
+            read_job( job, ! dont_write );
             delete job;
         }
     }
 
     PopSift.uninit( );
+
+    return EXIT_SUCCESS;
 }
 
diff --git a/src/application/match.cpp b/src/application/match.cpp
index 9849b92a..3460975d 100755
--- a/src/application/match.cpp
+++ b/src/application/match.cpp
@@ -5,44 +5,38 @@
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
  */
-#include <iostream>
-#include <fstream>
-#include <sstream>
-#include <string>
+#include <popsift/common/device_prop.h>
+#include <popsift/features.h>
+#include <popsift/popsift.h>
+#include <popsift/sift_conf.h>
+#include <popsift/sift_config.h>
+#include <popsift/version.hpp>
+
+#include <boost/filesystem.hpp>
+#include <boost/program_options.hpp>
+
 #include <cmath>
+#include <cstdlib>
+#include <fstream>
 #include <iomanip>
-#include <stdlib.h>
-#include <stdexcept>
+#include <iostream>
 #include <list>
+#include <sstream>
+#include <stdexcept>
 #include <string>
 
-#include <boost/program_options.hpp>
-#include <boost/filesystem.hpp>
-
-#include <popsift/popsift.h>
-#include <popsift/features.h>
-#include <popsift/sift_conf.h>
-#include <popsift/common/device_prop.h>
-
 #ifdef USE_DEVIL
 #include <devil_cpp_wrapper.hpp>
 #endif
 #include "pgmread.h"
 
-#ifdef USE_NVTX
-#include <nvToolsExtCuda.h>
-#else
-#define nvtxRangePushA(a)
-#define nvtxRangePop()
-#endif
-
 using namespace std;
 
-static bool print_dev_info  = false;
-static bool print_time_info = false;
-static bool write_as_uchar  = false;
-static bool dont_write      = false;
-static bool pgmread_loading = false;
+static bool print_dev_info  {false};
+static bool print_time_info {false};
+static bool write_as_uchar  {false};
+static bool dont_write      {false};
+static bool pgmread_loading {false};
 
 static void parseargs(int argc, char** argv, popsift::Config& config, string& lFile, string& rFile) {
     using namespace boost::program_options;
@@ -151,26 +145,27 @@ static void collectFilenames( list<string>& inputFiles, const boost::filesystem:
     std::copy( boost::filesystem::directory_iterator( inputFile ),
                boost::filesystem::directory_iterator(),
                std::back_inserter(vec) );
-    for( auto it = vec.begin(); it!=vec.end(); it++ ) {
-        if( boost::filesystem::is_regular_file( *it ) ) {
-            string s( it->c_str() );
-            inputFiles.push_back( s );
-        } else if( boost::filesystem::is_directory( *it ) ) {
-            collectFilenames( inputFiles, *it );
+    for (const auto& currPath : vec)
+    {
+        if( boost::filesystem::is_regular_file(currPath) )
+        {
+            inputFiles.push_back( currPath.string() );
+
+        }
+        else if( boost::filesystem::is_directory(currPath) )
+        {
+            collectFilenames( inputFiles, currPath);
         }
     }
 }
 
 SiftJob* process_image( const string& inputFile, PopSift& PopSift )
 {
-    int w;
-    int h;
     unsigned char* image_data;
     SiftJob* job;
 
-    nvtxRangePushA( "load and convert image" );
 #ifdef USE_DEVIL
-    if( not pgmread_loading )
+    if( ! pgmread_loading )
     {
         ilImage img;
         if( img.Load( inputFile.c_str() ) == false ) {
@@ -181,13 +176,11 @@ SiftJob* process_image( const string& inputFile, PopSift& PopSift )
             cerr << "Failed converting image " << inputFile << " to unsigned greyscale image" << endl;
             exit( -1 );
         }
-        w = img.Width();
-        h = img.Height();
+        const auto w = img.Width();
+        const auto h = img.Height();
         cout << "Loading " << w << " x " << h << " image " << inputFile << endl;
         image_data = img.GetData();
 
-        nvtxRangePop( );
-
         // PopSift.init( w, h );
         job = PopSift.enqueue( w, h, image_data );
 
@@ -196,13 +189,13 @@ SiftJob* process_image( const string& inputFile, PopSift& PopSift )
     else
 #endif
     {
+        int h{};
+        int w{};
         image_data = readPGMfile( inputFile, w, h );
-        if( image_data == 0 ) {
-            exit( -1 );
+        if( image_data == nullptr ) {
+            exit( EXIT_FAILURE );
         }
 
-        nvtxRangePop( );
-
         // PopSift.init( w, h );
         job = PopSift.enqueue( w, h, image_data );
 
@@ -214,12 +207,13 @@ SiftJob* process_image( const string& inputFile, PopSift& PopSift )
 
 int main(int argc, char **argv)
 {
-    cudaDeviceReset();
+    popsift::cuda::reset();
 
     popsift::Config config;
-    string         lFile = "";
-    string         rFile = "";
-    const char*    appName   = argv[0];
+    string         lFile{};
+    string         rFile{};
+
+    std::cout << "PopSift version: " << POPSIFT_VERSION_STRING << std::endl;
 
     try {
         parseargs( argc, argv, config, lFile, rFile ); // Parse command line
@@ -227,20 +221,20 @@ int main(int argc, char **argv)
     }
     catch (std::exception& e) {
         std::cout << e.what() << std::endl;
-        exit(1);
+        return EXIT_FAILURE; // an invalid command line is an error, not a success
     }
 
     if( boost::filesystem::exists( lFile ) ) {
-        if( not boost::filesystem::is_regular_file( lFile ) ) {
+        if( ! boost::filesystem::is_regular_file( lFile ) ) {
             cout << "Input file " << lFile << " is not a regular file, nothing to do" << endl;
-            exit( -1 );
+            return EXIT_FAILURE;
         }
     }
 
     if( boost::filesystem::exists( rFile ) ) {
-        if( not boost::filesystem::is_regular_file( rFile ) ) {
+        if( ! boost::filesystem::is_regular_file( rFile ) ) {
             cout << "Input file " << rFile << " is not a regular file, nothing to do" << endl;
-            exit( -1 );
+            return EXIT_FAILURE;
         }
     }
 
@@ -267,5 +261,7 @@ int main(int argc, char **argv)
     delete rFeatures;
 
     PopSift.uninit( );
+
+    return EXIT_SUCCESS;
 }
 
diff --git a/src/application/pgmread.cpp b/src/application/pgmread.cpp
index 50ae310f..91a812f9 100644
--- a/src/application/pgmread.cpp
+++ b/src/application/pgmread.cpp
@@ -5,14 +5,14 @@
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
  */
-#include <stdlib.h>
-#include <iso646.h>
-#include <iostream>
-#include <fstream>
-#include <boost/filesystem.hpp>
+#include "pgmread.h"
+
 #include <boost/algorithm/string/trim.hpp>
+#include <boost/filesystem.hpp>
 
-#include "pgmread.h"
+#include <cstdlib>
+#include <fstream>
+#include <iostream>
 
 #define RGB2GRAY_IN_INT
 
@@ -39,15 +39,15 @@ unsigned char* readPGMfile( const string& filename, int& w, int& h )
 {
     boost::filesystem::path input_file( filename );
 
-    if( not boost::filesystem::exists( input_file ) ) {
+    if( ! boost::filesystem::exists( input_file ) ) {
         cerr << "File " << input_file << " does not exist" << endl;
-        return 0;
+        return nullptr;
     }
 
     ifstream pgmfile( filename.c_str(), ios::binary );
-    if( not pgmfile.is_open() ) {
+    if( ! pgmfile.is_open() ) {
         cerr << "File " << input_file << " could not be opened for reading" << endl;
-        return 0;
+        return nullptr;
     }
 
     string pgmtype;
@@ -55,7 +55,7 @@ unsigned char* readPGMfile( const string& filename, int& w, int& h )
         getline( pgmfile, pgmtype ); // this is the string version of getline()
         if( pgmfile.fail() ) {
             cerr << "File " << input_file << " is too short" << endl;
-            return 0;
+            return nullptr;
         }
         boost::algorithm::trim_left( pgmtype ); // nice because of trim
     } while( pgmtype.at(0) == '#' );
@@ -67,19 +67,20 @@ unsigned char* readPGMfile( const string& filename, int& w, int& h )
     else if( pgmtype.substr(0,2) == "P6" ) type = 6;
     else {
         cerr << "File " << input_file << " can only contain P2, P3, P5 or P6 PGM images" << endl;
-        return 0;
+        return nullptr;
     }
 
-    char  line[1000];
-    char* parse;
-    int   maxval;
+    const int maxLineSize{1000};
+    char  line[maxLineSize];
+    char* parse{nullptr};
+    int   maxval{};
 
     do {
-        pgmfile.getline( line, 1000 );
+        pgmfile.getline( line, maxLineSize );
 
         if( pgmfile.fail() ) {
             cerr << "File " << input_file << " is too short" << endl;
-            return 0;
+            return nullptr;
         }
         int num = pgmfile.gcount();
         parse = line;
@@ -87,25 +88,25 @@ unsigned char* readPGMfile( const string& filename, int& w, int& h )
             parse++;
         }
         if( *parse == '#' ) continue;
-        int ct = sscanf( parse, "%d %d", &w, &h );
+        const int ct = sscanf( parse, "%d %d", &w, &h );
         if( ct != 2 ) {
             cerr << "Error in " << __FILE__ << ":" << __LINE__ << endl
                  << "File " << input_file << " PGM type header (" << type << ") must be followed by comments and WxH info" << endl
                  << "but line contains " << parse << endl;
-            return 0;
+            return nullptr;
         }
     } while( *parse == '#' );
 
     if( w <= 0 || h <= 0 ) {
         cerr << "File " << input_file << " has meaningless image size" << endl;
-        return 0;
+        return nullptr;
     }
 
     do {
-        pgmfile.getline( line, 1000 );
+        pgmfile.getline( line, maxLineSize );
         if( pgmfile.fail() ) {
             cerr << "File " << input_file << " is too short" << endl;
-            return 0;
+            return nullptr;
         }
         int num = pgmfile.gcount();
         parse = line;
@@ -113,14 +114,14 @@ unsigned char* readPGMfile( const string& filename, int& w, int& h )
             parse++;
         }
         if( *parse == '#' ) continue;
-        int ct = sscanf( parse, "%d", &maxval );
+        const int ct = sscanf( parse, "%d", &maxval );
         if( ct != 1 ) {
             cerr << "File " << input_file << " PGM dimensions must be followed by comments and max value info" << endl;
-            return 0;
+            return nullptr;
         }
     } while( *parse == '#' );
 
-    unsigned char* input_data = new unsigned char[ w * h ];
+    auto input_data = new unsigned char[ w * h ];
 
     switch( type )
     {
@@ -136,13 +137,13 @@ unsigned char* readPGMfile( const string& filename, int& w, int& h )
             if( pgmfile.fail() ) {
                 cerr << "File " << input_file << " file too short" << endl;
                 delete [] input_data;
-                return 0;
+                return nullptr;
             }
         }
         break;
     case 3 :
         {
-            unsigned char* i2 = new unsigned char[ w * h * 3 ];
+            auto i2 = new unsigned char[ w * h * 3 ];
             unsigned char* src = i2;
             for( int i=0; i<w*h*3; i++ ) {
                 int input;
@@ -156,20 +157,20 @@ unsigned char* readPGMfile( const string& filename, int& w, int& h )
                     cerr << "File " << input_file << " file too short" << endl;
                     delete [] i2;
                     delete [] input_data;
-                    return 0;
+                    return nullptr;
                 }
             }
             for( int i=0; i<w*h; i++ ) {
 #ifdef RGB2GRAY_IN_INT
-                unsigned int r = *src; src++;
-                unsigned int g = *src; src++;
-                unsigned int b = *src; src++;
-                unsigned int res = ( ( R_RATE*r+G_RATE*g+B_RATE*b ) >> RATE_SHIFT );
+                const unsigned int r = *src; src++;
+                const unsigned int g = *src; src++;
+                const unsigned int b = *src; src++;
+                const unsigned int res = ( ( R_RATE*r+G_RATE*g+B_RATE*b ) >> RATE_SHIFT );
                 input_data[i] = (unsigned char)res;
 #else // RGB2GRAY_IN_INT
-                float r = *src; src++;
-                float g = *src; src++;
-                float b = *src; src++;
+                const float r = *src; src++;
+                const float g = *src; src++;
+                const float b = *src; src++;
                 input_data[i] = (unsigned char)( R_RATE*r+G_RATE*g+B_RATE*b );
 #endif // RGB2GRAY_IN_INT
             }
@@ -180,13 +181,13 @@ unsigned char* readPGMfile( const string& filename, int& w, int& h )
         if( maxval < 256 ) {
             pgmfile.read( (char*)input_data, w*h );
         } else {
-            unsigned short* i2 = new unsigned short[ w * h ];
+            auto i2 = new unsigned short[ w * h ];
             pgmfile.read( (char*)i2, w*h*2 );
             if( pgmfile.fail() ) {
                 cerr << "File " << input_file << " file too short" << endl;
                 delete [] i2;
                 delete [] input_data;
-                return 0;
+                return nullptr;
             }
             for( int i=0; i<w*h; i++ ) {
                 input_data[i] = (unsigned char)(i2[i] * 255.0 / maxval );
@@ -196,14 +197,14 @@ unsigned char* readPGMfile( const string& filename, int& w, int& h )
         break;
     case 6 :
         if( maxval < 256 ) {
-            unsigned char* i2 = new unsigned char[ w * h * 3 ];
+            auto i2 = new unsigned char[ w * h * 3 ];
             unsigned char* src = i2;
             pgmfile.read( (char*)i2, w*h*3 );
             if( pgmfile.fail() ) {
                 cerr << "File " << input_file << " file too short" << endl;
                 delete [] i2;
                 delete [] input_data;
-                return 0;
+                return nullptr;
             }
             for( int i=0; i<w*h; i++ ) {
 #ifdef RGB2GRAY_IN_INT
@@ -221,7 +222,7 @@ unsigned char* readPGMfile( const string& filename, int& w, int& h )
             }
             delete [] i2;
         } else {
-            unsigned short* i2 = new unsigned short[ w * h * 2 * 3 ];
+            auto i2 = new unsigned short[ w * h * 2 * 3 ];
             unsigned short* src = i2;
             pgmfile.read( (char*)i2, w*h*2*3 );
             if( pgmfile.fail() ) {
@@ -247,6 +248,9 @@ unsigned char* readPGMfile( const string& filename, int& w, int& h )
             delete [] i2;
         }
         break;
+    default: // defensive: type is validated above, so this should be unreachable
+        cerr << "File " << input_file << " has unsupported PGM type " << type << endl;
+        delete [] input_data; return nullptr; // free the buffer and fail like the other error paths
     }
 
     return input_data;
diff --git a/src/cmake/FindCUDA.cmake b/src/cmake/FindCUDA.cmake
deleted file mode 100755
index e9a2505f..00000000
--- a/src/cmake/FindCUDA.cmake
+++ /dev/null
@@ -1,1796 +0,0 @@
-#.rst:
-# FindCUDA
-# --------
-#
-# Tools for building CUDA C files: libraries and build dependencies.
-#
-# This script locates the NVIDIA CUDA C tools.  It should work on linux,
-# windows, and mac and should be reasonably up to date with CUDA C
-# releases.
-#
-# This script makes use of the standard find_package arguments of
-# <VERSION>, REQUIRED and QUIET.  CUDA_FOUND will report if an
-# acceptable version of CUDA was found.
-#
-# The script will prompt the user to specify CUDA_TOOLKIT_ROOT_DIR if
-# the prefix cannot be determined by the location of nvcc in the system
-# path and REQUIRED is specified to find_package().  To use a different
-# installed version of the toolkit set the environment variable
-# CUDA_BIN_PATH before running cmake (e.g.
-# CUDA_BIN_PATH=/usr/local/cuda1.0 instead of the default
-# /usr/local/cuda) or set CUDA_TOOLKIT_ROOT_DIR after configuring.  If
-# you change the value of CUDA_TOOLKIT_ROOT_DIR, various components that
-# depend on the path will be relocated.
-#
-# It might be necessary to set CUDA_TOOLKIT_ROOT_DIR manually on certain
-# platforms, or to use a cuda runtime not installed in the default
-# location.  In newer versions of the toolkit the cuda library is
-# included with the graphics driver- be sure that the driver version
-# matches what is needed by the cuda runtime version.
-#
-# The following variables affect the behavior of the macros in the
-# script (in alphebetical order).  Note that any of these flags can be
-# changed multiple times in the same directory before calling
-# CUDA_ADD_EXECUTABLE, CUDA_ADD_LIBRARY, CUDA_COMPILE, CUDA_COMPILE_PTX
-# or CUDA_WRAP_SRCS.
-#
-# ::
-#
-#   CUDA_64_BIT_DEVICE_CODE (Default matches host bit size)
-#   -- Set to ON to compile for 64 bit device code, OFF for 32 bit device code.
-#      Note that making this different from the host code when generating object
-#      or C files from CUDA code just won't work, because size_t gets defined by
-#      nvcc in the generated source.  If you compile to PTX and then load the
-#      file yourself, you can mix bit sizes between device and host.
-#
-#
-#
-# ::
-#
-#   CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE (Default ON)
-#   -- Set to ON if you want the custom build rule to be attached to the source
-#      file in Visual Studio.  Turn OFF if you add the same cuda file to multiple
-#      targets.
-#
-#
-#
-# ::
-#
-#      This allows the user to build the target from the CUDA file; however, bad
-#      things can happen if the CUDA source file is added to multiple targets.
-#      When performing parallel builds it is possible for the custom build
-#      command to be run more than once and in parallel causing cryptic build
-#      errors.  VS runs the rules for every source file in the target, and a
-#      source can have only one rule no matter how many projects it is added to.
-#      When the rule is run from multiple targets race conditions can occur on
-#      the generated file.  Eventually everything will get built, but if the user
-#      is unaware of this behavior, there may be confusion.  It would be nice if
-#      this script could detect the reuse of source files across multiple targets
-#      and turn the option off for the user, but no good solution could be found.
-#
-#
-#
-# ::
-#
-#   CUDA_BUILD_CUBIN (Default OFF)
-#   -- Set to ON to enable and extra compilation pass with the -cubin option in
-#      Device mode. The output is parsed and register, shared memory usage is
-#      printed during build.
-#
-#
-#
-# ::
-#
-#   CUDA_BUILD_EMULATION (Default OFF for device mode)
-#   -- Set to ON for Emulation mode. -D_DEVICEEMU is defined for CUDA C files
-#      when CUDA_BUILD_EMULATION is TRUE.
-#
-#
-#
-# ::
-#
-#   CUDA_GENERATED_OUTPUT_DIR (Default CMAKE_CURRENT_BINARY_DIR)
-#   -- Set to the path you wish to have the generated files placed.  If it is
-#      blank output files will be placed in CMAKE_CURRENT_BINARY_DIR.
-#      Intermediate files will always be placed in
-#      CMAKE_CURRENT_BINARY_DIR/CMakeFiles.
-#
-#
-#
-# ::
-#
-#   CUDA_HOST_COMPILATION_CPP (Default ON)
-#   -- Set to OFF for C compilation of host code.
-#
-#
-#
-# ::
-#
-#   CUDA_HOST_COMPILER (Default CMAKE_C_COMPILER, $(VCInstallDir)/bin for VS)
-#   -- Set the host compiler to be used by nvcc.  Ignored if -ccbin or
-#      --compiler-bindir is already present in the CUDA_NVCC_FLAGS or
-#      CUDA_NVCC_FLAGS_<CONFIG> variables.  For Visual Studio targets
-#      $(VCInstallDir)/bin is a special value that expands out to the path when
-#      the command is run from withing VS.
-#
-#
-#
-# ::
-#
-#   CUDA_NVCC_FLAGS
-#   CUDA_NVCC_FLAGS_<CONFIG>
-#   -- Additional NVCC command line arguments.  NOTE: multiple arguments must be
-#      semi-colon delimited (e.g. --compiler-options;-Wall)
-#
-#
-#
-# ::
-#
-#   CUDA_PROPAGATE_HOST_FLAGS (Default ON)
-#   -- Set to ON to propagate CMAKE_{C,CXX}_FLAGS and their configuration
-#      dependent counterparts (e.g. CMAKE_C_FLAGS_DEBUG) automatically to the
-#      host compiler through nvcc's -Xcompiler flag.  This helps make the
-#      generated host code match the rest of the system better.  Sometimes
-#      certain flags give nvcc problems, and this will help you turn the flag
-#      propagation off.  This does not affect the flags supplied directly to nvcc
-#      via CUDA_NVCC_FLAGS or through the OPTION flags specified through
-#      CUDA_ADD_LIBRARY, CUDA_ADD_EXECUTABLE, or CUDA_WRAP_SRCS.  Flags used for
-#      shared library compilation are not affected by this flag.
-#
-#
-#
-# ::
-#
-#   CUDA_SEPARABLE_COMPILATION (Default OFF)
-#   -- If set this will enable separable compilation for all CUDA runtime object
-#      files.  If used outside of CUDA_ADD_EXECUTABLE and CUDA_ADD_LIBRARY
-#      (e.g. calling CUDA_WRAP_SRCS directly),
-#      CUDA_COMPUTE_SEPARABLE_COMPILATION_OBJECT_FILE_NAME and
-#      CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS should be called.
-#
-#
-#
-# ::
-#
-#   CUDA_VERBOSE_BUILD (Default OFF)
-#   -- Set to ON to see all the commands used when building the CUDA file.  When
-#      using a Makefile generator the value defaults to VERBOSE (run make
-#      VERBOSE=1 to see output), although setting CUDA_VERBOSE_BUILD to ON will
-#      always print the output.
-#
-#
-#
-# The script creates the following macros (in alphebetical order):
-#
-# ::
-#
-#   CUDA_ADD_CUFFT_TO_TARGET( cuda_target )
-#   -- Adds the cufft library to the target (can be any target).  Handles whether
-#      you are in emulation mode or not.
-#
-#
-#
-# ::
-#
-#   CUDA_ADD_CUBLAS_TO_TARGET( cuda_target )
-#   -- Adds the cublas library to the target (can be any target).  Handles
-#      whether you are in emulation mode or not.
-#
-#
-#
-# ::
-#
-#   CUDA_ADD_EXECUTABLE( cuda_target file0 file1 ...
-#                        [WIN32] [MACOSX_BUNDLE] [EXCLUDE_FROM_ALL] [OPTIONS ...] )
-#   -- Creates an executable "cuda_target" which is made up of the files
-#      specified.  All of the non CUDA C files are compiled using the standard
-#      build rules specified by CMAKE and the cuda files are compiled to object
-#      files using nvcc and the host compiler.  In addition CUDA_INCLUDE_DIRS is
-#      added automatically to include_directories().  Some standard CMake target
-#      calls can be used on the target after calling this macro
-#      (e.g. set_target_properties and target_link_libraries), but setting
-#      properties that adjust compilation flags will not affect code compiled by
-#      nvcc.  Such flags should be modified before calling CUDA_ADD_EXECUTABLE,
-#      CUDA_ADD_LIBRARY or CUDA_WRAP_SRCS.
-#
-#
-#
-# ::
-#
-#   CUDA_ADD_LIBRARY( cuda_target file0 file1 ...
-#                     [STATIC | SHARED | MODULE] [EXCLUDE_FROM_ALL] [OPTIONS ...] )
-#   -- Same as CUDA_ADD_EXECUTABLE except that a library is created.
-#
-#
-#
-# ::
-#
-#   CUDA_BUILD_CLEAN_TARGET()
-#   -- Creates a convience target that deletes all the dependency files
-#      generated.  You should make clean after running this target to ensure the
-#      dependency files get regenerated.
-#
-#
-#
-# ::
-#
-#   CUDA_COMPILE( generated_files file0 file1 ... [STATIC | SHARED | MODULE]
-#                 [OPTIONS ...] )
-#   -- Returns a list of generated files from the input source files to be used
-#      with ADD_LIBRARY or ADD_EXECUTABLE.
-#
-#
-#
-# ::
-#
-#   CUDA_COMPILE_PTX( generated_files file0 file1 ... [OPTIONS ...] )
-#   -- Returns a list of PTX files generated from the input source files.
-#
-#
-#
-# ::
-#
-#   CUDA_COMPUTE_SEPARABLE_COMPILATION_OBJECT_FILE_NAME( output_file_var
-#                                                        cuda_target
-#                                                        object_files )
-#   -- Compute the name of the intermediate link file used for separable
-#      compilation.  This file name is typically passed into
-#      CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS.  output_file_var is produced
-#      based on cuda_target the list of objects files that need separable
-#      compilation as specified by object_files.  If the object_files list is
-#      empty, then output_file_var will be empty.  This function is called
-#      automatically for CUDA_ADD_LIBRARY and CUDA_ADD_EXECUTABLE.  Note that
-#      this is a function and not a macro.
-#
-#
-#
-# ::
-#
-#   CUDA_INCLUDE_DIRECTORIES( path0 path1 ... )
-#   -- Sets the directories that should be passed to nvcc
-#      (e.g. nvcc -Ipath0 -Ipath1 ... ). These paths usually contain other .cu
-#      files.
-#
-#
-#
-#
-#
-# ::
-#
-#   CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS( output_file_var cuda_target
-#                                            nvcc_flags object_files)
-#
-#
-#
-# ::
-#
-#   -- Generates the link object required by separable compilation from the given
-#      object files.  This is called automatically for CUDA_ADD_EXECUTABLE and
-#      CUDA_ADD_LIBRARY, but can be called manually when using CUDA_WRAP_SRCS
-#      directly.  When called from CUDA_ADD_LIBRARY or CUDA_ADD_EXECUTABLE the
-#      nvcc_flags passed in are the same as the flags passed in via the OPTIONS
-#      argument.  The only nvcc flag added automatically is the bitness flag as
-#      specified by CUDA_64_BIT_DEVICE_CODE.  Note that this is a function
-#      instead of a macro.
-#
-#
-#
-# ::
-#
-#   CUDA_WRAP_SRCS ( cuda_target format generated_files file0 file1 ...
-#                    [STATIC | SHARED | MODULE] [OPTIONS ...] )
-#   -- This is where all the magic happens.  CUDA_ADD_EXECUTABLE,
-#      CUDA_ADD_LIBRARY, CUDA_COMPILE, and CUDA_COMPILE_PTX all call this
-#      function under the hood.
-#
-#
-#
-# ::
-#
-#      Given the list of files (file0 file1 ... fileN) this macro generates
-#      custom commands that generate either PTX or linkable objects (use "PTX" or
-#      "OBJ" for the format argument to switch).  Files that don't end with .cu
-#      or have the HEADER_FILE_ONLY property are ignored.
-#
-#
-#
-# ::
-#
-#      The arguments passed in after OPTIONS are extra command line options to
-#      give to nvcc.  You can also specify per configuration options by
-#      specifying the name of the configuration followed by the options.  General
-#      options must preceed configuration specific options.  Not all
-#      configurations need to be specified, only the ones provided will be used.
-#
-#
-#
-# ::
-#
-#         OPTIONS -DFLAG=2 "-DFLAG_OTHER=space in flag"
-#         DEBUG -g
-#         RELEASE --use_fast_math
-#         RELWITHDEBINFO --use_fast_math;-g
-#         MINSIZEREL --use_fast_math
-#
-#
-#
-# ::
-#
-#      For certain configurations (namely VS generating object files with
-#      CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE set to ON), no generated file will
-#      be produced for the given cuda file.  This is because when you add the
-#      cuda file to Visual Studio it knows that this file produces an object file
-#      and will link in the resulting object file automatically.
-#
-#
-#
-# ::
-#
-#      This script will also generate a separate cmake script that is used at
-#      build time to invoke nvcc.  This is for several reasons.
-#
-#
-#
-# ::
-#
-#        1. nvcc can return negative numbers as return values which confuses
-#        Visual Studio into thinking that the command succeeded.  The script now
-#        checks the error codes and produces errors when there was a problem.
-#
-#
-#
-# ::
-#
-#        2. nvcc has been known to not delete incomplete results when it
-#        encounters problems.  This confuses build systems into thinking the
-#        target was generated when in fact an unusable file exists.  The script
-#        now deletes the output files if there was an error.
-#
-#
-#
-# ::
-#
-#        3. By putting all the options that affect the build into a file and then
-#        make the build rule dependent on the file, the output files will be
-#        regenerated when the options change.
-#
-#
-#
-# ::
-#
-#      This script also looks at optional arguments STATIC, SHARED, or MODULE to
-#      determine when to target the object compilation for a shared library.
-#      BUILD_SHARED_LIBS is ignored in CUDA_WRAP_SRCS, but it is respected in
-#      CUDA_ADD_LIBRARY.  On some systems special flags are added for building
-#      objects intended for shared libraries.  A preprocessor macro,
-#      <target_name>_EXPORTS is defined when a shared library compilation is
-#      detected.
-#
-#
-#
-# ::
-#
-#      Flags passed into add_definitions with -D or /D are passed along to nvcc.
-#
-#
-#
-# The script defines the following variables:
-#
-# ::
-#
-#   CUDA_VERSION_MAJOR    -- The major version of cuda as reported by nvcc.
-#   CUDA_VERSION_MINOR    -- The minor version.
-#   CUDA_VERSION
-#   CUDA_VERSION_STRING   -- CUDA_VERSION_MAJOR.CUDA_VERSION_MINOR
-#
-#
-#
-# ::
-#
-#   CUDA_TOOLKIT_ROOT_DIR -- Path to the CUDA Toolkit (defined if not set).
-#   CUDA_SDK_ROOT_DIR     -- Path to the CUDA SDK.  Use this to find files in the
-#                            SDK.  This script will not directly support finding
-#                            specific libraries or headers, as that isn't
-#                            supported by NVIDIA.  If you want to change
-#                            libraries when the path changes see the
-#                            FindCUDA.cmake script for an example of how to clear
-#                            these variables.  There are also examples of how to
-#                            use the CUDA_SDK_ROOT_DIR to locate headers or
-#                            libraries, if you so choose (at your own risk).
-#   CUDA_INCLUDE_DIRS     -- Include directory for cuda headers.  Added automatically
-#                            for CUDA_ADD_EXECUTABLE and CUDA_ADD_LIBRARY.
-#   CUDA_LIBRARIES        -- Cuda RT library.
-#   CUDA_CUFFT_LIBRARIES  -- Device or emulation library for the Cuda FFT
-#                            implementation (alternative to:
-#                            CUDA_ADD_CUFFT_TO_TARGET macro)
-#   CUDA_CUBLAS_LIBRARIES -- Device or emulation library for the Cuda BLAS
-#                            implementation (alterative to:
-#                            CUDA_ADD_CUBLAS_TO_TARGET macro).
-#   CUDA_cupti_LIBRARY    -- CUDA Profiling Tools Interface library.
-#                            Only available for CUDA version 4.0+.
-#   CUDA_curand_LIBRARY   -- CUDA Random Number Generation library.
-#                            Only available for CUDA version 3.2+.
-#   CUDA_cusparse_LIBRARY -- CUDA Sparse Matrix library.
-#                            Only available for CUDA version 3.2+.
-#   CUDA_npp_LIBRARY      -- NVIDIA Performance Primitives library.
-#                            Only available for CUDA version 4.0+.
-#   CUDA_nppc_LIBRARY      -- NVIDIA Performance Primitives library (core).
-#                            Only available for CUDA version 5.5+.
-#   CUDA_nppi_LIBRARY      -- NVIDIA Performance Primitives library (image processing).
-#                            Only available for CUDA version 5.5+.
-#   CUDA_npps_LIBRARY      -- NVIDIA Performance Primitives library (signal processing).
-#                            Only available for CUDA version 5.5+.
-#   CUDA_nvcuvenc_LIBRARY -- CUDA Video Encoder library.
-#                            Only available for CUDA version 3.2+.
-#                            Windows only.
-#   CUDA_nvcuvid_LIBRARY  -- CUDA Video Decoder library.
-#                            Only available for CUDA version 3.2+.
-#                            Windows only.
-#
-#
-#
-#
-#
-# ::
-#
-#   James Bigler, NVIDIA Corp (nvidia.com - jbigler)
-#   Abe Stephens, SCI Institute -- http://www.sci.utah.edu/~abe/FindCuda.html
-#
-#
-#
-# ::
-#
-#   Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
-#
-#
-#
-# ::
-#
-#   Copyright (c) 2007-2009
-#   Scientific Computing and Imaging Institute, University of Utah
-#
-#
-#
-# ::
-#
-#   This code is licensed under the MIT License.  See the FindCUDA.cmake script
-#   for the text of the license.
-
-# The MIT License
-#
-# License for the specific language governing rights and limitations under
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included
-# in all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-#
-###############################################################################
-
-# FindCUDA.cmake
-
-# We need to have at least this version to support the VERSION_LESS argument to 'if' (2.6.2) and unset (2.6.3)
-cmake_policy(PUSH)
-cmake_minimum_required(VERSION 2.6.3)
-cmake_policy(POP)
-
-# This macro helps us find the location of helper files we will need the full path to
-macro(CUDA_FIND_HELPER_FILE _name _extension)
-  set(_full_name "${_name}.${_extension}")
-  # CMAKE_CURRENT_LIST_FILE contains the full path to the file currently being
-  # processed.  Using this variable, we can pull out the current path, and
-  # provide a way to get access to the other files we need local to here.
-  get_filename_component(CMAKE_CURRENT_LIST_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH)
-  set(CUDA_${_name} "${CMAKE_CURRENT_LIST_DIR}/FindCUDA/${_full_name}")
-  if(NOT EXISTS "${CUDA_${_name}}")
-    set(error_message "${_full_name} not found in ${CMAKE_CURRENT_LIST_DIR}/FindCUDA")
-    if(CUDA_FIND_REQUIRED)
-      message(FATAL_ERROR "${error_message}")
-    else()
-      if(NOT CUDA_FIND_QUIETLY)
-        message(STATUS "${error_message}")
-      endif()
-    endif()
-  endif()
-  # Set this variable as internal, so the user isn't bugged with it.
-  set(CUDA_${_name} ${CUDA_${_name}} CACHE INTERNAL "Location of ${_full_name}" FORCE)
-endmacro()
-
-#####################################################################
-## CUDA_INCLUDE_NVCC_DEPENDENCIES
-##
-
-# So we want to try and include the dependency file if it exists.  If
-# it doesn't exist then we need to create an empty one, so we can
-# include it.
-
-# If it does exist, then we need to check to see if all the files it
-# depends on exist.  If they don't then we should clear the dependency
-# file and regenerate it later.  This covers the case where a header
-# file has disappeared or moved.
-
-macro(CUDA_INCLUDE_NVCC_DEPENDENCIES dependency_file)
-  set(CUDA_NVCC_DEPEND)
-  set(CUDA_NVCC_DEPEND_REGENERATE FALSE)
-
-
-  # Include the dependency file.  Create it first if it doesn't exist .  The
-  # INCLUDE puts a dependency that will force CMake to rerun and bring in the
-  # new info when it changes.  DO NOT REMOVE THIS (as I did and spent a few
-  # hours figuring out why it didn't work.
-  if(NOT EXISTS ${dependency_file})
-    file(WRITE ${dependency_file} "#FindCUDA.cmake generated file.  Do not edit.\n")
-  endif()
-  # Always include this file to force CMake to run again next
-  # invocation and rebuild the dependencies.
-  #message("including dependency_file = ${dependency_file}")
-  include(${dependency_file})
-
-  # Now we need to verify the existence of all the included files
-  # here.  If they aren't there we need to just blank this variable and
-  # make the file regenerate again.
-#   if(DEFINED CUDA_NVCC_DEPEND)
-#     message("CUDA_NVCC_DEPEND set")
-#   else()
-#     message("CUDA_NVCC_DEPEND NOT set")
-#   endif()
-  if(CUDA_NVCC_DEPEND)
-    #message("CUDA_NVCC_DEPEND found")
-    foreach(f ${CUDA_NVCC_DEPEND})
-      # message("searching for ${f}")
-      if(NOT EXISTS ${f})
-        #message("file ${f} not found")
-        set(CUDA_NVCC_DEPEND_REGENERATE TRUE)
-      endif()
-    endforeach()
-  else()
-    #message("CUDA_NVCC_DEPEND false")
-    # No dependencies, so regenerate the file.
-    set(CUDA_NVCC_DEPEND_REGENERATE TRUE)
-  endif()
-
-  #message("CUDA_NVCC_DEPEND_REGENERATE = ${CUDA_NVCC_DEPEND_REGENERATE}")
-  # No incoming dependencies, so we need to generate them.  Make the
-  # output depend on the dependency file itself, which should cause the
-  # rule to re-run.
-  if(CUDA_NVCC_DEPEND_REGENERATE)
-    set(CUDA_NVCC_DEPEND ${dependency_file})
-    #message("Generating an empty dependency_file: ${dependency_file}")
-    file(WRITE ${dependency_file} "#FindCUDA.cmake generated file.  Do not edit.\n")
-  endif()
-
-endmacro()
-
-###############################################################################
-###############################################################################
-# Setup variables' defaults
-###############################################################################
-###############################################################################
-
-# Allow the user to specify if the device code is supposed to be 32 or 64 bit.
-if(CMAKE_SIZEOF_VOID_P EQUAL 8)
-  set(CUDA_64_BIT_DEVICE_CODE_DEFAULT ON)
-else()
-  set(CUDA_64_BIT_DEVICE_CODE_DEFAULT OFF)
-endif()
-option(CUDA_64_BIT_DEVICE_CODE "Compile device code in 64 bit mode" ${CUDA_64_BIT_DEVICE_CODE_DEFAULT})
-
-# Attach the build rule to the source file in VS.  This option
-option(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE "Attach the build rule to the CUDA source file.  Enable only when the CUDA source file is added to at most one target." ON)
-
-# Prints out extra information about the cuda file during compilation
-option(CUDA_BUILD_CUBIN "Generate and parse .cubin files in Device mode." OFF)
-
-# Set whether we are using emulation or device mode.
-option(CUDA_BUILD_EMULATION "Build in Emulation mode" OFF)
-
-# Where to put the generated output.
-set(CUDA_GENERATED_OUTPUT_DIR "" CACHE PATH "Directory to put all the output files.  If blank it will default to the CMAKE_CURRENT_BINARY_DIR")
-
-# Parse HOST_COMPILATION mode.
-option(CUDA_HOST_COMPILATION_CPP "Generated file extension" ON)
-
-# Extra user settable flags
-set(CUDA_NVCC_FLAGS "" CACHE STRING "Semi-colon delimit multiple arguments.")
-
-if(CMAKE_GENERATOR MATCHES "Visual Studio")
-  set(CUDA_HOST_COMPILER "$(VCInstallDir)bin" CACHE FILEPATH "Host side compiler used by NVCC")
-else()
-  set(CUDA_HOST_COMPILER "${CMAKE_C_COMPILER}" CACHE FILEPATH "Host side compiler used by NVCC")
-endif()
-
-# Propagate the host flags to the host compiler via -Xcompiler
-option(CUDA_PROPAGATE_HOST_FLAGS "Propage C/CXX_FLAGS and friends to the host compiler via -Xcompile" ON)
-
-# Enable CUDA_SEPARABLE_COMPILATION
-option(CUDA_SEPARABLE_COMPILATION "Compile CUDA objects with separable compilation enabled.  Requires CUDA 5.0+" OFF)
-
-# Specifies whether the commands used when compiling the .cu file will be printed out.
-option(CUDA_VERBOSE_BUILD "Print out the commands run while compiling the CUDA source file.  With the Makefile generator this defaults to VERBOSE variable specified on the command line, but can be forced on with this option." OFF)
-
-mark_as_advanced(
-  CUDA_64_BIT_DEVICE_CODE
-  CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE
-  CUDA_GENERATED_OUTPUT_DIR
-  CUDA_HOST_COMPILATION_CPP
-  CUDA_NVCC_FLAGS
-  CUDA_PROPAGATE_HOST_FLAGS
-  )
-
-# Makefile and similar generators don't define CMAKE_CONFIGURATION_TYPES, so we
-# need to add another entry for the CMAKE_BUILD_TYPE.  We also need to add the
-# standerd set of 4 build types (Debug, MinSizeRel, Release, and RelWithDebInfo)
-# for completeness.  We need run this loop in order to accomodate the addition
-# of extra configuration types.  Duplicate entries will be removed by
-# REMOVE_DUPLICATES.
-set(CUDA_configuration_types ${CMAKE_CONFIGURATION_TYPES} ${CMAKE_BUILD_TYPE} Debug MinSizeRel Release RelWithDebInfo)
-list(REMOVE_DUPLICATES CUDA_configuration_types)
-foreach(config ${CUDA_configuration_types})
-    string(TOUPPER ${config} config_upper)
-    set(CUDA_NVCC_FLAGS_${config_upper} "" CACHE STRING "Semi-colon delimit multiple arguments.")
-    mark_as_advanced(CUDA_NVCC_FLAGS_${config_upper})
-endforeach()
-
-###############################################################################
-###############################################################################
-# Locate CUDA, Set Build Type, etc.
-###############################################################################
-###############################################################################
-
-macro(cuda_unset_include_and_libraries)
-  unset(CUDA_TOOLKIT_INCLUDE CACHE)
-  unset(CUDA_CUDART_LIBRARY CACHE)
-  unset(CUDA_CUDA_LIBRARY CACHE)
-  # Make sure you run this before you unset CUDA_VERSION.
-  if(CUDA_VERSION VERSION_EQUAL "3.0")
-    # This only existed in the 3.0 version of the CUDA toolkit
-    unset(CUDA_CUDARTEMU_LIBRARY CACHE)
-  endif()
-  unset(CUDA_cupti_LIBRARY CACHE)
-  unset(CUDA_cublas_LIBRARY CACHE)
-  unset(CUDA_cublasemu_LIBRARY CACHE)
-  unset(CUDA_cufft_LIBRARY CACHE)
-  unset(CUDA_cufftemu_LIBRARY CACHE)
-  unset(CUDA_curand_LIBRARY CACHE)
-  unset(CUDA_cusparse_LIBRARY CACHE)
-  unset(CUDA_npp_LIBRARY CACHE)
-  unset(CUDA_nppc_LIBRARY CACHE)
-  unset(CUDA_nppi_LIBRARY CACHE)
-  unset(CUDA_npps_LIBRARY CACHE)
-  unset(CUDA_nvcuvenc_LIBRARY CACHE)
-  unset(CUDA_nvcuvid_LIBRARY CACHE)
-endmacro()
-
-# Check to see if the CUDA_TOOLKIT_ROOT_DIR and CUDA_SDK_ROOT_DIR have changed,
-# if they have then clear the cache variables, so that will be detected again.
-if(NOT "${CUDA_TOOLKIT_ROOT_DIR}" STREQUAL "${CUDA_TOOLKIT_ROOT_DIR_INTERNAL}")
-  unset(CUDA_TOOLKIT_TARGET_DIR CACHE)
-  unset(CUDA_NVCC_EXECUTABLE CACHE)
-  unset(CUDA_VERSION CACHE)
-  cuda_unset_include_and_libraries()
-endif()
-
-if(NOT "${CUDA_TOOLKIT_TARGET_DIR}" STREQUAL "${CUDA_TOOLKIT_TARGET_DIR_INTERNAL}")
-  cuda_unset_include_and_libraries()
-endif()
-
-if(NOT "${CUDA_SDK_ROOT_DIR}" STREQUAL "${CUDA_SDK_ROOT_DIR_INTERNAL}")
-  # No specific variables to catch.  Use this kind of code before calling
-  # find_package(CUDA) to clean up any variables that may depend on this path.
-
-  #   unset(MY_SPECIAL_CUDA_SDK_INCLUDE_DIR CACHE)
-  #   unset(MY_SPECIAL_CUDA_SDK_LIBRARY CACHE)
-endif()
-
-# Search for the cuda distribution.
-if(NOT CUDA_TOOLKIT_ROOT_DIR)
-
-  # Search in the CUDA_BIN_PATH first.
-  find_path(CUDA_TOOLKIT_ROOT_DIR
-    NAMES nvcc nvcc.exe
-    PATHS
-      ENV CUDA_PATH
-      ENV CUDA_BIN_PATH
-    PATH_SUFFIXES bin bin64
-    DOC "Toolkit location."
-    NO_DEFAULT_PATH
-    )
-  # Now search default paths
-  find_path(CUDA_TOOLKIT_ROOT_DIR
-    NAMES nvcc nvcc.exe
-    PATHS /usr/local/bin
-          /usr/local/cuda/bin
-    DOC "Toolkit location."
-    )
-
-  if (CUDA_TOOLKIT_ROOT_DIR)
-    string(REGEX REPLACE "[/\\\\]?bin[64]*[/\\\\]?$" "" CUDA_TOOLKIT_ROOT_DIR ${CUDA_TOOLKIT_ROOT_DIR})
-    # We need to force this back into the cache.
-    set(CUDA_TOOLKIT_ROOT_DIR ${CUDA_TOOLKIT_ROOT_DIR} CACHE PATH "Toolkit location." FORCE)
-  endif()
-  if (NOT EXISTS ${CUDA_TOOLKIT_ROOT_DIR})
-    if(CUDA_FIND_REQUIRED)
-      message(FATAL_ERROR "Specify CUDA_TOOLKIT_ROOT_DIR")
-    elseif(NOT CUDA_FIND_QUIETLY)
-      message("CUDA_TOOLKIT_ROOT_DIR not found or specified")
-    endif()
-  endif ()
-endif ()
-
-# CUDA_NVCC_EXECUTABLE
-find_program(CUDA_NVCC_EXECUTABLE
-  NAMES nvcc
-  PATHS "${CUDA_TOOLKIT_ROOT_DIR}"
-  ENV CUDA_PATH
-  ENV CUDA_BIN_PATH
-  PATH_SUFFIXES bin bin64
-  NO_DEFAULT_PATH
-  )
-# Search default search paths, after we search our own set of paths.
-find_program(CUDA_NVCC_EXECUTABLE nvcc)
-mark_as_advanced(CUDA_NVCC_EXECUTABLE)
-
-if(CUDA_NVCC_EXECUTABLE AND NOT CUDA_VERSION)
-  # Compute the version.
-  execute_process (COMMAND ${CUDA_NVCC_EXECUTABLE} "--version" OUTPUT_VARIABLE NVCC_OUT)
-  string(REGEX REPLACE ".*release ([0-9]+)\\.([0-9]+).*" "\\1" CUDA_VERSION_MAJOR ${NVCC_OUT})
-  string(REGEX REPLACE ".*release ([0-9]+)\\.([0-9]+).*" "\\2" CUDA_VERSION_MINOR ${NVCC_OUT})
-  set(CUDA_VERSION "${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR}" CACHE STRING "Version of CUDA as computed from nvcc.")
-  mark_as_advanced(CUDA_VERSION)
-else()
-  # Need to set these based off of the cached value
-  string(REGEX REPLACE "([0-9]+)\\.([0-9]+).*" "\\1" CUDA_VERSION_MAJOR "${CUDA_VERSION}")
-  string(REGEX REPLACE "([0-9]+)\\.([0-9]+).*" "\\2" CUDA_VERSION_MINOR "${CUDA_VERSION}")
-endif()
-
-# Always set this convenience variable
-set(CUDA_VERSION_STRING "${CUDA_VERSION}")
-
-# Support for arm cross compilation with CUDA 5.5
-set(__cuda_toolkit_target_dir_initial "${CUDA_TOOLKIT_ROOT_DIR}")
-if(CUDA_VERSION VERSION_GREATER "5.0" AND CMAKE_CROSSCOMPILING AND ${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm")
-  if(ANDROID AND EXISTS "${CUDA_TOOLKIT_ROOT_DIR}/targets/armv7-linux-androideabi")
-    set(__cuda_toolkit_target_dir_initial "${CUDA_TOOLKIT_ROOT_DIR}/targets/armv7-linux-androideabi")
-  elseif(EXISTS "${CUDA_TOOLKIT_ROOT_DIR}/targets/armv7-linux-gnueabihf")
-    set(__cuda_toolkit_target_dir_initial "${CUDA_TOOLKIT_ROOT_DIR}/targets/armv7-linux-gnueabihf")
-  endif()
-endif()
-set(CUDA_TOOLKIT_TARGET_DIR "${__cuda_toolkit_target_dir_initial}" CACHE PATH "Toolkit target location.")
-mark_as_advanced(CUDA_TOOLKIT_TARGET_DIR)
-
-# Target CPU architecture
-if(CUDA_VERSION VERSION_GREATER "5.0" AND CMAKE_CROSSCOMPILING AND ${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm")
-  set(_cuda_target_cpu_arch_initial "ARM")
-else()
-  set(_cuda_target_cpu_arch_initial "")
-endif()
-set(CUDA_TARGET_CPU_ARCH ${_cuda_target_cpu_arch_initial} CACHE STRING "Specify the name of the class of CPU architecture for which the input files must be compiled.")
-mark_as_advanced(CUDA_TARGET_CPU_ARCH)
-
-# CUDA_TOOLKIT_INCLUDE
-find_path(CUDA_TOOLKIT_INCLUDE
-  device_functions.h # Header included in toolkit
-  PATHS "${CUDA_TOOLKIT_TARGET_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}"
-  ENV CUDA_PATH
-  ENV CUDA_INC_PATH
-  PATH_SUFFIXES include
-  NO_DEFAULT_PATH
-  )
-# Search default search paths, after we search our own set of paths.
-find_path(CUDA_TOOLKIT_INCLUDE device_functions.h)
-mark_as_advanced(CUDA_TOOLKIT_INCLUDE)
-
-# Set the user list of include dir to nothing to initialize it.
-set (CUDA_NVCC_INCLUDE_ARGS_USER "")
-set (CUDA_INCLUDE_DIRS ${CUDA_TOOLKIT_INCLUDE})
-
-# Locate a library and store the result in the cache variable named by _var.
-#   _var      - name of the result variable handed to find_library
-#   _names    - library name(s) to look for
-#   _doc      - documentation string for the cache entry
-#   _path_ext - prefix prepended to every library sub-directory suffix
-#               (for example "extras/CUPTI/"); may be empty
-# The toolkit directories and the CUDA_PATH / CUDA_LIB_PATH environment
-# variables are searched first (NO_DEFAULT_PATH).  The second find_library
-# call then adds the default search paths and /usr/lib/nvidia-current.
-# NOTE: this is a macro, so _cuda_64bit_lib_dir is set in the caller's scope.
-macro(cuda_find_library_local_first_with_path_ext _var _names _doc _path_ext )
-  if(CMAKE_SIZEOF_VOID_P EQUAL 8)
-    # CUDA 3.2+ on Windows moved the library directories, so we need the new
-    # and old paths.
-    set(_cuda_64bit_lib_dir "${_path_ext}lib/x64" "${_path_ext}lib64" "${_path_ext}libx64" )
-  endif()
-  # CUDA 3.2+ on Windows moved the library directories, so we need the new
-  # (lib/Win32) and the old path (lib).
-  find_library(${_var}
-    NAMES ${_names}
-    PATHS "${CUDA_TOOLKIT_TARGET_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}"
-    ENV CUDA_PATH
-    ENV CUDA_LIB_PATH
-    PATH_SUFFIXES ${_cuda_64bit_lib_dir} "${_path_ext}lib/Win32" "${_path_ext}lib" "${_path_ext}libWin32"
-    DOC ${_doc}
-    NO_DEFAULT_PATH
-    )
-  # Search default search paths, after we search our own set of paths.
-  find_library(${_var}
-    NAMES ${_names}
-    PATHS "/usr/lib/nvidia-current"
-    DOC ${_doc}
-    )
-endmacro()
-
-# Convenience wrapper around cuda_find_library_local_first_with_path_ext that
-# passes an empty path prefix.
-#   _var   - name of the result cache variable
-#   _names - library name(s) to look for
-#   _doc   - documentation string for the cache entry
-macro(cuda_find_library_local_first _var _names _doc)
-  cuda_find_library_local_first_with_path_ext( "${_var}" "${_names}" "${_doc}" "" )
-endmacro()
-
-# Unprefixed alias of cuda_find_library_local_first; it simply forwards its
-# three arguments.
-#   _var   - name of the result cache variable
-#   _names - library name(s) to look for
-#   _doc   - documentation string for the cache entry
-macro(find_library_local_first _var _names _doc )
-  # cuda_find_library_local_first takes exactly three parameters, so no
-  # trailing empty path-prefix argument is passed here.
-  cuda_find_library_local_first( "${_var}" "${_names}" "${_doc}" )
-endmacro()
-
-
-# CUDA_LIBRARIES
-# Locate the runtime library; the result is cached in CUDA_CUDART_LIBRARY.
-cuda_find_library_local_first(CUDA_CUDART_LIBRARY cudart "\"cudart\" library")
-if(CUDA_VERSION VERSION_EQUAL "3.0")
-  # The cudartemu library only existed for the 3.0 version of CUDA.
-  cuda_find_library_local_first(CUDA_CUDARTEMU_LIBRARY cudartemu "\"cudartemu\" library")
-  mark_as_advanced(
-    CUDA_CUDARTEMU_LIBRARY
-    )
-endif()
-
-# CUPTI library showed up in cuda toolkit 4.0
-# It is searched for below the extras/CUPTI/ sub-directory of the toolkit.
-if(NOT CUDA_VERSION VERSION_LESS "4.0")
-  cuda_find_library_local_first_with_path_ext(CUDA_cupti_LIBRARY cupti "\"cupti\" library" "extras/CUPTI/")
-  mark_as_advanced(CUDA_cupti_LIBRARY)
-endif()
-
-# CUDA_LIBRARIES defaults to the regular runtime library.  It is replaced by
-# the emulation runtime only when emulation mode is requested and the
-# cudartemu library was actually found.
-set(CUDA_LIBRARIES ${CUDA_CUDART_LIBRARY})
-if(CUDA_BUILD_EMULATION AND CUDA_CUDARTEMU_LIBRARY)
-  set(CUDA_LIBRARIES ${CUDA_CUDARTEMU_LIBRARY})
-endif()
-# if(APPLE)
-#   # We need to add the path to cudart to the linker using rpath, since the
-#   # library name for the cuda libraries is prepended with @rpath.
-#   if(CUDA_BUILD_EMULATION AND CUDA_CUDARTEMU_LIBRARY)
-#     get_filename_component(_cuda_path_to_cudart "${CUDA_CUDARTEMU_LIBRARY}" PATH)
-#   else()
-#     get_filename_component(_cuda_path_to_cudart "${CUDA_CUDART_LIBRARY}" PATH)
-#   endif()
-#   if(_cuda_path_to_cudart)
-#     list(APPEND CUDA_LIBRARIES -Wl,-rpath "-Wl,${_cuda_path_to_cudart}")
-#   endif()
-# endif()
-
-# Look for the "cuda" library.  The 1.1 toolkit on Linux does not appear to
-# ship it as a separate library on some platforms.
-cuda_find_library_local_first(CUDA_CUDA_LIBRARY cuda "\"cuda\" library (older versions only).")
-
-# Hide both library locations from the non-advanced cache view.
-mark_as_advanced(CUDA_CUDA_LIBRARY CUDA_CUDART_LIBRARY)
-
-#######################
-# Look for some of the toolkit helper libraries
-#   _name - library name; the result is cached in CUDA_<_name>_LIBRARY and
-#           marked as advanced.
-macro(FIND_CUDA_HELPER_LIBS _name)
-  cuda_find_library_local_first(CUDA_${_name}_LIBRARY ${_name} "\"${_name}\" library")
-  mark_as_advanced(CUDA_${_name}_LIBRARY)
-endmacro()
-
-#######################
-# Emulation builds are rejected for any toolkit newer than 3.0: stop the
-# configure step with a fatal error if the option is still switched on.
-if(CUDA_BUILD_EMULATION AND CUDA_VERSION VERSION_GREATER "3.0")
-  message(FATAL_ERROR "CUDA_BUILD_EMULATION is not supported in version 3.1 and onwards.  You must disable it to proceed.  You have version ${CUDA_VERSION}.")
-endif()
-
-# Search for additional CUDA toolkit libraries.
-# Each find_cuda_helper_libs(<name>) call caches its result in
-# CUDA_<name>_LIBRARY.
-if(CUDA_VERSION VERSION_LESS "3.1")
-  # Emulation libraries aren't available in version 3.1 onward.
-  find_cuda_helper_libs(cufftemu)
-  find_cuda_helper_libs(cublasemu)
-endif()
-find_cuda_helper_libs(cufft)
-find_cuda_helper_libs(cublas)
-if(NOT CUDA_VERSION VERSION_LESS "3.2")
-  # cusparse showed up in version 3.2
-  find_cuda_helper_libs(cusparse)
-  find_cuda_helper_libs(curand)
-  if (WIN32)
-    find_cuda_helper_libs(nvcuvenc)
-    find_cuda_helper_libs(nvcuvid)
-  endif()
-endif()
-if(CUDA_VERSION VERSION_GREATER "5.0")
-  # In CUDA 5.5 NPP was split into 3 separate libraries.
-  find_cuda_helper_libs(nppc)
-  find_cuda_helper_libs(nppi)
-  find_cuda_helper_libs(npps)
-  # Keep CUDA_npp_LIBRARY available as a list of the three parts.
-  set(CUDA_npp_LIBRARY "${CUDA_nppc_LIBRARY};${CUDA_nppi_LIBRARY};${CUDA_npps_LIBRARY}")
-elseif(NOT CUDA_VERSION VERSION_LESS "4.0")
-  find_cuda_helper_libs(npp)
-endif()
-
-# CUFFT/CUBLAS link lists: start from the device libraries and switch to the
-# emulation variants when an emulation build is requested.
-set(CUDA_CUFFT_LIBRARIES ${CUDA_cufft_LIBRARY})
-set(CUDA_CUBLAS_LIBRARIES ${CUDA_cublas_LIBRARY})
-if(CUDA_BUILD_EMULATION)
-  set(CUDA_CUFFT_LIBRARIES ${CUDA_cufftemu_LIBRARY})
-  set(CUDA_CUBLAS_LIBRARIES ${CUDA_cublasemu_LIBRARY})
-endif()
-
-########################
-# Look for the SDK stuff.  As of CUDA 3.0 NVSDKCUDA_ROOT has been replaced with
-# NVSDKCOMPUTE_ROOT with the old CUDA C contents moved into the C subdirectory
-# The SDK root is identified by the presence of common/inc/cutil.h.
-find_path(CUDA_SDK_ROOT_DIR common/inc/cutil.h
- HINTS
-  "$ENV{NVSDKCOMPUTE_ROOT}/C"
-  ENV NVSDKCUDA_ROOT
-  "[HKEY_LOCAL_MACHINE\\SOFTWARE\\NVIDIA Corporation\\Installed Products\\NVIDIA SDK 10\\Compute;InstallDir]"
- PATHS
-  "/Developer/GPU\ Computing/C"
-  )
-
-# Keep the CUDA_SDK_ROOT_DIR first in order to be able to override the
-# environment variables.
-# CUDA_SDK_SEARCH_PATH is a list of candidate SDK locations for use by
-# project-side find_path/find_library calls.
-set(CUDA_SDK_SEARCH_PATH
-  "${CUDA_SDK_ROOT_DIR}"
-  "${CUDA_TOOLKIT_ROOT_DIR}/local/NVSDK0.2"
-  "${CUDA_TOOLKIT_ROOT_DIR}/NVSDK0.2"
-  "${CUDA_TOOLKIT_ROOT_DIR}/NV_CUDA_SDK"
-  "$ENV{HOME}/NVIDIA_CUDA_SDK"
-  "$ENV{HOME}/NVIDIA_CUDA_SDK_MACOSX"
-  "/Developer/CUDA"
-  )
-
-# Example of how to find an include file from the CUDA_SDK_ROOT_DIR
-
-# find_path(CUDA_CUT_INCLUDE_DIR
-#   cutil.h
-#   PATHS ${CUDA_SDK_SEARCH_PATH}
-#   PATH_SUFFIXES "common/inc"
-#   DOC "Location of cutil.h"
-#   NO_DEFAULT_PATH
-#   )
-# # Now search system paths
-# find_path(CUDA_CUT_INCLUDE_DIR cutil.h DOC "Location of cutil.h")
-
-# mark_as_advanced(CUDA_CUT_INCLUDE_DIR)
-
-
-# Example of how to find a library in the CUDA_SDK_ROOT_DIR
-
-# # cutil library is called cutil64 for 64 bit builds on windows.  We don't want
-# # to get these confused, so we are setting the name based on the word size of
-# # the build.
-
-# if(CMAKE_SIZEOF_VOID_P EQUAL 8)
-#   set(cuda_cutil_name cutil64)
-# else()
-#   set(cuda_cutil_name cutil32)
-# endif()
-
-# find_library(CUDA_CUT_LIBRARY
-#   NAMES cutil ${cuda_cutil_name}
-#   PATHS ${CUDA_SDK_SEARCH_PATH}
-#   # The new version of the sdk shows up in common/lib, but the old one is in lib
-#   PATH_SUFFIXES "common/lib" "lib"
-#   DOC "Location of cutil library"
-#   NO_DEFAULT_PATH
-#   )
-# # Now search system paths
-# find_library(CUDA_CUT_LIBRARY NAMES cutil ${cuda_cutil_name} DOC "Location of cutil library")
-# mark_as_advanced(CUDA_CUT_LIBRARY)
-# set(CUDA_CUT_LIBRARIES ${CUDA_CUT_LIBRARY})
-
-
-
-#############################
-# Check for required components
-# CUDA_FOUND is set optimistically here; find_package_handle_standard_args
-# below performs the real check of the required variables.
-set(CUDA_FOUND TRUE)
-
-# Record the directory values used for this run in internal cache entries
-# (FORCE overwrites whatever was stored previously).
-set(CUDA_TOOLKIT_ROOT_DIR_INTERNAL "${CUDA_TOOLKIT_ROOT_DIR}" CACHE INTERNAL
-  "This is the value of the last time CUDA_TOOLKIT_ROOT_DIR was set successfully." FORCE)
-set(CUDA_TOOLKIT_TARGET_DIR_INTERNAL "${CUDA_TOOLKIT_TARGET_DIR}" CACHE INTERNAL
-  "This is the value of the last time CUDA_TOOLKIT_TARGET_DIR was set successfully." FORCE)
-set(CUDA_SDK_ROOT_DIR_INTERNAL "${CUDA_SDK_ROOT_DIR}" CACHE INTERNAL
-  "This is the value of the last time CUDA_SDK_ROOT_DIR was set successfully." FORCE)
-
-# The package is considered found only if the toolkit root, the nvcc
-# executable, the include directory and the cudart library are all set;
-# CUDA_VERSION is used for version checks.
-include(FindPackageHandleStandardArgs)
-find_package_handle_standard_args(CUDA
-  REQUIRED_VARS
-    CUDA_TOOLKIT_ROOT_DIR
-    CUDA_NVCC_EXECUTABLE
-    CUDA_INCLUDE_DIRS
-    CUDA_CUDART_LIBRARY
-  VERSION_VAR
-    CUDA_VERSION
-  )
-
-
-
-###############################################################################
-###############################################################################
-# Macros
-###############################################################################
-###############################################################################
-
-###############################################################################
-# Add include directories to pass to the nvcc command.
-# Every argument is appended to CUDA_NVCC_INCLUDE_ARGS_USER as "-I<dir>".
-# Because this is a macro, the list is modified in the caller's scope.
-macro(CUDA_INCLUDE_DIRECTORIES)
-  foreach(dir ${ARGN})
-    list(APPEND CUDA_NVCC_INCLUDE_ARGS_USER -I${dir})
-  endforeach()
-endmacro()
-
-
-##############################################################################
-# Locate the helper CMake scripts used by this module.
-# NOTE(review): cuda_find_helper_file is defined outside this section;
-# CUDA_WRAP_SRCS later reads CUDA_run_nvcc, so presumably each call sets
-# CUDA_<name> - confirm against the helper's definition.
-cuda_find_helper_file(parse_cubin cmake)
-cuda_find_helper_file(make2cmake cmake)
-cuda_find_helper_file(run_nvcc cmake)
-
-##############################################################################
-# Separate the OPTIONS out from the sources
-#
-# INPUT:
-#   ARGN           - mixed list of source files, add_library/add_executable
-#                    keywords and, after the literal word OPTIONS, extra options
-# OUTPUT (variable names supplied by the caller):
-#   _sources       - every argument that is neither a keyword nor an option
-#   _cmake_options - the keywords WIN32, MACOSX_BUNDLE, EXCLUDE_FROM_ALL,
-#                    STATIC, SHARED and MODULE that were present
-#   _options       - everything after OPTIONS that is not one of the keywords
-# Because this is a macro, the loop variable arg and _found_options are also
-# left behind in the caller's scope.
-macro(CUDA_GET_SOURCES_AND_OPTIONS _sources _cmake_options _options)
-  # Start with empty output lists.
-  set( ${_sources} )
-  set( ${_cmake_options} )
-  set( ${_options} )
-  set( _found_options FALSE )
-  foreach(arg ${ARGN})
-    if(arg STREQUAL "OPTIONS")
-      # Everything that follows is an option rather than a source file.
-      set( _found_options TRUE )
-    elseif(
-        arg STREQUAL "WIN32" OR
-        arg STREQUAL "MACOSX_BUNDLE" OR
-        arg STREQUAL "EXCLUDE_FROM_ALL" OR
-        arg STREQUAL "STATIC" OR
-        arg STREQUAL "SHARED" OR
-        arg STREQUAL "MODULE"
-        )
-      # Target keywords are collected regardless of where they appear.
-      list(APPEND ${_cmake_options} ${arg})
-    else()
-      if ( _found_options )
-        list(APPEND ${_options} ${arg})
-      else()
-        # Assume this is a file
-        list(APPEND ${_sources} ${arg})
-      endif()
-    endif()
-  endforeach()
-endmacro()
-
-##############################################################################
-# Parse the OPTIONS from ARGN and set the variables prefixed by _option_prefix
-#
-# INPUT:
-#   _option_prefix - name of the list variable that receives the options
-#   ARGN           - options, optionally interleaved with upper-case
-#                    configuration names taken from CUDA_configuration_types
-# OUTPUT:
-#   ${_option_prefix}          - options seen before any configuration name
-#   ${_option_prefix}_<CONFIG> - options seen after the name <CONFIG>
-# Because this is a macro, arg, config, config_upper and _found_config are
-# left behind in the caller's scope.
-macro(CUDA_PARSE_NVCC_OPTIONS _option_prefix)
-  set( _found_config )
-  foreach(arg ${ARGN})
-    # Determine if we are dealing with a per-configuration flag
-    foreach(config ${CUDA_configuration_types})
-      string(TOUPPER ${config} config_upper)
-      if (arg STREQUAL "${config_upper}")
-        set( _found_config _${arg})
-        # Unset arg to keep it from being processed further
-        set( arg )
-      endif()
-    endforeach()
-
-    # Test for "still defined" rather than for truthiness: if(arg) would
-    # silently drop option values that CMake treats as false constants, such
-    # as 0, OFF, N or anything ending in -NOTFOUND.
-    if ( DEFINED arg )
-      list(APPEND ${_option_prefix}${_found_config} "${arg}")
-    endif()
-  endforeach()
-endmacro()
-
-##############################################################################
-# Adds CUDA_INCLUDE_DIRS to the include directories of the current directory,
-# unless an entry equal to it is already part of the INCLUDE_DIRECTORIES
-# directory property.
-function(CUDA_ADD_CUDA_INCLUDE_ONCE)
-  get_directory_property(_current_includes INCLUDE_DIRECTORIES)
-  set(_already_present FALSE)
-  if(_current_includes)
-    foreach(_entry ${_current_includes})
-      if("${_entry}" STREQUAL "${CUDA_INCLUDE_DIRS}")
-        set(_already_present TRUE)
-      endif()
-    endforeach()
-  endif()
-  if(NOT _already_present)
-    include_directories(${CUDA_INCLUDE_DIRS})
-  endif()
-endfunction()
-
-# Decide which library type keyword has to be added to a target's arguments.
-#   shared_flag - name of the variable set in the caller's scope
-#   ARGN        - the arguments given to the library command
-# If ARGN already names SHARED, MODULE or STATIC the result is empty.
-# Otherwise it is SHARED when BUILD_SHARED_LIBS is true and STATIC when not.
-function(CUDA_BUILD_SHARED_LIBRARY shared_flag)
-  set(_given_args ${ARGN})
-  set(_type_is_explicit FALSE)
-  foreach(_keyword SHARED MODULE STATIC)
-    list(FIND _given_args ${_keyword} _keyword_index)
-    if(NOT _keyword_index LESS 0)
-      set(_type_is_explicit TRUE)
-    endif()
-  endforeach()
-
-  if(_type_is_explicit)
-    # Nothing to add; this clears the variable in the caller's scope.
-    set(${shared_flag} PARENT_SCOPE)
-  elseif(BUILD_SHARED_LIBS)
-    set(${shared_flag} SHARED PARENT_SCOPE)
-  else()
-    set(${shared_flag} STATIC PARENT_SCOPE)
-  endif()
-endfunction()
-
-##############################################################################
-# Derives a relative directory from a source file path, used to keep apart
-# files that share a basename but live in different directories.  CMake itself
-# only adds such a path when a conflict is detected; that cannot be copied
-# here because the first file has already been handled by the time a second,
-# clashing name shows up.  The relative path is therefore added to all files.
-#   path       - source file path, absolute or relative
-#   build_path - name of the variable in the caller's scope that receives the
-#                sanitized directory part
-function(CUDA_COMPUTE_BUILD_PATH path build_path)
-  # Work on a CMake style path only.
-  file(TO_CMAKE_PATH "${path}" _relative)
-  if(IS_ABSOLUTE "${_relative}")
-    # Absolute paths are generally unnecessary, especially when the files were
-    # collected with something like file(GLOB_RECURSE).  Paths starting with
-    # the current binary dir are made relative to it, all others relative to
-    # the current source dir.
-    set(_anchor "${CMAKE_CURRENT_SOURCE_DIR}")
-    string(FIND "${_relative}" "${CMAKE_CURRENT_BINARY_DIR}" _binary_prefix_pos)
-    if(_binary_prefix_pos EQUAL 0)
-      set(_anchor "${CMAKE_CURRENT_BINARY_DIR}")
-    endif()
-    file(RELATIVE_PATH _relative "${_anchor}" "${_relative}")
-  endif()
-
-  # The sanitizing steps follow the recipe of
-  # cmLocalGenerator::CreateSafeUniqueObjectFileName in the CMake source:
-  # strip leading slashes, then replace ':' (absolute paths), "../" (paths
-  # going up the tree) and spaces.
-  string(REGEX REPLACE "^[/]+" "" _relative "${_relative}")
-  string(REPLACE ":" "_" _relative "${_relative}")
-  string(REPLACE "../" "__/" _relative "${_relative}")
-  string(REPLACE " " "_" _relative "${_relative}")
-
-  # The file name is dropped last: removing it earlier could turn
-  # path/../basename into path/.. before the "../" replacement has run.
-  get_filename_component(_relative "${_relative}" PATH)
-
-  set(${build_path} "${_relative}" PARENT_SCOPE)
-endfunction()
-
-##############################################################################
-# This helper macro populates the following variables and setups up custom
-# commands and targets to invoke the nvcc compiler to generate C or PTX source
-# dependent upon the format parameter.  The compiler is invoked once with -M
-# to generate a dependency file and a second time with -cuda or -ptx to generate
-# a .cpp or .ptx file.
-# INPUT:
-#   cuda_target         - Target name
-#   format              - PTX or OBJ
-#   FILE1 .. FILEN      - The remaining arguments are the sources to be wrapped.
-#   OPTIONS             - Extra options to NVCC
-# OUTPUT:
-#   generated_files     - List of generated files
-##############################################################################
-##############################################################################
-
-# NOTE: CUDA_WRAP_SRCS is a macro, so every variable set in its body
-# (nvcc_flags, CUDA_C_OR_CXX, generated_extension, ...) is created in the
-# caller's scope.
-macro(CUDA_WRAP_SRCS cuda_target format generated_files)
-
-  # If CMake doesn't support separable compilation, complain
-  if(CUDA_SEPARABLE_COMPILATION AND CMAKE_VERSION VERSION_LESS "2.8.10.1")
-    message(SEND_ERROR "CUDA_SEPARABLE_COMPILATION isn't supported for CMake versions less than 2.8.10.1")
-  endif()
-
-  # Set up all the command line flags here, so that they can be overridden on a per target basis.
-
-  set(nvcc_flags "")
-
-  # Emulation if the card isn't present.
-  if (CUDA_BUILD_EMULATION)
-    # Emulation.
-    set(nvcc_flags ${nvcc_flags} --device-emulation -D_DEVICEEMU -g)
-  else()
-    # Device mode.  No flags necessary.
-  endif()
-
-  # Select the host language; it determines which CMAKE_<LANG>_* variables
-  # are read further down.
-  if(CUDA_HOST_COMPILATION_CPP)
-    set(CUDA_C_OR_CXX CXX)
-  else()
-    if(CUDA_VERSION VERSION_LESS "3.0")
-      set(nvcc_flags ${nvcc_flags} --host-compilation C)
-    else()
-      message(WARNING "--host-compilation flag is deprecated in CUDA version >= 3.0.  Removing --host-compilation C flag" )
-    endif()
-    set(CUDA_C_OR_CXX C)
-  endif()
-
-  # Object files use the extension of the selected host language.
-  set(generated_extension ${CMAKE_${CUDA_C_OR_CXX}_OUTPUT_EXTENSION})
-
-  if(CUDA_64_BIT_DEVICE_CODE)
-    set(nvcc_flags ${nvcc_flags} -m64)
-  else()
-    set(nvcc_flags ${nvcc_flags} -m32)
-  endif()
-
-  if(CUDA_TARGET_CPU_ARCH)
-    set(nvcc_flags ${nvcc_flags} "--target-cpu-architecture=${CUDA_TARGET_CPU_ARCH}")
-  endif()
-
-  # This needs to be passed in at this stage, because VS needs to fill out the
-  # value of VCInstallDir from within VS.  Note that CCBIN is only used if
-  # -ccbin or --compiler-bindir isn't used and CUDA_HOST_COMPILER matches
-  # $(VCInstallDir)/bin.
-  if(CMAKE_GENERATOR MATCHES "Visual Studio")
-    set(ccbin_flags -D "\"CCBIN:PATH=$(VCInstallDir)bin\"" )
-  else()
-    set(ccbin_flags)
-  endif()
-
-  # Figure out which configure we will use and pass that in as an argument to
-  # the script.  We need to defer the decision until compilation time, because
-  # for VS projects we won't know if we are making a debug or release build
-  # until build time.
-  if(CMAKE_GENERATOR MATCHES "Visual Studio")
-    set( CUDA_build_configuration "$(ConfigurationName)" )
-  else()
-    set( CUDA_build_configuration "${CMAKE_BUILD_TYPE}")
-  endif()
-
-  # Initialize our list of includes with the user ones followed by the CUDA system ones.
-  set(CUDA_NVCC_INCLUDE_ARGS ${CUDA_NVCC_INCLUDE_ARGS_USER} "-I${CUDA_INCLUDE_DIRS}")
-  # Get the include directories for this directory and use them for our nvcc command.
-  # Remove duplicate entries which may be present since include_directories
-  # in CMake >= 2.8.8 does not remove them.
-  get_directory_property(CUDA_NVCC_INCLUDE_DIRECTORIES INCLUDE_DIRECTORIES)
-  list(REMOVE_DUPLICATES CUDA_NVCC_INCLUDE_DIRECTORIES)
-  if(CUDA_NVCC_INCLUDE_DIRECTORIES)
-    foreach(dir ${CUDA_NVCC_INCLUDE_DIRECTORIES})
-      list(APPEND CUDA_NVCC_INCLUDE_ARGS -I${dir})
-    endforeach()
-  endif()
-
-  # Reset these variables
-  set(CUDA_WRAP_OPTION_NVCC_FLAGS)
-  foreach(config ${CUDA_configuration_types})
-    string(TOUPPER ${config} config_upper)
-    set(CUDA_WRAP_OPTION_NVCC_FLAGS_${config_upper})
-  endforeach()
-
-  # Split ARGN into sources, target keywords and options, then sort the
-  # options into the general and the per-configuration flag lists.
-  CUDA_GET_SOURCES_AND_OPTIONS(_cuda_wrap_sources _cuda_wrap_cmake_options _cuda_wrap_options ${ARGN})
-  CUDA_PARSE_NVCC_OPTIONS(CUDA_WRAP_OPTION_NVCC_FLAGS ${_cuda_wrap_options})
-
-  # Figure out if we are building a shared library.  BUILD_SHARED_LIBS is
-  # respected in CUDA_ADD_LIBRARY.
-  set(_cuda_build_shared_libs FALSE)
-  # SHARED, MODULE
-  list(FIND _cuda_wrap_cmake_options SHARED _cuda_found_SHARED)
-  list(FIND _cuda_wrap_cmake_options MODULE _cuda_found_MODULE)
-  if(_cuda_found_SHARED GREATER -1 OR _cuda_found_MODULE GREATER -1)
-    set(_cuda_build_shared_libs TRUE)
-  endif()
-  # STATIC
-  # (checked last, so STATIC wins if it is given together with SHARED/MODULE)
-  list(FIND _cuda_wrap_cmake_options STATIC _cuda_found_STATIC)
-  if(_cuda_found_STATIC GREATER -1)
-    set(_cuda_build_shared_libs FALSE)
-  endif()
-
-  # CUDA_HOST_FLAGS
-  if(_cuda_build_shared_libs)
-    # If we are setting up code for a shared library, then we need to add extra flags for
-    # compiling objects for shared libraries.
-    set(CUDA_HOST_SHARED_FLAGS ${CMAKE_SHARED_LIBRARY_${CUDA_C_OR_CXX}_FLAGS})
-  else()
-    set(CUDA_HOST_SHARED_FLAGS)
-  endif()
-  # Only add the CMAKE_{C,CXX}_FLAGS if we are propagating host flags.  We
-  # always need to set the SHARED_FLAGS, though.
-  # _cuda_host_flags holds CMake source text (set(...) commands), not a flag
-  # list.
-  if(CUDA_PROPAGATE_HOST_FLAGS)
-    set(_cuda_host_flags "set(CMAKE_HOST_FLAGS ${CMAKE_${CUDA_C_OR_CXX}_FLAGS} ${CUDA_HOST_SHARED_FLAGS})")
-  else()
-    set(_cuda_host_flags "set(CMAKE_HOST_FLAGS ${CUDA_HOST_SHARED_FLAGS})")
-  endif()
-
-  set(_cuda_nvcc_flags_config "# Build specific configuration flags")
-  # Loop over all the configuration types to generate appropriate flags for run_nvcc.cmake
-  foreach(config ${CUDA_configuration_types})
-    string(TOUPPER ${config} config_upper)
-    # CMAKE_FLAGS are strings and not lists.  By not putting quotes around CMAKE_FLAGS
-    # we convert the strings to lists (like we want).
-
-    if(CUDA_PROPAGATE_HOST_FLAGS)
-      # nvcc chokes on -g3 in versions previous to 3.0, so replace it with -g
-      set(_cuda_fix_g3 FALSE)
-
-      if(CMAKE_COMPILER_IS_GNUCC)
-        if (CUDA_VERSION VERSION_LESS  "3.0" OR
-            CUDA_VERSION VERSION_EQUAL "4.1" OR
-            CUDA_VERSION VERSION_EQUAL "4.2"
-            )
-          set(_cuda_fix_g3 TRUE)
-        endif()
-      endif()
-      if(_cuda_fix_g3)
-        string(REPLACE "-g3" "-g" _cuda_C_FLAGS "${CMAKE_${CUDA_C_OR_CXX}_FLAGS_${config_upper}}")
-      else()
-        set(_cuda_C_FLAGS "${CMAKE_${CUDA_C_OR_CXX}_FLAGS_${config_upper}}")
-      endif()
-
-      set(_cuda_host_flags "${_cuda_host_flags}\nset(CMAKE_HOST_FLAGS_${config_upper} ${_cuda_C_FLAGS})")
-    endif()
-
-    # Note that if we ever want CUDA_NVCC_FLAGS_<CONFIG> to be string (instead of a list
-    # like it is currently), we can remove the quotes around the
-    # ${CUDA_NVCC_FLAGS_${config_upper}} variable like the CMAKE_HOST_FLAGS_<CONFIG> variable.
-    set(_cuda_nvcc_flags_config "${_cuda_nvcc_flags_config}\nset(CUDA_NVCC_FLAGS_${config_upper} ${CUDA_NVCC_FLAGS_${config_upper}} ;; ${CUDA_WRAP_OPTION_NVCC_FLAGS_${config_upper}})")
-  endforeach()
-
-  # Get the list of definitions from the directory property
-  get_directory_property(CUDA_NVCC_DEFINITIONS COMPILE_DEFINITIONS)
-  if(CUDA_NVCC_DEFINITIONS)
-    foreach(_definition ${CUDA_NVCC_DEFINITIONS})
-      list(APPEND nvcc_flags "-D${_definition}")
-    endforeach()
-  endif()
-
-  # Objects of shared libraries additionally get -D<target>_EXPORTS.
-  if(_cuda_build_shared_libs)
-    list(APPEND nvcc_flags "-D${cuda_target}_EXPORTS")
-  endif()
-
-  # Reset the output variable
-  set(_cuda_wrap_generated_files "")
-
-  # Iterate over the macro arguments and create custom
-  # commands for all the .cu files.
-  # ARGN still contains the keywords and options; only entries that end in
-  # .cu and are not marked HEADER_FILE_ONLY pass the test below.
-  foreach(file ${ARGN})
-    # Ignore any file marked as a HEADER_FILE_ONLY
-    get_source_file_property(_is_header ${file} HEADER_FILE_ONLY)
-    if(${file} MATCHES ".*\\.cu$" AND NOT _is_header)
-
-      # Allow per source file overrides of the format.
-      get_source_file_property(_cuda_source_format ${file} CUDA_SOURCE_PROPERTY_FORMAT)
-      if(NOT _cuda_source_format)
-        set(_cuda_source_format ${format})
-      endif()
-
-      if( ${_cuda_source_format} MATCHES "PTX" )
-        set( compile_to_ptx ON )
-      elseif( ${_cuda_source_format} MATCHES "OBJ")
-        set( compile_to_ptx OFF )
-      else()
-        message( FATAL_ERROR "Invalid format flag passed to CUDA_WRAP_SRCS for file '${file}': '${_cuda_source_format}'.  Use OBJ or PTX.")
-      endif()
-
-
-      if(compile_to_ptx)
-        # Don't use any of the host compilation flags for PTX targets.
-        set(CUDA_HOST_FLAGS)
-        set(CUDA_NVCC_FLAGS_CONFIG)
-      else()
-        set(CUDA_HOST_FLAGS ${_cuda_host_flags})
-        set(CUDA_NVCC_FLAGS_CONFIG ${_cuda_nvcc_flags_config})
-      endif()
-
-      # Determine output directory
-      cuda_compute_build_path("${file}" cuda_build_path)
-      set(cuda_compile_intermediate_directory "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${cuda_target}.dir/${cuda_build_path}")
-      if(CUDA_GENERATED_OUTPUT_DIR)
-        set(cuda_compile_output_dir "${CUDA_GENERATED_OUTPUT_DIR}")
-      else()
-        if ( compile_to_ptx )
-          set(cuda_compile_output_dir "${CMAKE_CURRENT_BINARY_DIR}")
-        else()
-          set(cuda_compile_output_dir "${cuda_compile_intermediate_directory}")
-        endif()
-      endif()
-
-      # Add a custom target to generate a c or ptx file. ######################
-
-      get_filename_component( basename ${file} NAME )
-      if( compile_to_ptx )
-        set(generated_file_path "${cuda_compile_output_dir}")
-        set(generated_file_basename "${cuda_target}_generated_${basename}.ptx")
-        set(format_flag "-ptx")
-        file(MAKE_DIRECTORY "${cuda_compile_output_dir}")
-      else()
-        set(generated_file_path "${cuda_compile_output_dir}/${CMAKE_CFG_INTDIR}")
-        set(generated_file_basename "${cuda_target}_generated_${basename}${generated_extension}")
-        if(CUDA_SEPARABLE_COMPILATION)
-          set(format_flag "-dc")
-        else()
-          set(format_flag "-c")
-        endif()
-      endif()
-
-      # Set all of our file names.  Make sure that whatever filenames that have
-      # generated_file_path in them get passed in through as a command line
-      # argument, so that the ${CMAKE_CFG_INTDIR} gets expanded at run time
-      # instead of configure time.
-      set(generated_file "${generated_file_path}/${generated_file_basename}")
-      set(cmake_dependency_file "${cuda_compile_intermediate_directory}/${generated_file_basename}.depend")
-      set(NVCC_generated_dependency_file "${cuda_compile_intermediate_directory}/${generated_file_basename}.NVCC-depend")
-      set(generated_cubin_file "${generated_file_path}/${generated_file_basename}.cubin.txt")
-      set(custom_target_script "${cuda_compile_intermediate_directory}/${generated_file_basename}.cmake")
-
-      # Setup properties for obj files:
-      if( NOT compile_to_ptx )
-        set_source_files_properties("${generated_file}"
-          PROPERTIES
-          EXTERNAL_OBJECT true # This is an object file not to be compiled, but only be linked.
-          )
-      endif()
-
-      # Don't add CMAKE_CURRENT_SOURCE_DIR if the path is already an absolute path.
-      get_filename_component(file_path "${file}" PATH)
-      if(IS_ABSOLUTE "${file_path}")
-        set(source_file "${file}")
-      else()
-        set(source_file "${CMAKE_CURRENT_SOURCE_DIR}/${file}")
-      endif()
-
-      # Remember the objects that need the separable compilation link step.
-      if( NOT compile_to_ptx AND CUDA_SEPARABLE_COMPILATION)
-        list(APPEND ${cuda_target}_SEPARABLE_COMPILATION_OBJECTS "${generated_file}")
-      endif()
-
-      # Bring in the dependencies.  Creates a variable CUDA_NVCC_DEPEND #######
-      cuda_include_nvcc_dependencies(${cmake_dependency_file})
-
-      # Convenience string for output #########################################
-      if(CUDA_BUILD_EMULATION)
-        set(cuda_build_type "Emulation")
-      else()
-        set(cuda_build_type "Device")
-      endif()
-
-      # Build the NVCC made dependency file ###################################
-      # A cubin is only requested for non-emulation object builds when
-      # CUDA_BUILD_CUBIN is on.
-      set(build_cubin OFF)
-      if ( NOT CUDA_BUILD_EMULATION AND CUDA_BUILD_CUBIN )
-         if ( NOT compile_to_ptx )
-           set ( build_cubin ON )
-         endif()
-      endif()
-
-      # Configure the build script
-      # (@ONLY: only @VAR@ style references in the template are replaced.)
-      configure_file("${CUDA_run_nvcc}" "${custom_target_script}" @ONLY)
-
-      # So if a user specifies the same cuda file as input more than once, you
-      # can have bad things happen with dependencies.  Here we check an option
-      # to see if this is the behavior they want.
-      if(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE)
-        set(main_dep MAIN_DEPENDENCY ${source_file})
-      else()
-        set(main_dep DEPENDS ${source_file})
-      endif()
-
-      if(CUDA_VERBOSE_BUILD)
-        set(verbose_output ON)
-      elseif(CMAKE_GENERATOR MATCHES "Makefiles")
-        set(verbose_output "$(VERBOSE)")
-      else()
-        set(verbose_output OFF)
-      endif()
-
-      # Create up the comment string
-      file(RELATIVE_PATH generated_file_relative_path "${CMAKE_BINARY_DIR}" "${generated_file}")
-      if(compile_to_ptx)
-        set(cuda_build_comment_string "Building NVCC ptx file ${generated_file_relative_path}")
-      else()
-        set(cuda_build_comment_string "Building NVCC (${cuda_build_type}) object ${generated_file_relative_path}")
-      endif()
-
-      # Build the generated file and dependency file ##########################
-      add_custom_command(
-        OUTPUT ${generated_file}
-        # These output files depend on the source_file and the contents of cmake_dependency_file
-        ${main_dep}
-        DEPENDS ${CUDA_NVCC_DEPEND}
-        DEPENDS ${custom_target_script}
-        # Make sure the output directory exists before trying to write to it.
-        COMMAND ${CMAKE_COMMAND} -E make_directory "${generated_file_path}"
-        COMMAND ${CMAKE_COMMAND} ARGS
-          -D verbose:BOOL=${verbose_output}
-          ${ccbin_flags}
-          -D build_configuration:STRING=${CUDA_build_configuration}
-          -D "generated_file:STRING=${generated_file}"
-          -D "generated_cubin_file:STRING=${generated_cubin_file}"
-          -P "${custom_target_script}"
-        WORKING_DIRECTORY "${cuda_compile_intermediate_directory}"
-        COMMENT "${cuda_build_comment_string}"
-        )
-
-      # Make sure the build system knows the file is generated.
-      set_source_files_properties(${generated_file} PROPERTIES GENERATED TRUE)
-
-      list(APPEND _cuda_wrap_generated_files ${generated_file})
-
-      # Add the other files that we want cmake to clean on a cleanup ##########
-      list(APPEND CUDA_ADDITIONAL_CLEAN_FILES "${cmake_dependency_file}")
-      list(REMOVE_DUPLICATES CUDA_ADDITIONAL_CLEAN_FILES)
-      set(CUDA_ADDITIONAL_CLEAN_FILES ${CUDA_ADDITIONAL_CLEAN_FILES} CACHE INTERNAL "List of intermediate files that are part of the cuda dependency scanning.")
-
-    endif()
-  endforeach()
-
-  # Set the return parameter
-  set(${generated_files} ${_cuda_wrap_generated_files})
-endmacro()
-
-# Pick out of a host compiler flag string the flags that are forwarded to the
-# separable compilation link step.
-#   important_flags - name of a list variable in the caller's scope; the
-#                     matches are appended to it
-#   flag_string     - host compiler flags as a single string; may be empty
-# For Visual Studio generators the runtime library switches (/MD, /MT, /MDd,
-# /MTd) are extracted, for every other generator each occurrence of -fPIC.
-function(_cuda_get_important_host_flags important_flags flag_string)
-  # flag_string is quoted so that an empty value still counts as the input
-  # argument of string(REGEX MATCHALL); unquoted it would vanish and make the
-  # command fail for lack of arguments.
-  if(CMAKE_GENERATOR MATCHES "Visual Studio")
-    string(REGEX MATCHALL "/M[DT][d]?" flags "${flag_string}")
-  else()
-    string(REGEX MATCHALL "-fPIC" flags "${flag_string}")
-  endif()
-  list(APPEND ${important_flags} ${flags})
-  set(${important_flags} ${${important_flags}} PARENT_SCOPE)
-endfunction()
-
-###############################################################################
-###############################################################################
-# Separable Compilation Link
-###############################################################################
-###############################################################################
-
-# Work out the name of the intermediate link object that
-# CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS produces for a target.
-#   output_file_var - name of the result variable in the caller's scope
-#   cuda_target     - target name
-#   object_files    - separable compilation objects of the target
-# The result is the empty string when object_files evaluates to false.
-function(CUDA_COMPUTE_SEPARABLE_COMPILATION_OBJECT_FILE_NAME output_file_var cuda_target object_files)
-  set(_intermediate_link_object "")
-  if(object_files)
-    set(_intermediate_link_object "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${cuda_target}.dir/${CMAKE_CFG_INTDIR}/${cuda_target}_intermediate_link${CMAKE_${CUDA_C_OR_CXX}_OUTPUT_EXTENSION}")
-  endif()
-
-  set(${output_file_var} "${_intermediate_link_object}" PARENT_SCOPE)
-endfunction()
-
-# Setup the build rule for the separable compilation intermediate link file.
-#   output_file  - path of the intermediate link object to create
-#   cuda_target  - target the object belongs to
-#   options      - nvcc options; per-configuration options are parsed but not
-#                  used for the link step
-#   object_files - separable compilation objects to link; nothing is done
-#                  when this evaluates to false
-function(CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS output_file cuda_target options object_files)
-  if (object_files)
-
-    set_source_files_properties("${output_file}"
-      PROPERTIES
-      EXTERNAL_OBJECT TRUE # This is an object file not to be compiled, but only
-                           # be linked.
-      GENERATED TRUE       # This file is generated during the build
-      )
-
-    # For now we are ignoring all the configuration specific flags.
-    set(nvcc_flags)
-    CUDA_PARSE_NVCC_OPTIONS(nvcc_flags ${options})
-    if(CUDA_64_BIT_DEVICE_CODE)
-      list(APPEND nvcc_flags -m64)
-    else()
-      list(APPEND nvcc_flags -m32)
-    endif()
-    # If -ccbin, --compiler-bindir has been specified, don't do anything.  Otherwise add it here.
-    list( FIND nvcc_flags "-ccbin" ccbin_found0 )
-    list( FIND nvcc_flags "--compiler-bindir" ccbin_found1 )
-    if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
-      list(APPEND nvcc_flags -ccbin "\"${CUDA_HOST_COMPILER}\"")
-    endif()
-    # Forward the important host flags of every configuration, each wrapped
-    # in a generator expression so it only applies to its own configuration.
-    set(flags)
-    foreach(config ${CUDA_configuration_types})
-      string(TOUPPER ${config} config_upper)
-      set(important_host_flags)
-      # An empty per-configuration flag string has nothing to contribute.
-      # Passing it unquoted would drop the argument altogether and abort the
-      # configure step with a wrong-argument-count error, so such strings are
-      # skipped and all others are passed quoted as one argument.
-      set(_cuda_config_host_flags "${CMAKE_${CUDA_C_OR_CXX}_FLAGS_${config_upper}}")
-      string(LENGTH "${_cuda_config_host_flags}" _cuda_config_host_flags_length)
-      if(_cuda_config_host_flags_length GREATER 0)
-        _cuda_get_important_host_flags(important_host_flags "${_cuda_config_host_flags}")
-      endif()
-      foreach(f ${important_host_flags})
-        list(APPEND flags $<$<CONFIG:${config}>:-Xcompiler> $<$<CONFIG:${config}>:${f}>)
-      endforeach()
-    endforeach()
-    file(RELATIVE_PATH output_file_relative_path "${CMAKE_BINARY_DIR}" "${output_file}")
-
-    # Some generators don't handle the multiple levels of custom command
-    # dependencies correctly (obj1 depends on file1, obj2 depends on obj1), so
-    # we work around that issue by compiling the intermediate link object as a
-    # pre-link custom command in that situation.
-    set(do_obj_build_rule TRUE)
-    if (MSVC_VERSION GREATER 1599)
-      # VS 2010 and 2012 have this problem.  If future versions fix this issue,
-      # it should still work, it just won't be as nice as the other method.
-      set(do_obj_build_rule FALSE)
-    endif()
-
-    if (do_obj_build_rule)
-      add_custom_command(
-        OUTPUT ${output_file}
-        DEPENDS ${object_files}
-        COMMAND ${CUDA_NVCC_EXECUTABLE} ${nvcc_flags} -dlink ${object_files} -o ${output_file}
-        ${flags}
-        COMMENT "Building NVCC intermediate link file ${output_file_relative_path}"
-        )
-    else()
-      add_custom_command(
-        TARGET ${cuda_target}
-        PRE_LINK
-        COMMAND ${CMAKE_COMMAND} -E echo "Building NVCC intermediate link file ${output_file_relative_path}"
-        COMMAND ${CUDA_NVCC_EXECUTABLE} ${nvcc_flags} ${flags} -dlink ${object_files} -o "${output_file}"
-        )
-    endif()
-  endif()
-endfunction()
-
-###############################################################################
-###############################################################################
-# ADD LIBRARY
-###############################################################################
-###############################################################################
-macro(CUDA_ADD_LIBRARY cuda_target)
-
-  CUDA_ADD_CUDA_INCLUDE_ONCE()
-
-  # Separate the sources from the options
-  CUDA_GET_SOURCES_AND_OPTIONS(_sources _cmake_options _options ${ARGN})
-  CUDA_BUILD_SHARED_LIBRARY(_cuda_shared_flag ${ARGN})
-  # Create custom commands and targets for each file.
-  CUDA_WRAP_SRCS( ${cuda_target} OBJ _generated_files ${_sources}
-    ${_cmake_options} ${_cuda_shared_flag}
-    OPTIONS ${_options} )
-
-  # Compute the file name of the intermedate link file used for separable
-  # compilation.
-  CUDA_COMPUTE_SEPARABLE_COMPILATION_OBJECT_FILE_NAME(link_file ${cuda_target} "${${cuda_target}_SEPARABLE_COMPILATION_OBJECTS}")
-
-  # Add the library.
-  add_library(${cuda_target} ${_cmake_options}
-    ${_generated_files}
-    ${_sources}
-    ${link_file}
-    )
-
-  # Add a link phase for the separable compilation if it has been enabled.  If
-  # it has been enabled then the ${cuda_target}_SEPARABLE_COMPILATION_OBJECTS
-  # variable will have been defined.
-  CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS("${link_file}" ${cuda_target} "${_options}" "${${cuda_target}_SEPARABLE_COMPILATION_OBJECTS}")
-
-  target_link_libraries(${cuda_target}
-    ${CUDA_LIBRARIES}
-    )
-
-  # We need to set the linker language based on what the expected generated file
-  # would be. CUDA_C_OR_CXX is computed based on CUDA_HOST_COMPILATION_CPP.
-  set_target_properties(${cuda_target}
-    PROPERTIES
-    LINKER_LANGUAGE ${CUDA_C_OR_CXX}
-    )
-
-endmacro()
-
-
-###############################################################################
-###############################################################################
-# ADD EXECUTABLE
-###############################################################################
-###############################################################################
-macro(CUDA_ADD_EXECUTABLE cuda_target)
-
-  CUDA_ADD_CUDA_INCLUDE_ONCE()
-
-  # Separate the sources from the options
-  CUDA_GET_SOURCES_AND_OPTIONS(_sources _cmake_options _options ${ARGN})
-  # Create custom commands and targets for each file.
-  CUDA_WRAP_SRCS( ${cuda_target} OBJ _generated_files ${_sources} OPTIONS ${_options} )
-
-  # Compute the file name of the intermedate link file used for separable
-  # compilation.
-  CUDA_COMPUTE_SEPARABLE_COMPILATION_OBJECT_FILE_NAME(link_file ${cuda_target} "${${cuda_target}_SEPARABLE_COMPILATION_OBJECTS}")
-
-  # Add the library.
-  add_executable(${cuda_target} ${_cmake_options}
-    ${_generated_files}
-    ${_sources}
-    ${link_file}
-    )
-
-  # Add a link phase for the separable compilation if it has been enabled.  If
-  # it has been enabled then the ${cuda_target}_SEPARABLE_COMPILATION_OBJECTS
-  # variable will have been defined.
-  CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS("${link_file}" ${cuda_target} "${_options}" "${${cuda_target}_SEPARABLE_COMPILATION_OBJECTS}")
-
-  target_link_libraries(${cuda_target}
-    ${CUDA_LIBRARIES}
-    )
-
-  # We need to set the linker language based on what the expected generated file
-  # would be. CUDA_C_OR_CXX is computed based on CUDA_HOST_COMPILATION_CPP.
-  set_target_properties(${cuda_target}
-    PROPERTIES
-    LINKER_LANGUAGE ${CUDA_C_OR_CXX}
-    )
-
-endmacro()
-
-
-###############################################################################
-###############################################################################
-# CUDA COMPILE
-###############################################################################
-###############################################################################
-macro(CUDA_COMPILE generated_files)
-
-  # Separate the sources from the options
-  CUDA_GET_SOURCES_AND_OPTIONS(_sources _cmake_options _options ${ARGN})
-  # Create custom commands and targets for each file.
-  CUDA_WRAP_SRCS( cuda_compile OBJ _generated_files ${_sources} ${_cmake_options}
-    OPTIONS ${_options} )
-
-  set( ${generated_files} ${_generated_files})
-
-endmacro()
-
-
-###############################################################################
-###############################################################################
-# CUDA COMPILE PTX
-###############################################################################
-###############################################################################
-macro(CUDA_COMPILE_PTX generated_files)
-
-  # Separate the sources from the options
-  CUDA_GET_SOURCES_AND_OPTIONS(_sources _cmake_options _options ${ARGN})
-  # Create custom commands and targets for each file.
-  CUDA_WRAP_SRCS( cuda_compile_ptx PTX _generated_files ${_sources} ${_cmake_options}
-    OPTIONS ${_options} )
-
-  set( ${generated_files} ${_generated_files})
-
-endmacro()
-
-###############################################################################
-###############################################################################
-# CUDA ADD CUFFT TO TARGET
-###############################################################################
-###############################################################################
-macro(CUDA_ADD_CUFFT_TO_TARGET target)
-  if (CUDA_BUILD_EMULATION)
-    target_link_libraries(${target} ${CUDA_cufftemu_LIBRARY})
-  else()
-    target_link_libraries(${target} ${CUDA_cufft_LIBRARY})
-  endif()
-endmacro()
-
-###############################################################################
-###############################################################################
-# CUDA ADD CUBLAS TO TARGET
-###############################################################################
-###############################################################################
-macro(CUDA_ADD_CUBLAS_TO_TARGET target)
-  if (CUDA_BUILD_EMULATION)
-    target_link_libraries(${target} ${CUDA_cublasemu_LIBRARY})
-  else()
-    target_link_libraries(${target} ${CUDA_cublas_LIBRARY})
-  endif()
-endmacro()
-
-###############################################################################
-###############################################################################
-# CUDA BUILD CLEAN TARGET
-###############################################################################
-###############################################################################
-macro(CUDA_BUILD_CLEAN_TARGET)
-  # Call this after you add all your CUDA targets, and you will get a convience
-  # target.  You should also make clean after running this target to get the
-  # build system to generate all the code again.
-
-  set(cuda_clean_target_name clean_cuda_depends)
-  if (CMAKE_GENERATOR MATCHES "Visual Studio")
-    string(TOUPPER ${cuda_clean_target_name} cuda_clean_target_name)
-  endif()
-  add_custom_target(${cuda_clean_target_name}
-    COMMAND ${CMAKE_COMMAND} -E remove ${CUDA_ADDITIONAL_CLEAN_FILES})
-
-  # Clear out the variable, so the next time we configure it will be empty.
-  # This is useful so that the files won't persist in the list after targets
-  # have been removed.
-  set(CUDA_ADDITIONAL_CLEAN_FILES "" CACHE INTERNAL "List of intermediate files that are part of the cuda dependency scanning.")
-endmacro()
diff --git a/src/cmake/FindCUDA/make2cmake.cmake b/src/cmake/FindCUDA/make2cmake.cmake
deleted file mode 100755
index 1b53d177..00000000
--- a/src/cmake/FindCUDA/make2cmake.cmake
+++ /dev/null
@@ -1,93 +0,0 @@
-#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
-#  Abe Stephens, SCI Institute -- http://www.sci.utah.edu/~abe/FindCuda.html
-#
-#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
-#
-#  Copyright (c) 2007-2009
-#  Scientific Computing and Imaging Institute, University of Utah
-#
-#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
-#  for the text of the license.
-
-# The MIT License
-#
-# License for the specific language governing rights and limitations under
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included
-# in all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-#
-
-#######################################################################
-# This converts a file written in makefile syntax into one that can be included
-# by CMake.
-
-file(READ ${input_file} depend_text)
-
-if (${depend_text} MATCHES ".+")
-
-  # message("FOUND DEPENDS")
-
-  # Remember, four backslashes is escaped to one backslash in the string.
-  string(REGEX REPLACE "\\\\ " " " depend_text ${depend_text})
-
-  # This works for the nvcc -M generated dependency files.
-  string(REGEX REPLACE "^.* : " "" depend_text ${depend_text})
-  string(REGEX REPLACE "[ \\\\]*\n" ";" depend_text ${depend_text})
-
-  set(dependency_list "")
-
-  foreach(file ${depend_text})
-
-    string(REGEX REPLACE "^ +" "" file ${file})
-
-    # OK, now if we had a UNC path, nvcc has a tendency to only output the first '/'
-    # instead of '//'.  Here we will test to see if the file exists, if it doesn't then
-    # try to prepend another '/' to the path and test again.  If it still fails remove the
-    # path.
-
-    if(NOT EXISTS "${file}")
-      if (EXISTS "/${file}")
-        set(file "/${file}")
-      else()
-        message(WARNING " Removing non-existent dependency file: ${file}")
-        set(file "")
-      endif()
-    endif()
-
-    if(NOT IS_DIRECTORY "${file}")
-      # If softlinks start to matter, we should change this to REALPATH.  For now we need
-      # to flatten paths, because nvcc can generate stuff like /bin/../include instead of
-      # just /include.
-      get_filename_component(file_absolute "${file}" ABSOLUTE)
-      list(APPEND dependency_list "${file_absolute}")
-    endif()
-
-  endforeach()
-
-else()
-  # message("FOUND NO DEPENDS")
-endif()
-
-# Remove the duplicate entries and sort them.
-list(REMOVE_DUPLICATES dependency_list)
-list(SORT dependency_list)
-
-foreach(file ${dependency_list})
-  set(cuda_nvcc_depend "${cuda_nvcc_depend} \"${file}\"\n")
-endforeach()
-
-file(WRITE ${output_file} "# Generated by: make2cmake.cmake\nSET(CUDA_NVCC_DEPEND\n ${cuda_nvcc_depend})\n\n")
diff --git a/src/cmake/FindCUDA/parse_cubin.cmake b/src/cmake/FindCUDA/parse_cubin.cmake
deleted file mode 100755
index e1905cfc..00000000
--- a/src/cmake/FindCUDA/parse_cubin.cmake
+++ /dev/null
@@ -1,110 +0,0 @@
-#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
-#  Abe Stephens, SCI Institute -- http://www.sci.utah.edu/~abe/FindCuda.html
-#
-#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
-#
-#  Copyright (c) 2007-2009
-#  Scientific Computing and Imaging Institute, University of Utah
-#
-#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
-#  for the text of the license.
-
-# The MIT License
-#
-# License for the specific language governing rights and limitations under
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included
-# in all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-#
-
-#######################################################################
-# Parses a .cubin file produced by nvcc and reports statistics about the file.
-
-
-file(READ ${input_file} file_text)
-
-if (${file_text} MATCHES ".+")
-
-  # Remember, four backslashes is escaped to one backslash in the string.
-  string(REGEX REPLACE ";" "\\\\;" file_text ${file_text})
-  string(REGEX REPLACE "\ncode" ";code" file_text ${file_text})
-
-  list(LENGTH file_text len)
-
-  foreach(line ${file_text})
-
-    # Only look at "code { }" blocks.
-    if(line MATCHES "^code")
-
-      # Break into individual lines.
-      string(REGEX REPLACE "\n" ";" line ${line})
-
-      foreach(entry ${line})
-
-        # Extract kernel names.
-        if (${entry} MATCHES "[^g]name = ([^ ]+)")
-          string(REGEX REPLACE ".* = ([^ ]+)" "\\1" entry ${entry})
-
-          # Check to see if the kernel name starts with "_"
-          set(skip FALSE)
-          # if (${entry} MATCHES "^_")
-            # Skip the rest of this block.
-            # message("Skipping ${entry}")
-            # set(skip TRUE)
-          # else ()
-            message("Kernel:    ${entry}")
-          # endif ()
-
-        endif()
-
-        # Skip the rest of the block if necessary
-        if(NOT skip)
-
-          # Registers
-          if (${entry} MATCHES "reg([ ]+)=([ ]+)([^ ]+)")
-            string(REGEX REPLACE ".*([ ]+)=([ ]+)([^ ]+)" "\\3" entry ${entry})
-            message("Registers: ${entry}")
-          endif()
-
-          # Local memory
-          if (${entry} MATCHES "lmem([ ]+)=([ ]+)([^ ]+)")
-            string(REGEX REPLACE ".*([ ]+)=([ ]+)([^ ]+)" "\\3" entry ${entry})
-            message("Local:     ${entry}")
-          endif()
-
-          # Shared memory
-          if (${entry} MATCHES "smem([ ]+)=([ ]+)([^ ]+)")
-            string(REGEX REPLACE ".*([ ]+)=([ ]+)([^ ]+)" "\\3" entry ${entry})
-            message("Shared:    ${entry}")
-          endif()
-
-          if (${entry} MATCHES "^}")
-            message("")
-          endif()
-
-        endif()
-
-
-      endforeach()
-
-    endif()
-
-  endforeach()
-
-else()
-  # message("FOUND NO DEPENDS")
-endif()
diff --git a/src/cmake/FindCUDA/run_nvcc.cmake b/src/cmake/FindCUDA/run_nvcc.cmake
deleted file mode 100755
index f0aac848..00000000
--- a/src/cmake/FindCUDA/run_nvcc.cmake
+++ /dev/null
@@ -1,288 +0,0 @@
-#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
-#
-#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
-#
-#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
-#  for the text of the license.
-
-# The MIT License
-#
-# License for the specific language governing rights and limitations under
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included
-# in all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-
-
-##########################################################################
-# This file runs the nvcc commands to produce the desired output file along with
-# the dependency file needed by CMake to compute dependencies.  In addition the
-# file checks the output of each command and if the command fails it deletes the
-# output files.
-
-# Input variables
-#
-# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
-#                          ON : Describe each step
-#
-# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
-#                               RelWithDebInfo, but it should match one of the
-#                               entries in CUDA_HOST_FLAGS. This is the build
-#                               configuration used when compiling the code.  If
-#                               blank or unspecified Debug is assumed as this is
-#                               what CMake does.
-#
-# generated_file:STRING=<> File to generate.  This argument must be passed in.
-#
-# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
-#                                                   in if build_cubin is true.
-
-if(NOT generated_file)
-  message(FATAL_ERROR "You must specify generated_file on the command line")
-endif()
-
-# Set these up as variables to make reading the generated file easier
-set(CMAKE_COMMAND "@CMAKE_COMMAND@") # path
-set(source_file "@source_file@") # path
-set(NVCC_generated_dependency_file "@NVCC_generated_dependency_file@") # path
-set(cmake_dependency_file "@cmake_dependency_file@") # path
-set(CUDA_make2cmake "@CUDA_make2cmake@") # path
-set(CUDA_parse_cubin "@CUDA_parse_cubin@") # path
-set(build_cubin @build_cubin@) # bool
-set(CUDA_HOST_COMPILER "@CUDA_HOST_COMPILER@") # bool
-# We won't actually use these variables for now, but we need to set this, in
-# order to force this file to be run again if it changes.
-set(generated_file_path "@generated_file_path@") # path
-set(generated_file_internal "@generated_file@") # path
-set(generated_cubin_file_internal "@generated_cubin_file@") # path
-
-set(CUDA_NVCC_EXECUTABLE "@CUDA_NVCC_EXECUTABLE@") # path
-set(CUDA_NVCC_FLAGS @CUDA_NVCC_FLAGS@ ;; @CUDA_WRAP_OPTION_NVCC_FLAGS@) # list
-@CUDA_NVCC_FLAGS_CONFIG@
-set(nvcc_flags @nvcc_flags@) # list
-set(CUDA_NVCC_INCLUDE_ARGS "@CUDA_NVCC_INCLUDE_ARGS@") # list (needs to be in quotes to handle spaces properly).
-set(format_flag "@format_flag@") # string
-
-if(build_cubin AND NOT generated_cubin_file)
-  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
-endif()
-
-# This is the list of host compilation flags.  It C or CXX should already have
-# been chosen by FindCUDA.cmake.
-@CUDA_HOST_FLAGS@
-
-# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
-set(nvcc_host_compiler_flags "")
-# If we weren't given a build_configuration, use Debug.
-if(NOT build_configuration)
-  set(build_configuration Debug)
-endif()
-string(TOUPPER "${build_configuration}" build_configuration)
-#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
-foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
-  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
-  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
-endforeach()
-if (nvcc_host_compiler_flags)
-  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
-endif()
-#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
-# Add the build specific configuration flags
-list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
-
-# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
-list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
-list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
-if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
-  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
-    set(CCBIN -ccbin "${CCBIN}")
-  else()
-    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
-  endif()
-endif()
-
-# cuda_execute_process - Executes a command with optional command echo and status message.
-#
-#   status  - Status message to print if verbose is true
-#   command - COMMAND argument from the usual execute_process argument structure
-#   ARGN    - Remaining arguments are the command with arguments
-#
-#   CUDA_result - return value from running the command
-#
-# Make this a macro instead of a function, so that things like RESULT_VARIABLE
-# and other return variables are present after executing the process.
-macro(cuda_execute_process status command)
-  set(_command ${command})
-  if(NOT _command STREQUAL "COMMAND")
-    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
-  endif()
-  if(verbose)
-    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
-    # Now we need to build up our command string.  We are accounting for quotes
-    # and spaces, anything else is left up to the user to fix if they want to
-    # copy and paste a runnable command line.
-    set(cuda_execute_process_string)
-    foreach(arg ${ARGN})
-      # If there are quotes, excape them, so they come through.
-      string(REPLACE "\"" "\\\"" arg ${arg})
-      # Args with spaces need quotes around them to get them to be parsed as a single argument.
-      if(arg MATCHES " ")
-        list(APPEND cuda_execute_process_string "\"${arg}\"")
-      else()
-        list(APPEND cuda_execute_process_string ${arg})
-      endif()
-    endforeach()
-    # Echo the command
-    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
-  endif()
-  # Run the command
-  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
-endmacro()
-
-# Delete the target file
-cuda_execute_process(
-  "Removing ${generated_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-  )
-
-# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
-# for dependency generation and hope for the best.
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
-set(CUDA_VERSION @CUDA_VERSION@)
-if(CUDA_VERSION VERSION_LESS "3.0")
-  cmake_policy(PUSH)
-  # CMake policy 0007 NEW states that empty list elements are not
-  # ignored.  I'm just setting it to avoid the warning that's printed.
-  cmake_policy(SET CMP0007 NEW)
-  # Note that this will remove all occurances of -G.
-  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
-  cmake_policy(POP)
-endif()
-
-# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
-# can cause incorrect dependencies when #including files based on this macro which is
-# defined in the generating passes of nvcc invokation.  We will go ahead and manually
-# define this for now until a future version fixes this bug.
-set(CUDACC_DEFINE -D__CUDACC__)
-
-# Generate the dependency file
-cuda_execute_process(
-  "Generating dependency file: ${NVCC_generated_dependency_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  -M
-  ${CUDACC_DEFINE}
-  "${source_file}"
-  -o "${NVCC_generated_dependency_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${depends_CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the cmake readable dependency file to a temp file.  Don't put the
-# quotes just around the filenames for the input_file and output_file variables.
-# CMake will pass the quotes through and not be able to find the file.
-cuda_execute_process(
-  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
-  COMMAND "${CMAKE_COMMAND}"
-  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
-  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
-  -P "${CUDA_make2cmake}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Copy the file if it is different
-cuda_execute_process(
-  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Delete the temporary file
-cuda_execute_process(
-  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the code
-cuda_execute_process(
-  "Generating ${generated_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  "${source_file}"
-  ${format_flag} -o "${generated_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
-  cuda_execute_process(
-    "Removing ${generated_file}"
-    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-    )
-  message(FATAL_ERROR "Error generating file ${generated_file}")
-else()
-  if(verbose)
-    message("Generated ${generated_file} successfully.")
-  endif()
-endif()
-
-# Cubin resource report commands.
-if( build_cubin )
-  # Run with -cubin to produce resource usage report.
-  cuda_execute_process(
-    "Generating ${generated_cubin_file}"
-    COMMAND "${CUDA_NVCC_EXECUTABLE}"
-    "${source_file}"
-    ${CUDA_NVCC_FLAGS}
-    ${nvcc_flags}
-    ${CCBIN}
-    ${nvcc_host_compiler_flags}
-    -DNVCC
-    -cubin
-    -o "${generated_cubin_file}"
-    ${CUDA_NVCC_INCLUDE_ARGS}
-    )
-
-  # Execute the parser script.
-  cuda_execute_process(
-    "Executing the parser script"
-    COMMAND  "${CMAKE_COMMAND}"
-    -D "input_file:STRING=${generated_cubin_file}"
-    -P "${CUDA_parse_cubin}"
-    )
-
-endif()
diff --git a/src/popsift/common/assist.h b/src/popsift/common/assist.h
index e1531eb2..50e6fe6f 100644
--- a/src/popsift/common/assist.h
+++ b/src/popsift/common/assist.h
@@ -7,9 +7,17 @@
  */
 #pragma once
 
+#include <popsift/sift_config.h>
+
 #include <cuda_runtime.h>
 #include <iostream>
-#include <pthread.h> // for pthread_self
+#include <thread>
+#ifdef _WIN32
+#include <windows.h>
+#else
+#include <unistd.h>
+#endif
+
 
 
 namespace popsift
@@ -17,6 +25,37 @@ namespace popsift
 
 std::ostream& operator<<( std::ostream& ostr, const dim3& p );
 
+/*
+ * Warp shuffle/vote wrappers: call the *_sync builtins with a full mask (0xffffffff) if available, else the legacy ones
+ */
+#if POPSIFT_IS_DEFINED(POPSIFT_HAVE_SHFL_DOWN_SYNC)
+template<typename T> __device__ inline T shuffle     ( T variable, int src   ) { return __shfl_sync     ( 0xffffffff, variable, src   ); }
+template<typename T> __device__ inline T shuffle_up  ( T variable, int delta ) { return __shfl_up_sync  ( 0xffffffff, variable, delta ); }
+template<typename T> __device__ inline T shuffle_down( T variable, int delta ) { return __shfl_down_sync( 0xffffffff, variable, delta ); }
+template<typename T> __device__ inline T shuffle_xor ( T variable, int delta ) { return __shfl_xor_sync ( 0xffffffff, variable, delta ); }
+__device__ inline unsigned int ballot( unsigned int pred ) { return __ballot_sync   ( 0xffffffff, pred ); }
+__device__ inline int any            ( unsigned int pred ) { return __any_sync      ( 0xffffffff, pred ); }
+__device__ inline int all            ( unsigned int pred ) { return __all_sync      ( 0xffffffff, pred ); }
+
+template<typename T> __device__ inline T shuffle     ( T variable, int src  , int ws ) { return __shfl_sync     ( 0xffffffff, variable, src  , ws ); }
+template<typename T> __device__ inline T shuffle_up  ( T variable, int delta, int ws ) { return __shfl_up_sync  ( 0xffffffff, variable, delta, ws ); }
+template<typename T> __device__ inline T shuffle_down( T variable, int delta, int ws ) { return __shfl_down_sync( 0xffffffff, variable, delta, ws ); }
+template<typename T> __device__ inline T shuffle_xor ( T variable, int delta, int ws ) { return __shfl_xor_sync ( 0xffffffff, variable, delta, ws ); }
+#else // fall back to the builtins without the _sync suffix (same wrapper signatures)
+template<typename T> __device__ inline T shuffle     ( T variable, int src   ) { return __shfl     ( variable, src   ); }
+template<typename T> __device__ inline T shuffle_up  ( T variable, int delta ) { return __shfl_up  ( variable, delta ); }
+template<typename T> __device__ inline T shuffle_down( T variable, int delta ) { return __shfl_down( variable, delta ); }
+template<typename T> __device__ inline T shuffle_xor ( T variable, int delta ) { return __shfl_xor ( variable, delta ); }
+__device__ inline unsigned int ballot( unsigned int pred ) { return __ballot   ( pred ); }
+__device__ inline int any            ( unsigned int pred ) { return __any      ( pred ); }
+__device__ inline int all            ( unsigned int pred ) { return __all      ( pred ); }
+
+template<typename T> __device__ inline T shuffle     ( T variable, int src  , int ws ) { return __shfl     ( variable, src  , ws ); }
+template<typename T> __device__ inline T shuffle_up  ( T variable, int delta, int ws ) { return __shfl_up  ( variable, delta, ws ); }
+template<typename T> __device__ inline T shuffle_down( T variable, int delta, int ws ) { return __shfl_down( variable, delta, ws ); }
+template<typename T> __device__ inline T shuffle_xor ( T variable, int delta, int ws ) { return __shfl_xor ( variable, delta, ws ); }
+#endif // POPSIFT_IS_DEFINED(POPSIFT_HAVE_SHFL_DOWN_SYNC)
+
 /* This computation is needed very frequently when a dim3 grid block is
  * initialized. It ensure that the tail is not forgotten.
  */
@@ -43,6 +82,10 @@ float readTex( cudaTextureObject_t tex, float x, float y )
     return tex2D<float>( tex, x+0.5f, y+0.5f );
 }
 
+inline std::thread::id getCurrentThreadId() // portable replacement for the former pthread_self() call
+{
+    return std::this_thread::get_id(); // id of the host thread that is executing this call
+}
 
 /*********************************************************************************
  * For a debug output to cerr with thread ID at the line start
@@ -61,6 +104,50 @@ static inline unsigned int microhash( int val )
                        ^ ( ( val & ( 0xf << 28 ) ) >> 28 ) );
     return ret;
 }
-#define DERR std::cerr << std::hex << popsift::microhash(pthread_self()) << std::dec << "    "
 
-}; // namespace popsift
+static inline unsigned int microhash( const std::thread::id& id ) // overload: short hash of a thread id
+{
+    std::hash<std::thread::id> hasher; // std::thread::id is opaque, so reduce it to a number first
+    return microhash( hasher(id) ); // the hash value is implicitly converted to int for the integer overload
+}
+// Debug stream prefix; both helpers are fully qualified so DERR also expands outside namespace popsift.
+#define DERR std::cerr << std::hex << popsift::microhash(popsift::getCurrentThreadId()) << std::dec << "    "
+
+
+__host__
+static size_t getPageSize() // returns the memory page size reported by the operating system
+{
+#ifdef _WIN32
+    SYSTEM_INFO si;
+    GetSystemInfo(&si);
+    return si.dwPageSize; // Windows: page size field filled in by GetSystemInfo
+#else
+    return sysconf(_SC_PAGESIZE); // POSIX: page size queried through sysconf
+#endif
+}
+
+static void* memalign(size_t alignment, size_t size) // aligned allocation; release with memalign_free()
+{
+#ifdef _WIN32
+    return _aligned_malloc(size, alignment); // note the swapped argument order: (size, alignment)
+#else
+    void* ret;
+    int err = posix_memalign( &ret, alignment, size );
+    if( err != 0 ) {
+        errno = err; // posix_memalign returns its error code instead of setting errno, so store it here
+        ret = nullptr; // failure is signalled to the caller as a null pointer
+    }
+    return ret;
+#endif
+}
+
+static void memalign_free( void* ptr ) // counterpart of memalign(): picks the matching deallocator
+{
+#ifdef _WIN32
+    _aligned_free( ptr ); // pairs with _aligned_malloc
+#else
+    free( ptr ); // pairs with posix_memalign
+#endif
+}
+
+} // namespace popsift
diff --git a/src/popsift/common/clamp.h b/src/popsift/common/clamp.h
index 50705f05..03a56386 100644
--- a/src/popsift/common/clamp.h
+++ b/src/popsift/common/clamp.h
@@ -7,6 +7,8 @@
  */
 #pragma once
 
+#include <cstdint>
+
 template<class T>
 __device__ __host__
 inline T clamp( T val, uint32_t maxval )
diff --git a/src/popsift/common/debug_macros.cu b/src/popsift/common/debug_macros.cu
index 514b6e4b..c9155248 100755
--- a/src/popsift/common/debug_macros.cu
+++ b/src/popsift/common/debug_macros.cu
@@ -7,7 +7,7 @@
  */
 #include "debug_macros.h"
 
-#include <assert.h>
+#include <cassert>
 
 using namespace std;
 
@@ -20,12 +20,7 @@ void pop_sync_check_last_error( const char* file, size_t line )
 void pop_check_last_error( const char* file, size_t line )
 {
     cudaError_t err = cudaGetLastError( );
-    if( err != cudaSuccess ) {
-        std::cerr << __FILE__ << ":" << __LINE__ << std::endl
-                  << "    called from " << file << ":" << line << std::endl
-                  << "    cudaGetLastError failed: " << cudaGetErrorString(err) << std::endl;
-        exit( -__LINE__ );
-    }
+    if( err != cudaSuccess ) { POP_FATAL_FL( "cudaGetLastError failed: " << cudaGetErrorString(err), file, line ); } // report the call site passed in by the caller
 }
 
 namespace popsift { namespace cuda {
@@ -34,11 +29,7 @@ void malloc_dev( void** ptr, int sz,
 {
     cudaError_t err;
     err = cudaMalloc( ptr, sz );
-    if( err != cudaSuccess ) {
-        std::cerr << file << ":" << line << std::endl
-                  << "    cudaMalloc failed: " << cudaGetErrorString(err) << std::endl;
-        exit( -__LINE__ );
-    }
+    if( err != cudaSuccess ) { POP_FATAL_FL( "cudaMalloc failed: " << cudaGetErrorString(err), file, line ); } // report the call site passed in by the caller
 #ifdef DEBUG_INIT_DEVICE_ALLOCATIONS
     popsift::cuda::memset_sync( *ptr, 0, sz, file, line );
 #endif // NDEBUG
@@ -51,11 +42,7 @@ void malloc_hst( void** ptr, int sz,
 {
     cudaError_t err;
     err = cudaMallocHost( ptr, sz );
-    if( err != cudaSuccess ) {
-        std::cerr << file << ":" << line << std::endl
-                  << "    cudaMallocHost failed: " << cudaGetErrorString(err) << std::endl;
-        exit( -__LINE__ );
-    }
+    if( err != cudaSuccess ) { POP_FATAL_FL( "cudaMallocHost failed: " << cudaGetErrorString(err), file, line ); } // report the call site passed in by the caller
 #ifdef DEBUG_INIT_DEVICE_ALLOCATIONS
     memset( *ptr, 0, sz );
 #endif // NDEBUG
@@ -74,16 +61,13 @@ void memcpy_async( void* dst, const void* src, size_t sz,
     cudaError_t err;
     err = cudaMemcpyAsync( dst, src, sz, type, stream );
     if( err != cudaSuccess ) {
-        cerr << file << ":" << line << endl
-             << "    " << "Failed to copy "
-             << (type==cudaMemcpyHostToDevice?"host-to-device":"device-to-host")
-             << ": ";
-        cerr << cudaGetErrorString(err) << endl;
-        cerr << "    src ptr=" << hex << (size_t)src << dec << endl
-             << "    dst ptr=" << hex << (size_t)dst << dec << endl;
-        exit( -__LINE__ );
+        std::stringstream msg; // named msg (not ss) so it cannot collide with a stream local to the POP_FATAL* macros
+        msg << "Failed to copy " << (type == cudaMemcpyHostToDevice ? "host-to-device" : "device-to-host") << ": ";
+        msg << cudaGetErrorString(err) << endl;
+        msg << "    src ptr=" << hex << (size_t)src << dec << endl
+            << "    dst ptr=" << hex << (size_t)dst << dec; // POP_FATAL_FL appends the final newline
+        POP_FATAL_FL(msg.str(), file, line); // report the caller's location, as the removed code did
     }
-    POP_CUDA_FATAL_TEST( err, "Failed to copy host-to-device: " );
 }
 
 void memcpy_sync( void* dst, const void* src, size_t sz, cudaMemcpyKind type, const char* file, size_t line )
@@ -95,37 +79,27 @@ void memcpy_sync( void* dst, const void* src, size_t sz, cudaMemcpyKind type, co
     cudaError_t err;
     err = cudaMemcpy( dst, src, sz, type );
     if( err != cudaSuccess ) {
-        cerr << "    " << "Failed to copy "
-             << (type==cudaMemcpyHostToDevice?"host-to-device":"device-to-host")
-             << ": ";
-        cerr << cudaGetErrorString(err) << endl;
-        cerr << "    src ptr=" << hex << (size_t)src << dec << endl
-             << "    dst ptr=" << hex << (size_t)dst << dec << endl;
-        exit( -__LINE__ );
+        std::stringstream msg; // named msg (not ss) so it cannot collide with a stream local to the POP_FATAL* macros
+        msg << "Failed to copy " << (type == cudaMemcpyHostToDevice ? "host-to-device" : "device-to-host") << ": ";
+        msg << cudaGetErrorString(err) << endl;
+        msg << "    src ptr=" << hex << (size_t)src << dec << endl
+            << "    dst ptr=" << hex << (size_t)dst << dec; // POP_FATAL_FL appends the final newline
+        POP_FATAL_FL(msg.str(), file, line); // file/line identify the call site of memcpy_sync
     }
-    POP_CUDA_FATAL_TEST( err, "Failed to copy host-to-device: " );
 }
 
 void memset_async( void* ptr, int value, size_t bytes, cudaStream_t stream, const char* file, size_t line )
 {
     cudaError_t err;
     err = cudaMemsetAsync( ptr, value, bytes, stream );
-    if( err != cudaSuccess ) {
-        std::cerr << file << ":" << line << std::endl
-                  << "    cudaMemsetAsync failed: " << cudaGetErrorString(err) << std::endl;
-        exit( -__LINE__ );
-    }
+    if( err != cudaSuccess ) { POP_FATAL_FL( "cudaMemsetAsync failed: " << cudaGetErrorString(err), file, line ); } // report the call site passed in by the caller
 }
 
 void memset_sync( void* ptr, int value, size_t bytes, const char* file, size_t line )
 {
     cudaError_t err;
     err = cudaMemset( ptr, value, bytes );
-    if( err != cudaSuccess ) {
-        std::cerr << file << ":" << line << std::endl
-                  << "    cudaMemset failed: " << cudaGetErrorString(err) << std::endl;
-        exit( -__LINE__ );
-    }
+    if( err != cudaSuccess ) { POP_FATAL_FL( "cudaMemset failed: " << cudaGetErrorString(err), file, line ); } // report the call site passed in by the caller
 }
 } }
 
@@ -135,68 +109,44 @@ cudaStream_t stream_create( const char* file, size_t line )
     cudaStream_t stream;
     cudaError_t err;
     err = cudaStreamCreate( &stream );
-    if( err != cudaSuccess ) {
-        std::cerr << file << ":" << line << std::endl
-                  << "    cudaStreamCreate failed: " << cudaGetErrorString(err) << std::endl;
-        exit( -__LINE__ );
-    }
+    if( err != cudaSuccess ) { POP_FATAL_FL( "cudaStreamCreate failed: " << cudaGetErrorString(err), file, line ); } // report the call site passed in by the caller
     return stream;
 }
 void stream_destroy( cudaStream_t s, const char* file, size_t line )
 {
     cudaError_t err;
     err = cudaStreamDestroy( s );
-    if( err != cudaSuccess ) {
-        std::cerr << file << ":" << line << std::endl
-                  << "    cudaStreamDestroy failed: " << cudaGetErrorString(err) << std::endl;
-        exit( -__LINE__ );
-    }
+    if( err != cudaSuccess ) { POP_FATAL_FL( "cudaStreamDestroy failed: " << cudaGetErrorString(err), file, line ); } // report the call site passed in by the caller
 }
 cudaEvent_t event_create( const char* file, size_t line )
 {
     cudaEvent_t ev;
     cudaError_t err;
     err = cudaEventCreate( &ev );
-    if( err != cudaSuccess ) {
-        std::cerr << file << ":" << line << std::endl
-                  << "    cudaEventCreate failed: " << cudaGetErrorString(err) << std::endl;
-        exit( -__LINE__ );
-    }
+    if( err != cudaSuccess ) { POP_FATAL_FL( "cudaEventCreate failed: " << cudaGetErrorString(err), file, line ); } // report the call site passed in by the caller
     return ev;
 }
 void event_destroy( cudaEvent_t ev, const char* file, size_t line )
 {
     cudaError_t err;
     err = cudaEventDestroy( ev );
-    if( err != cudaSuccess ) {
-        std::cerr << file << ":" << line << std::endl
-                  << "    cudaEventDestroy failed: " << cudaGetErrorString(err) << std::endl;
-        exit( -__LINE__ );
-    }
+    if( err != cudaSuccess ) { POP_FATAL_FL( "cudaEventDestroy failed: " << cudaGetErrorString(err), file, line ); } // report the call site passed in by the caller
 }
 void event_record( cudaEvent_t ev, cudaStream_t s, const char* file, size_t line )
 {
     cudaError_t err;
     err = cudaEventRecord( ev, s );
-    if( err != cudaSuccess ) {
-        std::cerr << file << ":" << line << std::endl
-                  << "    cudaEventRecord failed: " << cudaGetErrorString(err) << std::endl;
-        exit( -__LINE__ );
-    }
+    if( err != cudaSuccess ) { POP_FATAL_FL( "cudaEventRecord failed: " << cudaGetErrorString(err), file, line ); } // report the call site passed in by the caller
 }
 void event_wait( cudaEvent_t ev, cudaStream_t s, const char* file, size_t line )
 {
     cudaError_t err;
     err = cudaStreamWaitEvent( s, ev, 0 );
-    if( err != cudaSuccess ) {
-        std::cerr << file << ":" << line << std::endl
-                  << "    cudaStreamWaitEvent failed: " << cudaGetErrorString(err) << std::endl;
-        exit( -__LINE__ );
-    }
+    if( err != cudaSuccess ) { POP_FATAL_FL( "cudaStreamWaitEvent failed: " << cudaGetErrorString(err), file, line ); } // report the call site passed in by the caller
 }
 
 float event_diff( cudaEvent_t from, cudaEvent_t to )
-{   
+{
     float ms;
     cudaEventElapsedTime( &ms, from, to );
     return ms;
diff --git a/src/popsift/common/debug_macros.h b/src/popsift/common/debug_macros.h
index 71b271fb..a497750c 100755
--- a/src/popsift/common/debug_macros.h
+++ b/src/popsift/common/debug_macros.h
@@ -7,12 +7,15 @@
  */
 #pragma once
 
-#include <iostream>
+#include <cuda_runtime.h>
+
+#include <cassert>
+#include <cstdlib>
 #include <iomanip>
+#include <iostream>
+#include <stdexcept>
 #include <string>
-#include <stdlib.h>
-#include <assert.h>
-#include <cuda_runtime.h>
+#include <sstream>
 
 // synchronize device and check for an error
 void pop_sync_check_last_error( const char* file, size_t line );
@@ -116,14 +119,18 @@ class BriefDuration
 };
 };
 
-#define POP_FATAL(s) { \
-        std::cerr << __FILE__ << ":" << __LINE__ << std::endl << "    " << s << std::endl; \
-        exit( -__LINE__ ); \
+#define POP_FATAL(s) \
+    { /* Throws std::runtime_error carrying "<file>:<line>" and s. The stream name must not clash with names used in s (e.g. ss.str()). */ \
+        std::stringstream pop_fatal_ss_; \
+        pop_fatal_ss_ << __FILE__ << ":" << __LINE__ << std::endl << "    " << s; \
+        throw std::runtime_error{pop_fatal_ss_.str()}; \
     }
 
-#define POP_FATAL_FL(s,file,line) { \
-        std::cerr << file << ":" << line << std::endl << "    " << s << std::endl; \
-        exit( -__LINE__ ); \
+#define POP_FATAL_FL(s, file, line) \
+    { /* Like POP_FATAL, but reports the given file/line. The stream name must not clash with names used in s (e.g. ss.str()). */ \
+        std::stringstream pop_fatal_ss_; \
+        pop_fatal_ss_ << file << ":" << line << std::endl << "    " << s << std::endl; \
+        throw std::runtime_error{pop_fatal_ss_.str()}; \
     }
 
 #define POP_CHECK_NON_NULL(ptr,s) if( ptr == 0 ) { POP_FATAL_FL(s,__FILE__,__LINE__); }
@@ -134,14 +141,24 @@ class BriefDuration
 // #define POP_INFO(s) cerr << __FILE__ << ":" << __LINE__ << std::endl << "    " << s << endl
 
 #define POP_INFO2(silent,s) \
-    if (not silent) { \
+    if (!(silent)) { /* parenthesised so that an expression argument is negated as a whole */ \
         std::cerr << __FILE__ << ":" << __LINE__ << std::endl << "    " << s << std::endl; \
     }
 
-#define POP_CUDA_FATAL(err,s) { \
+#define POP_WARN(s) { \
+        std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \
+        std::cerr << "    WARNING: " << s << std::endl; \
+    }
+#define POP_CUDA_WARN(err,s) { \
         std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \
-        std::cerr << "    " << s << cudaGetErrorString(err) << std::endl; \
-        exit( -__LINE__ ); \
+        std::cerr << "    WARNING: " << s << cudaGetErrorString(err) << std::endl; \
+    }
+#define POP_CUDA_FATAL(err,s) \
+    { /* Throws std::runtime_error with "<file>:<line>", s and the CUDA error text. The stream name must not clash with names used in s. */ \
+        std::stringstream pop_fatal_ss_; \
+        pop_fatal_ss_ << __FILE__ << ":" << __LINE__ << std::endl; \
+        pop_fatal_ss_ << "    " << s << cudaGetErrorString(err) << std::endl; \
+        throw std::runtime_error{pop_fatal_ss_.str()}; \
     }
 #define POP_CUDA_FATAL_TEST(err,s) if( err != cudaSuccess ) { POP_CUDA_FATAL(err,s); }
 
diff --git a/src/popsift/common/device_prop.cu b/src/popsift/common/device_prop.cu
index 14bf75ef..44d47c1d 100644
--- a/src/popsift/common/device_prop.cu
+++ b/src/popsift/common/device_prop.cu
@@ -5,30 +5,45 @@
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
  */
+#include "debug_macros.h"
+#include "device_prop.h"
 #include <iostream>
 #include <sstream>
 
-#include "device_prop.h"
-#include "debug_macros.h"
-
 namespace popsift { namespace cuda {
 
 using namespace std;
 
+void reset()
+{
+    cudaDeviceReset(); // the returned cudaError_t is discarded, so a failure is not reported from here
+}
+
+void sync()
+{
+    cudaDeviceSynchronize(); // the returned cudaError_t is discarded, so a failure is not reported from here
+}
+
 device_prop_t::device_prop_t( )
 {
+    int         currentDevice;
     cudaError_t err;
 
+    err = cudaGetDevice( &currentDevice ); // remembered so the device that was current on entry can be re-selected below
+    POP_CUDA_FATAL_TEST( err, "Cannot get the current CUDA device: " ); // the macro appends the CUDA error text directly
+
     err = cudaGetDeviceCount( &_num_devices );
     POP_CUDA_FATAL_TEST( err, "Cannot count devices" );
 
-    for( int n=0; n<_num_devices; n++ ) {
-        cudaDeviceProp* p;
-        _properties.push_back( p = new cudaDeviceProp );
-        err = cudaGetDeviceProperties( p, n );
+    _properties.resize(_num_devices);
+
+    for( int n=0; n<_num_devices; ++n ) {
+        _properties[n] = new cudaDeviceProp;
+        err = cudaGetDeviceProperties( _properties[n], n );
         POP_CUDA_FATAL_TEST( err, "Cannot get properties for a device" );
     }
-    err = cudaSetDevice( 0 );
+
+    err = cudaSetDevice( currentDevice );
     POP_CUDA_FATAL_TEST( err, "Cannot set device 0" );
 }
 
@@ -86,5 +101,221 @@ device_prop_t::~device_prop_t( )
     }
 }
 
+bool device_prop_t::checkLimit_2DtexLinear( int& width, int& height, bool printWarn ) const
+{
+    bool        returnSuccess = true;
+    int         currentDevice;
+    cudaError_t err;
+    // Clamp width/height to what the current device supports for 2D textures on linear memory.
+    err = cudaGetDevice( &currentDevice );
+    if( err != cudaSuccess )
+    {
+        POP_CUDA_WARN( err, "Cannot get current CUDA device: " );
+        return true; // nothing can be verified; the request is reported as acceptable
+    }
+    // _properties is filled by the constructor; guard the index without a signed/unsigned comparison.
+    if( currentDevice < 0 || static_cast<size_t>(currentDevice) >= _properties.size() )
+    {
+        POP_WARN( "CUDA device was not registered at program start" );
+        return true;
+    }
+    // Linear 2D textures have their own limits (maxTexture2DLinear), distinct from the layered ones.
+    const cudaDeviceProp* ptr = _properties[currentDevice];
+    if( width > ptr->maxTexture2DLinear[0] )
+    {
+        if( printWarn )
+        {
+            std::cerr << __FILE__ << ":" << __LINE__
+                      << ": CUDA device " << currentDevice << std::endl
+                      << "    does not support 2D linear textures " << width
+                      << " pixels wide." << endl;
+        }
+        width = ptr->maxTexture2DLinear[0];
+        returnSuccess = false;
+    }
+    if( height > ptr->maxTexture2DLinear[1] )
+    {
+        if( returnSuccess && printWarn ) // not printed when the width was already clamped
+        {
+            std::cerr << __FILE__ << ":" << __LINE__
+                      << ": CUDA device " << currentDevice << std::endl
+                      << "    does not support 2D linear textures " << height
+                      << " pixels high." << endl;
+        }
+        height = ptr->maxTexture2DLinear[1];
+        returnSuccess = false;
+    }
+    // false if at least one dimension had to be clamped
+    return returnSuccess;
+}
+
+bool device_prop_t::checkLimit_2DtexArray( int& width, int& height, bool printWarn ) const
+{
+    bool        returnSuccess = true;
+    int         currentDevice;
+    cudaError_t err;
+    // Clamp width/height to what the current device supports for array-backed 2D textures.
+    err = cudaGetDevice( &currentDevice );
+    if( err != cudaSuccess )
+    {
+        POP_CUDA_WARN( err, "Cannot get current CUDA device: " );
+        return true; // nothing can be verified; the request is reported as acceptable
+    }
+    // _properties is filled by the constructor; guard the index without a signed/unsigned comparison.
+    if( currentDevice < 0 || static_cast<size_t>(currentDevice) >= _properties.size() )
+    {
+        POP_WARN( "CUDA device was not registered at program start" );
+        return true;
+    }
+    // maxTexture2D holds the width ([0]) and height ([1]) limits used below.
+    const cudaDeviceProp* ptr = _properties[currentDevice];
+    if( width > ptr->maxTexture2D[0] )
+    {
+        if( printWarn )
+        {
+            std::cerr << __FILE__ << ":" << __LINE__
+                      << ": CUDA device " << currentDevice << std::endl
+                      << "    does not support 2D array textures " << width
+                      << " pixels wide." << endl;
+        }
+        width = ptr->maxTexture2D[0];
+        returnSuccess = false;
+    }
+    if( height > ptr->maxTexture2D[1] )
+    {
+        if( returnSuccess && printWarn ) // not printed when the width was already clamped
+        {
+            std::cerr << __FILE__ << ":" << __LINE__
+                      << ": CUDA device " << currentDevice << std::endl
+                      << "    does not support 2D array textures " << height
+                      << " pixels high." << endl;
+        }
+        height = ptr->maxTexture2D[1];
+        returnSuccess = false;
+    }
+    // false if at least one dimension had to be clamped
+    return returnSuccess;
+}
+
+bool device_prop_t::checkLimit_2DtexLayered( int& width, int& height, int& layers, bool printWarn ) const
+{
+    bool        returnSuccess = true;
+    int         currentDevice;
+    cudaError_t err;
+    // Clamp width/height/layers to what the current device supports for layered 2D textures.
+    err = cudaGetDevice( &currentDevice );
+    if( err != cudaSuccess )
+    {
+        POP_CUDA_WARN( err, "Cannot get current CUDA device: " );
+        return true; // nothing can be verified; the request is reported as acceptable
+    }
+    // _properties is filled by the constructor; guard the index without a signed/unsigned comparison.
+    if( currentDevice < 0 || static_cast<size_t>(currentDevice) >= _properties.size() )
+    {
+        POP_WARN( "CUDA device was not registered at program start" );
+        return true;
+    }
+    // maxTexture2DLayered holds the width ([0]), height ([1]) and layer-count ([2]) limits used below.
+    const cudaDeviceProp* ptr = _properties[currentDevice];
+    if( width > ptr->maxTexture2DLayered[0] )
+    {
+        if( printWarn )
+        {
+            std::cerr << __FILE__ << ":" << __LINE__
+                      << ": CUDA device " << currentDevice << std::endl
+                      << "    does not support layered 2D textures " << width
+                      << " pixels wide." << endl;
+        }
+        width = ptr->maxTexture2DLayered[0];
+        returnSuccess = false;
+    }
+    if( height > ptr->maxTexture2DLayered[1] )
+    {
+        if( returnSuccess && printWarn ) // not printed when the width was already clamped
+        {
+            std::cerr << __FILE__ << ":" << __LINE__
+                      << ": CUDA device " << currentDevice << std::endl
+                      << "    does not support layered 2D textures " << height
+                      << " pixels high." << endl;
+        }
+        height = ptr->maxTexture2DLayered[1];
+        returnSuccess = false;
+    }
+    if( layers > ptr->maxTexture2DLayered[2] )
+    {
+        if( returnSuccess && printWarn ) // not printed when width or height was already clamped
+        {
+            std::cerr << __FILE__ << ":" << __LINE__
+                      << ": CUDA device " << currentDevice << std::endl
+                      << "    does not support layered 2D textures " << layers
+                      << " layers deep." << endl;
+        }
+        layers = ptr->maxTexture2DLayered[2];
+        returnSuccess = false;
+    }
+    // false if at least one dimension had to be clamped
+    return returnSuccess;
+}
+
+bool device_prop_t::checkLimit_2DsurfLayered( int& width, int& height, int& layers, bool printWarn ) const
+{
+    bool        returnSuccess = true;
+    int         currentDevice;
+    cudaError_t err;
+    // Clamp width/height/layers to what the current device supports for layered 2D surfaces.
+    err = cudaGetDevice( &currentDevice );
+    if( err != cudaSuccess )
+    {
+        POP_CUDA_WARN( err, "Cannot get current CUDA device: " );
+        return true; // nothing can be verified; the request is reported as acceptable
+    }
+    // _properties is filled by the constructor; guard the index without a signed/unsigned comparison.
+    if( currentDevice < 0 || static_cast<size_t>(currentDevice) >= _properties.size() )
+    {
+        POP_WARN( "CUDA device was not registered at program start" );
+        return true;
+    }
+    // maxSurface2DLayered holds the width ([0]), height ([1]) and layer-count ([2]) limits used below.
+    const cudaDeviceProp* ptr = _properties[currentDevice];
+    if( width > ptr->maxSurface2DLayered[0] )
+    {
+        if( printWarn )
+        {
+            std::cerr << __FILE__ << ":" << __LINE__
+                      << ": CUDA device " << currentDevice << std::endl
+                      << "    does not support layered 2D surfaces " << width
+                      << " pixels wide." << endl;
+        }
+        width = ptr->maxSurface2DLayered[0];
+        returnSuccess = false;
+    }
+    if( height > ptr->maxSurface2DLayered[1] )
+    {
+        if( returnSuccess && printWarn ) // not printed when the width was already clamped
+        {
+            std::cerr << __FILE__ << ":" << __LINE__
+                      << ": CUDA device " << currentDevice << std::endl
+                      << "    does not support layered 2D surfaces " << height
+                      << " pixels high." << endl;
+        }
+        height = ptr->maxSurface2DLayered[1];
+        returnSuccess = false;
+    }
+    if( layers > ptr->maxSurface2DLayered[2] )
+    {
+        if( returnSuccess && printWarn ) // not printed when width or height was already clamped
+        {
+            std::cerr << __FILE__ << ":" << __LINE__
+                      << ": CUDA device " << currentDevice << std::endl
+                      << "    does not support layered 2D surfaces " << layers
+                      << " layers deep." << endl;
+        }
+        layers = ptr->maxSurface2DLayered[2];
+        returnSuccess = false;
+    }
+    // false if at least one dimension had to be clamped
+    return returnSuccess;
+}
+
 }}
 
diff --git a/src/popsift/common/device_prop.h b/src/popsift/common/device_prop.h
index 04b3870f..8910e9c8 100644
--- a/src/popsift/common/device_prop.h
+++ b/src/popsift/common/device_prop.h
@@ -7,21 +7,112 @@
  */
 #pragma once
 
-#include <vector>
 #include <cuda_runtime.h>
+#include <vector>
+
+namespace popsift {
+namespace cuda {
+
+/** A call to cudaDeviceReset()
+ */
+void reset();
+
+/** A call to cudaDeviceSynchronize()
+ */
+void sync();
 
-namespace popsift { namespace cuda {
 
+/**
+ * @brief A class to recover, query and print the information about the cuda device.
+ */
 class device_prop_t
 {
     int _num_devices;
     std::vector<cudaDeviceProp*> _properties;
+
+public:
+    enum {
+        do_warn = true,
+        dont_warn = false
+    };
+
 public:
     device_prop_t( );
     ~device_prop_t( );
 
+    /**
+     * @brief Print the information about the device.
+     */
     void print( );
+
+    /**
+     * @brief Set the device to use.
+     * @param[in] n The index of the device to use.
+     * @param[in] print_choice Whether to print information about the chosen device.
+     */
     void set( int n, bool print_choice = false );
+
+    /**
+     * @brief Check if a request exceeds the current CUDA device's limit in
+     *  texture2Dlinear dimensions. texture2Dlinear is based on CUDA memory that
+     *  can be accessed directly (i.e. no CudaArray).
+     * @param[in,out] width  Desired width of the texture.
+     * @param[in,out] height Desired height of the texture.
+     * @param[in]     printWarn if true, print warnings to cerr if desired width
+     *                          or height exceeds limits.
+     * @return   \p true if the desired width and height are possible.
+     *           \p false if one or both of the desired width and height are impossible.
+     *           The desired width or height (or both) are replaced by the limit.
+     */
+    bool checkLimit_2DtexLinear( int& width, int& height, bool printWarn ) const;
+
+    /**
+     * @brief Check if a request exceeds the current CUDA device's limit in
+     *  texture2D dimensions. texture2D is based on CUDA Arrays, which have
+     *  invisible layout and can only be filled with cudaMemcpy.
+     * @param[in,out] width  Desired width of the texture.
+     * @param[in,out] height Desired height of the texture.
+     * @param[in]     printWarn if true, print warnings to cerr if desired width
+     *                          or height exceeds limits.
+     * @return   \p true if the desired width and height are possible.
+     *           \p false if one or both of the desired width and height are impossible.
+     *           The desired width or height (or both) are replaced by the limit.
+     */
+    bool checkLimit_2DtexArray( int& width, int& height, bool printWarn ) const;
+
+    /**
+     * @brief Check if a request exceeds the current CUDA device's limit in
+     *  texture2DLayered dimensions. texture2DLayered refers to a 3D structure, where
+     *  interpolation happens only in 2D (within a layer), effectively creating layers.
+     * @param[in,out] width  Desired width of the texture.
+     * @param[in,out] height Desired height of the texture.
+     * @param[in,out] layers Desired depth of the texture.
+     * @param[in]     printWarn if true, print warnings to cerr if desired width,
+     *                          height or depth exceeds limits.
+     * @return   \p true if the desired width, height and depth are possible.
+     *           \p false if at least one of the desired width, height and depth is impossible.
+     *           The desired width, height and layers are replaced by the limit
+     *           if they exceed it.
+     */
+    bool checkLimit_2DtexLayered( int& width, int& height, int& layers,
+                                  bool printWarn ) const;
+
+    /**
+     * @brief Check if a request exceeds the current CUDA device's limit in
+     *  surface2DLayered dimensions. surface2DLayered is the writable equivalent
+     *  to texture2DLayered.
+     * @param[in,out] width  Desired width of the surface.
+     * @param[in,out] height Desired height of the surface.
+     * @param[in,out] layers Desired depth of the surface.
+     * @param[in]     printWarn if true, print warnings to cerr if desired width,
+     *                          height or depth exceeds limits.
+     * @return   \p true if the desired width, height and depth are possible.
+     *           \p false if at least one of the desired width, height and depth is impossible.
+     *           The desired width, height and layers are replaced by the limit
+     *           if they exceed it.
+     */
+    bool checkLimit_2DsurfLayered( int& width, int& height, int& layers,
+                                   bool printWarn ) const;
 };
 
 }}
diff --git a/src/popsift/common/excl_blk_prefix_sum.h b/src/popsift/common/excl_blk_prefix_sum.h
index b8c3b6dc..d77bf7b0 100644
--- a/src/popsift/common/excl_blk_prefix_sum.h
+++ b/src/popsift/common/excl_blk_prefix_sum.h
@@ -7,6 +7,8 @@
  */
 #pragma once
 
+#include "assist.h"
+
 #include <cuda_runtime.h>
 #include <typeinfo>
 
@@ -90,7 +92,7 @@ class Block
 
             // This loop is an exclusive prefix sum for one warp
             for( int s=0; s<5; s++ ) {
-                const int add = __shfl_up( ews+self, 1<<s );
+                const int add = popsift::shuffle_up( ews+self, 1<<s );
                 ews += threadIdx.x < (1<<s) ? 0 : add;
             }
 
@@ -107,7 +109,7 @@ class Block
                 int self = sum[threadIdx.x];
 
                 for( int s=0; s<5; s++ ) {
-                    const int add = __shfl_up( ebs+self, 1<<s );
+                    const int add = popsift::shuffle_up( ebs+self, 1<<s );
                     ebs += threadIdx.x < (1<<s) ? 0 : add;
                 }
 
@@ -130,6 +132,7 @@ class Block
                  */
                 _mapping_writer.set( ebs, self, cell );
             }
+            __syncthreads();
 
             if( threadIdx.y == 0 && threadIdx.x == 31 ) {
                 loop_total += ibs;
@@ -137,12 +140,7 @@ class Block
             __syncthreads();
         }
 
-        // if( threadIdx.y == 0 && threadIdx.x == 31 )
-        if( threadIdx.y == 0 )
-        {
-            loop_total = __shfl( loop_total, 31 );
-            _total_writer.set( loop_total );
-        }
+        _total_writer.set( loop_total );
     }
 };
 
diff --git a/src/popsift/common/plane_2d.cu b/src/popsift/common/plane_2d.cu
index 01b95a5d..89ba7d34 100644
--- a/src/popsift/common/plane_2d.cu
+++ b/src/popsift/common/plane_2d.cu
@@ -5,9 +5,17 @@
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
  */
+
+#include "assist.h"
+#include "debug_macros.h"
+#include "plane_2d.h"
+
+#include <cuda_runtime.h>
+
+#include <cstdlib>
+#include <cstring>
 #include <iostream>
-#include <string.h>
-#include <stdlib.h>
+#include <sstream>
 #ifndef _WIN32
 #include <unistd.h>
 #else
@@ -17,10 +25,7 @@
 #include <malloc.h>
 #endif
 
-#include <cuda_runtime.h>
 
-#include "plane_2d.h"
-#include "debug_macros.h"
 
 using namespace std;
 
@@ -45,18 +50,6 @@ void PlaneBase::freeDev2D( void* data )
     POP_CUDA_FATAL_TEST( err, "Failed to free CUDA memory: " );
 }
 
-__host__
-static long GetPageSize()
-{
-#ifdef _WIN32
-    SYSTEM_INFO si;
-    GetSystemInfo(&si);
-    return si.dwPageSize;
-#else
-    return sysconf(_SC_PAGESIZE);
-#endif
-}
-
 __host__
 void* PlaneBase::allocHost2D( int w, int h, int elemSize, PlaneMapMode m )
 {
@@ -73,20 +66,14 @@ void* PlaneBase::allocHost2D( int w, int h, int elemSize, PlaneMapMode m )
 #else
         const char *buf = strerror(errno);
 #endif
-        cerr << __FILE__ << ":" << __LINE__ << endl
-             << "    Failed to allocate " << sz << " bytes of unaligned host memory." << endl
-             << "    Cause: " << buf << endl;
-        exit( -1 );
-    } else if( m == PageAligned ) {
-        void* ptr;
-
-#ifdef _WIN32
-        ptr = _aligned_malloc(sz, GetPageSize());
-        if (ptr) return ptr;
-#else
-        int retval = posix_memalign( &ptr, GetPageSize(), sz );
-        if( retval == 0 ) return ptr;
-#endif
+        stringstream msg; // named msg (not ss) so it cannot collide with a stream local to the POP_FATAL macro
+        msg << "Failed to allocate " << sz << " bytes of unaligned host memory." << endl
+            << "Cause: " << buf;
+        POP_FATAL(msg.str());
+    } else if(m == PageAligned) {
+        void* ptr = memalign(getPageSize(), sz);
+        if(ptr)
+            return ptr;
 
 #ifdef _GNU_SOURCE
         char b[100];
@@ -107,9 +94,7 @@ void* PlaneBase::allocHost2D( int w, int h, int elemSize, PlaneMapMode m )
         POP_CUDA_FATAL_TEST( err, "Failed to allocate aligned and pinned host memory: " );
         return ptr;
     } else {
-        cerr << __FILE__ << ":" << __LINE__ << endl
-             << "    Alignment not correctly specified in host plane allocation" << endl;
-        exit( -1 );
+        POP_FATAL("Alignment not correctly specified in host plane allocation");
     }
 }
 
@@ -118,20 +103,16 @@ void PlaneBase::freeHost2D( void* data, PlaneMapMode m )
 {
     if (!data)
         return;
-    if (m == CudaAllocated) {
+    else if (m == CudaAllocated) {
         cudaFreeHost(data);
         return;
     }
-    if (m == Unaligned) {
+    else if (m == Unaligned) {
         free(data);
         return;
     }
-    if (m == PageAligned) {
-#ifdef _WIN32
-	_aligned_free(data);
-#else
-	free(data);
-#endif
+    else if (m == PageAligned) {
+        memalign_free( data );
         return;
     }
     assert(!"Invalid PlaneMapMode");
diff --git a/src/popsift/common/plane_2d.h b/src/popsift/common/plane_2d.h
index dd978e42..8202e7a5 100644
--- a/src/popsift/common/plane_2d.h
+++ b/src/popsift/common/plane_2d.h
@@ -6,13 +6,17 @@
  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
  */
 #pragma once
-
-#include <assert.h>
-#include <inttypes.h>
-#include <errno.h>
-#include <stdlib.h>
 #include <cuda_runtime.h>
+
+#include <cassert>
+#include <cerrno>
+#include <cinttypes>
+#include <cstdlib>
 #include <iostream>
+#include <sstream>
+#include <stdexcept>
+
+#include "debug_macros.h"
 
 #define PLANE2D_CUDA_OP_DEBUG
 
@@ -90,7 +94,7 @@ template <typename T> struct PlaneT : public PlaneBase
     T* data;
 
     __host__ __device__ PlaneT( )      : data(0) { }
-    __host__ __device__ PlaneT( T* d ) : data(d) { }
+    __host__ __device__ explicit PlaneT( T* d ) : data(d) { }
 
     __host__ __device__ inline size_t elemSize() const { return elem_size; }
 };
@@ -104,13 +108,11 @@ template <typename T> struct PlaneT : public PlaneBase
 
 template <typename T> struct PitchPlane2D : public PlaneT<T>
 {
-    int step; // this is the pitch width in bytes!!!
-
     __host__ __device__
-    PitchPlane2D( ) : step(0) { }
+    PitchPlane2D( ) : _pitchInBytes(0) { }
 
     __host__ __device__
-    PitchPlane2D( T* d, int s ) : PlaneT<T>(d) , step(s) { }
+    PitchPlane2D( T* d, int s ) : PlaneT<T>(d) , _pitchInBytes(s) { }
 
     /** cuda memcpy from this (plane allocated on host) to
      *  parameter (plane allocated on device) */
@@ -141,16 +143,16 @@ template <typename T> struct PitchPlane2D : public PlaneT<T>
                                        short cols, short rows, cudaStream_t stream );
 
     __host__ __device__ inline const T* ptr( int y ) const {
-        return (const T*)( (const char*)this->data + y * step );
+        return (const T*)( (const char*)this->data + y * _pitchInBytes );
     }
     __host__ __device__ inline       T* ptr( int y )       {
-        return (T*)( (char*)this->data + y * step );
+        return (T*)( (char*)this->data + y * _pitchInBytes );
     }
 
     __host__ inline void allocDev( int w, int h ) {
         size_t pitch;
         this->data = (T*)PlaneBase::allocDev2D( pitch, w, h, this->elemSize() );
-        this->step = pitch;
+        this->_pitchInBytes = pitch;
     }
 
     __host__ inline void freeDev( ) {
@@ -161,14 +163,17 @@ template <typename T> struct PitchPlane2D : public PlaneT<T>
 
     __host__ inline void allocHost( int w, int h, PlaneMapMode mode ) {
         this->data = (T*)PlaneBase::allocHost2D( w, h, this->elemSize(), mode );
-        this->step = w * this->elemSize();
+        this->_pitchInBytes = w * this->elemSize();
     }
 
     __host__ inline void freeHost( PlaneMapMode mode ) {
         PlaneBase::freeHost2D( this->data, mode );
     }
     __host__ __device__
-    inline short getPitch( ) const { return step; }
+    inline size_t getPitchInBytes( ) const { return _pitchInBytes; }
+
+protected:
+    size_t _pitchInBytes; // pitch width in bytes
 };
 
 /*************************************************************
@@ -181,8 +186,8 @@ template <typename T>
 __host__
 inline void PitchPlane2D<T>::memcpyToDevice( PitchPlane2D<T>& devPlane, short cols, short rows )
 {
-    PlaneBase::memcpyToDevice( devPlane.data, devPlane.step,
-                               this->data, this->step,
+    PlaneBase::memcpyToDevice( devPlane.data, devPlane._pitchInBytes,
+                               this->data, this->_pitchInBytes,
                                cols, rows,
                                sizeof(T) );
 }
@@ -191,8 +196,8 @@ template <typename T>
 __host__
 inline void PitchPlane2D<T>::memcpyToDevice( PitchPlane2D<T>& devPlane, short cols, short rows, cudaStream_t stream )
 {
-    PlaneBase::memcpyToDevice( devPlane.data, devPlane.step,
-                               this->data, this->step,
+    PlaneBase::memcpyToDevice( devPlane.data, devPlane._pitchInBytes,
+                               this->data, this->_pitchInBytes,
                                cols, rows,
                                sizeof(T),
                                stream );
@@ -216,8 +221,8 @@ template <typename T>
 __host__
 inline void PitchPlane2D<T>::memcpyFromDevice( PitchPlane2D<T>& devPlane, short cols, short rows )
 {
-    PlaneBase::memcpyToHost( this->data, this->step,
-                             devPlane.data, devPlane.step,
+    PlaneBase::memcpyToHost( this->data, this->_pitchInBytes,
+                             devPlane.data, devPlane._pitchInBytes,
                              cols, rows,
                              sizeof(T) );
 }
@@ -226,8 +231,8 @@ template <typename T>
 __host__
 inline void PitchPlane2D<T>::memcpyFromDevice( PitchPlane2D<T>& devPlane, short cols, short rows, cudaStream_t stream )
 {
-    PlaneBase::memcpyToHost( this->data, this->step,
-                             devPlane.data, devPlane.step,
+    PlaneBase::memcpyToHost( this->data, this->_pitchInBytes,
+                             devPlane.data, devPlane._pitchInBytes,
                              cols, rows,
                              sizeof(T),
                              stream );
@@ -275,7 +280,7 @@ template <typename T> class Plane2D : public PitchPlane2D<T>
     template <typename U>
     __host__ __device__
     explicit Plane2D( const Plane2D<U>& orig )
-        : PitchPlane2D<T>( (T*)orig.data, orig.step )
+        : PitchPlane2D<T>( (T*)orig.data, orig.getPitchInBytes() ) // public accessor: protected _pitchInBytes of Plane2D<U> is inaccessible when U != T
         , _rows( orig.getRows() )
     {
         // careful computation: cols is a short
@@ -285,10 +290,18 @@ template <typename T> class Plane2D : public PitchPlane2D<T>
     }
 
     /** Overwrite the width and height information. Useful if smaller
-     *  planes should be loaded into larger preallocated planes
+     *  planes should be loaded into larger preallocated host planes
+     *  without actually allocating again, but dangerous.
+     *  @warning: pitch is updated (host side)
+     */
+    __host__ void resetDimensionsHost( int w, int h );
+
+    /** Overwrite the width and height information. Useful if smaller
+     *  planes should be loaded into larger preallocated device planes
      *  without actually allocating again, but dangerous.
+     *  @warning: pitch is not updated (device side)
      */
-    __host__ void resetDimensions( int w, int h );
+    __host__ void resetDimensionsDev( int w, int h );
 
     /** cuda memcpy from this (plane allocated on host) to
      *  parameter (plane allocated on device) */
@@ -327,7 +340,7 @@ template <typename T> class Plane2D : public PitchPlane2D<T>
     __host__ __device__
     inline short getHeight( ) const { return _rows; }
     __host__ __device__
-    inline short getByteSize( ) const { return this->step*_rows; }
+    inline size_t getByteSize( ) const { return this->_pitchInBytes * _rows; }
 
     __host__ inline void allocDev( int w, int h ) {
         _cols = w;
@@ -350,16 +363,29 @@ template <typename T> class Plane2D : public PitchPlane2D<T>
 
 template <typename T>
 __host__
-void Plane2D<T>::resetDimensions( int w, int h )
+void Plane2D<T>::resetDimensionsHost( int w, int h )
+{
+    this->_cols = w;
+    this->_rows = h;
+    // on the host side, memory is contiguous (no padding) => pitch must be updated to match data
+    this->_pitchInBytes  = w * this->elemSize();
+}
+
+template <typename T>
+__host__
+void Plane2D<T>::resetDimensionsDev( int w, int h )
 {
-    if( w*sizeof(T) > this->getPitch() ) {
-        std::cerr << __FILE__ << ":" << __LINE__ << std::endl
-                  << "    Error: trying to reinterpret plane width to " << w << " units a " << sizeof(T) << " bytes, "
-                     "only " << this->getPitch() << " bytes allocated" << std::endl;
-        exit( -1 );
+    // validate pitch
+    if( w * this->elemSize() > this->getPitchInBytes() ) {
+        std::stringstream err; 
+        err << __FILE__ << ":" << __LINE__ << std::endl
+        << " Error: trying to reinterpret plane width to " << w << " units a " << sizeof(T) << " bytes, "
+        << "only " << this->getPitchInBytes() << " bytes allocated";
+        throw std::runtime_error(err.str());
     }
     this->_cols = w;
     this->_rows = h;
+    // on the device side, memory is NOT contiguous (CUDA may add padding) => pitch can not be changed without reallocation
 }
 
 template <typename T>
@@ -383,14 +409,16 @@ __host__
 inline void Plane2D<T>::memcpyToDevice( Plane2D<T>& devPlane, cudaStream_t stream )
 {
     if( devPlane._cols != this->_cols ) {
-        std::cerr << __FILE__ << ":" << __LINE__ << std::endl
-                  << "    Error: source columns (" << this->_cols << ") and dest columns (" << devPlane._cols << ") must be identical" << std::endl;
-        exit( -1 );
+        std::stringstream ss;
+        ss << "Error: source columns (" << this->_cols << ") and dest columns (" << devPlane._cols
+           << ") must be identical";
+        POP_FATAL(ss.str());
     }
     if( devPlane._rows != this->_rows ) {
-        std::cerr << __FILE__ << ":" << __LINE__ << std::endl
-                  << "    Error: source rows (" << this->_rows << ") and dest rows (" << devPlane._rows << ") must be identical" << std::endl;
-        exit( -1 );
+        std::stringstream ss;
+        ss << "Error: source rows (" << this->_rows << ") and dest rows (" << devPlane._rows
+           << ") must be identical";
+        POP_FATAL(ss.str());
     }
     PitchPlane2D<T>::memcpyToDevice( devPlane, this->_cols, this->_rows, stream );
 }
diff --git a/src/popsift/common/sync_queue.h b/src/popsift/common/sync_queue.h
new file mode 100644
index 00000000..d8dfcbdf
--- /dev/null
+++ b/src/popsift/common/sync_queue.h
@@ -0,0 +1,74 @@
+#pragma once
+
+#include <condition_variable>
+#include <mutex>
+#include <queue>
+#include <utility>
+
+namespace popsift {
+
+/**
+ * @brief A thread safe wrapper around std::queue (replaces boost::sync_queue).
+ *
+ * Every access to the underlying queue is serialized by an internal mutex;
+ * consumers blocked in pull() are woken through a condition variable.
+ * @tparam T the value type that's stored in the queue.
+ */
+template<typename T>
+class SyncQueue {
+public:
+  SyncQueue() = default;
+
+  /**
+   * @brief Push a copy of an item onto the queue and signal it's available.
+   * @param[in] value the item to add to the queue.
+   */
+  void push(const T& value) {
+    std::unique_lock<std::mutex> lock(mtx_);
+    items_.push(value);
+    // release the lock before notifying so the woken consumer can take it at once
+    lock.unlock();
+    signal_.notify_one();
+  }
+
+  /**
+   * @brief Move an item onto the queue and signal it's available.
+   * @param[in] value the item to move into the queue.
+   */
+  void push(T&& value) {
+    std::unique_lock<std::mutex> lock(mtx_);
+    items_.push(std::move(value));
+    lock.unlock();
+    signal_.notify_one();
+  }
+
+  /**
+   * @brief Check if the queue is empty - thread safety via mutex.
+   * @note The answer may be stale as soon as the call returns if other threads push or pull concurrently.
+   * @return True if the queue is empty.
+   */
+  bool empty() const {
+    std::lock_guard<std::mutex> lock(mtx_);
+    return items_.empty();
+  }
+
+  /**
+   * @brief Pull an item off the queue, or, wait until one arrives. Blocking.
+   * @return The front item that was popped off the queue.
+   */
+  T pull() {
+    std::unique_lock<std::mutex> lock(mtx_);
+    // the predicate form of wait() re-checks the condition after spurious wakeups
+    signal_.wait(lock, [this] { return !items_.empty(); });
+    T ans = std::move(items_.front());
+    items_.pop();
+    return ans;
+  }
+
+private:
+  mutable std::mutex mtx_;  // mutable: locked by the const empty()
+  std::queue<T> items_;
+  std::condition_variable signal_;
+};
+
+}  // namespace popsift
\ No newline at end of file
diff --git a/src/popsift/common/vec_macros.h b/src/popsift/common/vec_macros.h
index 4b476f2c..ee7741ed 100644
--- a/src/popsift/common/vec_macros.h
+++ b/src/popsift/common/vec_macros.h
@@ -7,7 +7,7 @@
  */
 #pragma once
 
-#include <math.h>
+#include <cmath>
 
 __device__ static inline
 float2 operator+( float2 l, const float2& r )
diff --git a/src/popsift/common/warp_bitonic_sort.h b/src/popsift/common/warp_bitonic_sort.h
index feb6fbd3..26ea5232 100644
--- a/src/popsift/common/warp_bitonic_sort.h
+++ b/src/popsift/common/warp_bitonic_sort.h
@@ -7,8 +7,9 @@
  */
 #pragma once
 
+#include "assist.h"
+
 #include <cuda_runtime.h>
-#include <iso646.h>
 
 namespace popsift {
 namespace BitonicSort {
@@ -57,15 +58,15 @@ class Warp32
     int shiftit( const int my_index, const int shift, const int direction, const bool increasing )
     {
         const T    my_val      = _array[my_index];
-        const T    other_val   = __shfl_xor( my_val, 1 << shift );
+        const T    other_val   = popsift::shuffle_xor( my_val, 1 << shift );
         const bool reverse     = ( threadIdx.x & ( 1 << direction ) );
         const bool id_less     = ( ( threadIdx.x & ( 1 << shift ) ) == 0 );
         const bool my_more     = id_less ? ( my_val > other_val )
                                          : ( my_val < other_val );
-        const bool must_swap   = not ( my_more ^ reverse ^ increasing );
+        const bool must_swap   = ! ( my_more ^ reverse ^ increasing );
 
-        return ( must_swap ? __shfl_xor( my_index, 1 << shift )
-                           : my_index );
+        int lane = must_swap ? ( 1 << shift ) : 0;
+        return popsift::shuffle_xor( my_index, lane );
     }
 
     __device__ inline
diff --git a/src/popsift/common/write_plane_2d.cu b/src/popsift/common/write_plane_2d.cu
index 6c9af8ac..68be7c4f 100755
--- a/src/popsift/common/write_plane_2d.cu
+++ b/src/popsift/common/write_plane_2d.cu
@@ -7,9 +7,9 @@
  */
 #include "write_plane_2d.h"
 
-#include <iostream>
-#include <iomanip>
 #include <fstream>
+#include <iomanip>
+#include <iostream>
 #include <limits>
 
 using namespace std;
diff --git a/src/popsift/features.cu b/src/popsift/features.cu
index 25fd1f57..5aa706a1 100755
--- a/src/popsift/features.cu
+++ b/src/popsift/features.cu
@@ -5,19 +5,18 @@
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
  */
-#include <iomanip>
-#include <iostream>
-#include <unistd.h>
-#ifndef __APPLE__
-#include <malloc.h>
-#endif
-#include <stdlib.h>
-#include <errno.h>
-#include <math_constants.h>
-
+#include "common/assist.h"
+#include "common/debug_macros.h"
 #include "features.h"
 #include "sift_extremum.h"
-#include "common/debug_macros.h"
+
+#include <math_constants.h>
+
+#include <cerrno>
+#include <cstdlib>
+#include <iomanip>
+#include <iostream>
+#include <sstream>
 
 using namespace std;
 
@@ -32,64 +31,52 @@ FeaturesBase::FeaturesBase( )
     , _num_ori( 0 )
 { }
 
-FeaturesBase::~FeaturesBase( )
-{ }
+FeaturesBase::~FeaturesBase( ) = default;
 
 /*************************************************************
  * FeaturesHost
  *************************************************************/
 
 FeaturesHost::FeaturesHost( )
-    : _ext( 0 )
-    , _ori( 0 )
+    : _ext( nullptr )
+    , _ori( nullptr )
 { }
 
 FeaturesHost::FeaturesHost( int num_ext, int num_ori )
-    : _ext( 0 )
-    , _ori( 0 )
+    : _ext( nullptr )
+    , _ori( nullptr )
 {
     reset( num_ext, num_ori );
 }
 
 FeaturesHost::~FeaturesHost( )
 {
-    free( _ext );
-    free( _ori );
-}
-
-#ifdef __APPLE__
-static void* memalign( size_t alignment, size_t size )
-{
-    void* ret;
-    int err = posix_memalign( &ret, alignment, size );
-    if( err != 0 ) {
-        errno = err;
-        ret = 0;
-    }
-    return ret;
+    memalign_free( _ext );
+    memalign_free( _ori );
 }
-#endif
 
 void FeaturesHost::reset( int num_ext, int num_ori )
 {
-    if( _ext != 0 ) { free( _ext ); _ext = 0; }
-    if( _ori != 0 ) { free( _ori ); _ori = 0; }
-
-    _ext = (Feature*)memalign( sysconf(_SC_PAGESIZE), num_ext * sizeof(Feature) );
-    if( _ext == 0 ) {
-        cerr << __FILE__ << ":" << __LINE__ << " Runtime error:" << endl
-             << "    Failed to (re)allocate memory for downloading " << num_ext << " features" << endl;
-        if( errno == EINVAL ) cerr << "    Alignment is not a power of two." << endl;
-        if( errno == ENOMEM ) cerr << "    Not enough memory." << endl;
-        exit( -1 );
+    if( _ext != nullptr ) { memalign_free( _ext ); _ext = nullptr; } // allocated by memalign(): release with memalign_free(), not free()
+    if( _ori != nullptr ) { memalign_free( _ori ); _ori = nullptr; } // same pairing as in ~FeaturesHost()
+
+    _ext = (Feature*)memalign( getPageSize(), num_ext * sizeof(Feature) );
+    if( _ext == nullptr ) {
+        std::stringstream ss;
+        ss << "Runtime error:" << endl
+           << "    Failed to (re)allocate memory for downloading " << num_ext << " features" << endl;
+        if(errno == EINVAL) ss << "    Alignment is not a power of two.";
+        if(errno == ENOMEM) ss << "    Not enough memory.";
+        POP_FATAL(ss.str());
     }
-    _ori = (Descriptor*)memalign( sysconf(_SC_PAGESIZE), num_ori * sizeof(Descriptor) );
-    if( _ori == 0 ) {
-        cerr << __FILE__ << ":" << __LINE__ << " Runtime error:" << endl
-             << "    Failed to (re)allocate memory for downloading " << num_ori << " descriptors" << endl;
-        if( errno == EINVAL ) cerr << "    Alignment is not a power of two." << endl;
-        if( errno == ENOMEM ) cerr << "    Not enough memory." << endl;
-        exit( -1 );
+    _ori = (Descriptor*)memalign( getPageSize(), num_ori * sizeof(Descriptor) );
+    if(_ori == nullptr) {
+        std::stringstream ss;
+        ss << "Runtime error:" << endl
+           << "    Failed to (re)allocate memory for downloading " << num_ori << " descriptors" << endl;
+        if(errno == EINVAL) ss << "    Alignment is not a power of two.";
+        if(errno == ENOMEM) ss << "    Not enough memory.";
+        POP_FATAL(ss.str());
     }
 
     setFeatureCount( num_ext );
@@ -103,12 +90,16 @@ void FeaturesHost::pin( )
     if( err != cudaSuccess ) {
         cerr << __FILE__ << ":" << __LINE__ << " Runtime warning:" << endl
              << "    Failed to register feature memory in CUDA." << endl
+             << "    Features count: " << getFeatureCount() << endl
+             << "    Memory size requested: " << getFeatureCount() * sizeof(Feature) << endl
              << "    " << cudaGetErrorString(err) << endl;
     }
     err = cudaHostRegister( _ori, getDescriptorCount() * sizeof(Descriptor), 0 );
     if( err != cudaSuccess ) {
         cerr << __FILE__ << ":" << __LINE__ << " Runtime warning:" << endl
              << "    Failed to register descriptor memory in CUDA." << endl
+             << "    Descriptors count: " << getDescriptorCount() << endl
+             << "    Memory size requested: " << getDescriptorCount() * sizeof(Descriptor) << endl
              << "    " << cudaGetErrorString(err) << endl;
     }
 }
@@ -137,15 +128,15 @@ std::ostream& operator<<( std::ostream& ostr, const FeaturesHost& feature )
  *************************************************************/
 
 FeaturesDev::FeaturesDev( )
-    : _ext( 0 )
-    , _ori( 0 )
-    , _rev( 0 )
+    : _ext( nullptr )
+    , _ori( nullptr )
+    , _rev( nullptr )
 { }
 
 FeaturesDev::FeaturesDev( int num_ext, int num_ori )
-    : _ext( 0 )
-    , _ori( 0 )
-    , _rev( 0 )
+    : _ext( nullptr )
+    , _ori( nullptr )
+    , _rev( nullptr )
 {
     reset( num_ext, num_ori );
 }
@@ -159,9 +150,9 @@ FeaturesDev::~FeaturesDev( )
 
 void FeaturesDev::reset( int num_ext, int num_ori )
 {
-    if( _ext != 0 ) { cudaFree( _ext ); _ext = 0; }
-    if( _ori != 0 ) { cudaFree( _ori ); _ori = 0; }
-    if( _rev != 0 ) { cudaFree( _rev ); _rev = 0; }
+    if( _ext != nullptr ) { cudaFree( _ext ); _ext = nullptr; }
+    if( _ori != nullptr ) { cudaFree( _ori ); _ori = nullptr; }
+    if( _rev != nullptr ) { cudaFree( _rev ); _rev = nullptr; }
 
     _ext = popsift::cuda::malloc_devT<Feature>   ( num_ext, __FILE__, __LINE__ );
     _ori = popsift::cuda::malloc_devT<Descriptor>( num_ori, __FILE__, __LINE__ );
@@ -184,11 +175,11 @@ l2_in_t0( const float4* lptr, const float4* rptr )
 	        + mval.y * mval.y
 	        + mval.z * mval.z
 	        + mval.w * mval.w;
-    res += __shfl_down( res, 16 );
-    res += __shfl_down( res,  8 );
-    res += __shfl_down( res,  4 );
-    res += __shfl_down( res,  2 );
-    res += __shfl_down( res,  1 );
+    res += shuffle_down( res, 16 );
+    res += shuffle_down( res,  8 );
+    res += shuffle_down( res,  4 );
+    res += shuffle_down( res,  2 );
+    res += shuffle_down( res,  1 );
     return res;
 }
 
diff --git a/src/popsift/features.h b/src/popsift/features.h
index 21ad83a2..3b16f954 100755
--- a/src/popsift/features.h
+++ b/src/popsift/features.h
@@ -7,16 +7,17 @@
  */
 #pragma once
 
+#include "sift_constants.h"
+
 #include <iostream>
 #include <vector>
 
-#include "sift_constants.h"
-
 namespace popsift {
 
 struct Descriptor; // float features[128];
 
-/* This is a data structure that is returned to a calling program.
+/**
+ * @brief This is a data structure that is returned to a calling program.
  * The xpos/ypos information in feature is scale-adapted.
  */
 struct Feature
@@ -24,9 +25,11 @@ struct Feature
     int         debug_octave;
     float       xpos;
     float       ypos;
-    float       sigma;   // scale;
-    int         num_ori; // number of this extremum's orientations
-                         // remaining entries in desc are 0
+    /// scale
+    float       sigma;
+    /// number of this extremum's orientations
+    /// remaining entries in desc are 0
+    int         num_ori;
     float       orientation[ORIENTATION_MAX_COUNT];
     Descriptor* desc[ORIENTATION_MAX_COUNT];
 
@@ -52,7 +55,8 @@ class FeaturesBase
     inline void    setDescriptorCount( int num_ori ) { _num_ori = num_ori; }
 };
 
-/* This is a data structure that is returned to a calling program.
+/**
+ * @brief This is a data structure that is returned to a calling program.
  * _ori is a transparent flat memory holding descriptors
  * that are referenced by the extrema.
  *
@@ -70,7 +74,7 @@ class FeaturesHost : public FeaturesBase
 public:
     FeaturesHost( );
     FeaturesHost( int num_ext, int num_ori );
-    virtual ~FeaturesHost( );
+    ~FeaturesHost( ) override;
 
     typedef Feature*       F_iterator;
     typedef const Feature* F_const_iterator;
@@ -93,7 +97,7 @@ class FeaturesHost : public FeaturesBase
     friend class Pyramid;
 };
 
-typedef FeaturesHost Features;
+using Features = FeaturesHost;
 
 std::ostream& operator<<( std::ostream& ostr, const FeaturesHost& feature );
 
@@ -106,7 +110,7 @@ class FeaturesDev : public FeaturesBase
 public:
     FeaturesDev( );
     FeaturesDev( int num_ext, int num_ori );
-    virtual ~FeaturesDev( );
+    ~FeaturesDev( ) override;
 
     void reset( int num_ext, int num_ori );
 
diff --git a/src/popsift/gauss_filter.cu b/src/popsift/gauss_filter.cu
index 436bf455..7c425f7f 100755
--- a/src/popsift/gauss_filter.cu
+++ b/src/popsift/gauss_filter.cu
@@ -5,11 +5,11 @@
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
  */
-#include <stdio.h>
-#include <algorithm>
-
-#include "gauss_filter.h"
 #include "common/debug_macros.h"
+#include "gauss_filter.h"
+
+#include <algorithm>
+#include <cstdio>
 
 using namespace std;
 
@@ -18,7 +18,7 @@ namespace popsift {
 __device__ __constant__
 GaussInfo d_gauss;
 
-__align__(128) GaussInfo h_gauss;
+__align__(128) thread_local GaussInfo h_gauss;
 
 
 __global__
@@ -130,17 +130,17 @@ void init_filter( const Config& conf,
 {
     if( sigma0 > 2.0 )
     {
-        cerr << __FILE__ << ":" << __LINE__ << ", ERROR: "
-             << " Sigma > 2.0 is not supported. Re-size __constant__ array and recompile."
-             << endl;
-        exit( -__LINE__ );
+        stringstream ss;
+        ss << "ERROR: "
+           << " Sigma > 2.0 is not supported. Re-size __constant__ array and recompile.";
+        POP_FATAL(ss.str());
     }
     if( levels > GAUSS_LEVELS )
     {
-        cerr << __FILE__ << ":" << __LINE__ << ", ERROR: "
-             << " More than " << GAUSS_LEVELS << " levels not supported. Re-size __constant__ array and recompile."
-             << endl;
-        exit( -__LINE__ );
+        stringstream ss;
+        ss << "ERROR: "
+           << " More than " << GAUSS_LEVELS << " levels not supported. Re-size __constant__ array and recompile.";
+        POP_FATAL(ss.str());
     }
 
     if( conf.ifPrintGaussTables() ) {
@@ -291,10 +291,9 @@ int GaussInfo::getSpan( float sigma ) const
     case Config::Fixed15 :
         return 8;
     default :
-        cerr << __FILE__ << ":" << __LINE__ << ", ERROR: "
-             << " The mode for computing Gauss filter scan is invalid"
-             << endl;
-        exit( -__LINE__ );
+        stringstream ss;
+        ss << "ERROR: The mode for computing Gauss filter scan is invalid";
+        POP_FATAL(ss.str());
     }
 }
 
@@ -377,7 +376,7 @@ void GaussTable<LEVELS>::transformBlurTable( )
 {
     for( int level=0; level<LEVELS; level++ ) {
         i_span[level] = span[level];
-        if( not ( i_span[level] & 1 ) ) {
+        if( ! ( i_span[level] & 1 ) ) {
             i_span[level] += 1;
         }
     }
diff --git a/src/popsift/gauss_filter.h b/src/popsift/gauss_filter.h
index 7f32e026..db1a8c25 100755
--- a/src/popsift/gauss_filter.h
+++ b/src/popsift/gauss_filter.h
@@ -7,8 +7,8 @@
  */
 #pragma once
 
-#include "sift_constants.h"
 #include "sift_conf.h"
+#include "sift_constants.h"
 
 namespace popsift {
 
@@ -105,7 +105,7 @@ struct GaussInfo
 };
 
 extern __device__ __constant__ GaussInfo d_gauss;
-extern                         GaussInfo h_gauss;
+extern thread_local            GaussInfo h_gauss;
 
 /* init_filter must be called early to initialize the Gauss tables.
  */
diff --git a/src/popsift/popsift.cu b/src/popsift/popsift.cu
index fdb95d51..09575772 100755
--- a/src/popsift/popsift.cu
+++ b/src/popsift/popsift.cu
@@ -5,23 +5,30 @@
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
  */
-#include <fstream>
-#include <pthread.h> // for pthread_self
-
-#include "sift_constants.h"
+#include <cmath>
+#include <cstring>
 #include "popsift.h"
+
 #include "gauss_filter.h"
-#include "common/write_plane_2d.h"
+#include "sift_config.h"
 #include "sift_pyramid.h"
-#include "sift_extremum.h"
-#include "common/assist.h"
-#include "features.h"
+#include "common/debug_macros.h"
+
+#include <cmath>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <stdexcept>
 
 using namespace std;
 
-PopSift::PopSift( const popsift::Config& config, popsift::Config::ProcessingMode mode, ImageMode imode )
+PopSift::PopSift( const popsift::Config& config, popsift::Config::ProcessingMode mode, ImageMode imode, int device )
     : _image_mode( imode )
+    , _device(device)
 {
+    cudaSetDevice(_device);
+    configure(config);
+
     if( imode == ByteImages )
     {
         _pipe._unused.push( new popsift::Image);
@@ -32,20 +39,20 @@ PopSift::PopSift( const popsift::Config& config, popsift::Config::ProcessingMode
         _pipe._unused.push( new popsift::ImageFloat );
         _pipe._unused.push( new popsift::ImageFloat );
     }
-    _pipe._pyramid    = 0;
-
-    configure( config, true );
 
-    _pipe._thread_stage1 = new boost::thread( &PopSift::uploadImages, this );
+    _pipe._thread_stage1.reset( new std::thread( &PopSift::uploadImages, this ));
     if( mode == popsift::Config::ExtractingMode )
-        _pipe._thread_stage2 = new boost::thread( &PopSift::extractDownloadLoop, this );
+        _pipe._thread_stage2.reset( new std::thread( &PopSift::extractDownloadLoop, this ));
     else
-        _pipe._thread_stage2 = new boost::thread( &PopSift::matchPrepareLoop, this );
+        _pipe._thread_stage2.reset( new std::thread( &PopSift::matchPrepareLoop, this ));
 }
 
-PopSift::PopSift( ImageMode imode )
+PopSift::PopSift( ImageMode imode, int device )
     : _image_mode( imode )
+    , _device(device)
 {
+    cudaSetDevice(_device);
+
     if( imode == ByteImages )
     {
         _pipe._unused.push( new popsift::Image);
@@ -56,26 +63,33 @@ PopSift::PopSift( ImageMode imode )
         _pipe._unused.push( new popsift::ImageFloat );
         _pipe._unused.push( new popsift::ImageFloat );
     }
-    _pipe._pyramid    = 0;
 
-    _pipe._thread_stage1 = new boost::thread( &PopSift::uploadImages, this );
-    _pipe._thread_stage2 = new boost::thread( &PopSift::extractDownloadLoop, this );
+    _pipe._thread_stage1.reset( new std::thread( &PopSift::uploadImages, this ));
+    _pipe._thread_stage2.reset( new std::thread( &PopSift::extractDownloadLoop, this ));
 }
 
 PopSift::~PopSift()
 {
+    if(_isInit)
+    {
+        uninit();
+    }
 }
 
-bool PopSift::configure( const popsift::Config& config, bool force )
+bool PopSift::configure( const popsift::Config& config, bool /*force*/ )
 {
-    if( _pipe._pyramid != 0 ) {
+    if( _pipe._pyramid != nullptr ) {
         return false;
     }
 
     _config = config;
-
     _config.levels = max( 2, config.levels );
 
+    return true;
+}
+
+bool PopSift::applyConfiguration(bool force)
+{
     if( force || ( _config  != _shadow_config ) )
     {
         popsift::init_filter( _config,
@@ -92,10 +106,8 @@ bool PopSift::configure( const popsift::Config& config, bool force )
     return true;
 }
 
-bool PopSift::private_init( int w, int h )
+void PopSift::private_apply_scale_factor( int& w, int& h )
 {
-    Pipe& p = _pipe;
-
     /* up=-1 -> scale factor=2
      * up= 0 -> scale factor=1
      * up= 1 -> scale factor=0.5
@@ -103,55 +115,149 @@ bool PopSift::private_init( int w, int h )
     float upscaleFactor = _config.getUpscaleFactor();
     float scaleFactor = 1.0f / powf( 2.0f, -upscaleFactor );
 
-    if( p._pyramid != 0 ) {
-        p._pyramid->resetDimensions( _config,
-                                     ceilf( w * scaleFactor ),
-                                     ceilf( h * scaleFactor ) );
-        return true;
-    }
-
     if( _config.octaves < 0 ) {
-        int oct = _config.octaves;
-        oct = max(int (floor( logf( (float)min( w, h ) )
+        int oct = max(int (floor( logf( (float)min( w, h ) )
                             / logf( 2.0f ) ) - 3.0f + scaleFactor ), 1);
         _config.octaves = oct;
     }
 
-    p._pyramid = new popsift::Pyramid( _config,
-                                       ceilf( w * scaleFactor ),
-                                       ceilf( h * scaleFactor ) );
+    w = ceilf( w * scaleFactor );
+    h = ceilf( h * scaleFactor );
+}
+
+bool PopSift::private_init( int w, int h )
+{
+    Pipe& p = _pipe;
+
+    private_apply_scale_factor( w, h );
+
+    if( p._pyramid != nullptr ) {
+        p._pyramid->resetDimensions( _config, w, h );
+        return true;
+    }
+
+    p._pyramid = new popsift::Pyramid( _config, w, h );
 
     cudaDeviceSynchronize();
 
     return true;
 }
 
+bool PopSift::private_uninit()
+{
+    Pipe& p = _pipe;
+
+    delete p._pyramid;
+    p._pyramid = nullptr;
+
+    return true;
+}
+
 void PopSift::uninit( )
 {
-    _pipe._queue_stage1.push( 0 );
-    _pipe._thread_stage2->join();
-    _pipe._thread_stage1->join();
-    delete _pipe._thread_stage2;
-    delete _pipe._thread_stage1;
+    if(!_isInit)
+    {
+        std::cerr << "[warning] Attempt to release resources from an uninitialized instance" << std::endl;
+        return;
+    }
+    _pipe.uninit();
 
-    while( !_pipe._unused.empty() ) {
-        popsift::ImageBase* img = _pipe._unused.pull();
-        delete img;
+    _isInit = false;
+}
+
+PopSift::AllocTest PopSift::testTextureFit( int width, int height )
+{
+    const bool warn = popsift::cuda::device_prop_t::dont_warn;
+    bool retval = _device_properties.checkLimit_2DtexLinear( width,
+                                                        height,
+                                                        warn );
+    if( !retval )
+    {
+        return AllocTest::ImageExceedsLinearTextureLimit;
     }
 
-    delete _pipe._pyramid;
-    _pipe._pyramid    = 0;
+
+    /* Scale the width and height - we need that size for the largest
+     * octave. */
+    private_apply_scale_factor( width, height );
+
+    /* _config.level does not contain the 3 blur levels beyond the first
+     * that is required for downscaling to the following octave.
+     * We need all layers to check if we can support enough layers.
+     */
+    int depth = _config.levels + 3;
+
+    retval = _device_properties.checkLimit_2DsurfLayered( width,
+                                                          height,
+                                                          depth,
+                                                          warn );
+
+    return (retval ? AllocTest::Ok : AllocTest::ImageExceedsLayeredSurfaceLimit);
 }
 
+std::string PopSift::testTextureFitErrorString( AllocTest err, int width, int height )
+{
+    ostringstream ostr;
+
+    switch( err )
+    {
+        case AllocTest::Ok :
+            ostr << "?    No error." << endl;
+            break;
+        case AllocTest::ImageExceedsLinearTextureLimit :
+            _device_properties.checkLimit_2DtexLinear( width, height, false );
+            ostr << "E    Cannot load unscaled image. " << endl
+                 << "E    It exceeds the max CUDA linear texture size. " << endl
+                 << "E    Max is (" << width << "," << height << ")" << endl;
+            break;
+        case AllocTest::ImageExceedsLayeredSurfaceLimit :
+            {
+                const float upscaleFactor = _config.getUpscaleFactor();
+                const float scaleFactor = 1.0f / powf( 2.0f, -upscaleFactor );
+                int w = ceilf( width  * scaleFactor );
+                int h = ceilf( height * scaleFactor );
+                int d = _config.levels + 3;
+
+                _device_properties.checkLimit_2DsurfLayered( w, h, d, false );
+
+                w = w / scaleFactor;
+                h = h / scaleFactor;
+                ostr << "E    Cannot use"
+                     << (upscaleFactor==1 ? " default " : " ")
+                     << "downscaling factor " << -upscaleFactor
+                     << " (i.e. upscaling by " << pow(2,upscaleFactor) << "). "
+                     << endl
+                     << "E    It exceeds the max CUDA layered surface size. " << endl
+                     << "E    Change downscaling to fit into (" << w << "," << h
+                     << ") with " << (d-3) << " levels per octave." << endl;
+            }
+            break;
+        default:
+            ostr << "E    Programming error, please report." << endl;
+            break;
+    }
+    return ostr.str();
+}
+
+
 SiftJob* PopSift::enqueue( int                  w,
                            int                  h,
                            const unsigned char* imageData )
 {
     if( _image_mode != ByteImages )
     {
-        cerr << __FILE__ << ":" << __LINE__ << " Image mode error" << endl
-             << "E    Cannot load byte images into a PopSift pipeline configured for float images" << endl;
-        exit( -1 );
+        stringstream ss;
+        ss << "Image mode error" << endl
+           << "E    Cannot load byte images into a PopSift pipeline configured for float images";
+        POP_FATAL(ss.str());
+    }
+
+    AllocTest a = testTextureFit( w, h );
+    if( a != AllocTest::Ok )
+    {
+        cerr << __FILE__ << ":" << __LINE__ << " Image too large" << endl
+             << testTextureFitErrorString( a,w,h );
+        return nullptr;
     }
 
     SiftJob* job = new SiftJob( w, h, imageData );
@@ -165,9 +271,18 @@ SiftJob* PopSift::enqueue( int          w,
 {
     if( _image_mode != FloatImages )
     {
-        cerr << __FILE__ << ":" << __LINE__ << " Image mode error" << endl
-             << "E    Cannot load float images into a PopSift pipeline configured for byte images" << endl;
-        exit( -1 );
+        stringstream ss;
+        ss << "Image mode error" << endl
+           << "E    Cannot load float images into a PopSift pipeline configured for byte images";
+        POP_FATAL(ss.str());
+    }
+
+    AllocTest a = testTextureFit( w, h );
+    if( a != AllocTest::Ok )
+    {
+        cerr << __FILE__ << ":" << __LINE__ << " Image too large" << endl
+             << testTextureFitErrorString( a,w,h );
+        return nullptr;
     }
 
     SiftJob* job = new SiftJob( w, h, imageData );
@@ -177,21 +292,28 @@ SiftJob* PopSift::enqueue( int          w,
 
 void PopSift::uploadImages( )
 {
+    cudaSetDevice(_device);
+
     SiftJob* job;
-    while( ( job = _pipe._queue_stage1.pull() ) != 0 ) {
+    while( ( job = _pipe._queue_stage1.pull() ) != nullptr ) {
         popsift::ImageBase* img = _pipe._unused.pull();
         job->setImg( img );
         _pipe._queue_stage2.push( job );
     }
-    _pipe._queue_stage2.push( 0 );
+    _pipe._queue_stage2.push( nullptr );
 }
 
 void PopSift::extractDownloadLoop( )
 {
+    cudaSetDevice(_device);
+    applyConfiguration(true);
+
     Pipe& p = _pipe;
 
     SiftJob* job;
-    while( ( job = p._queue_stage2.pull() ) != 0 ) {
+    while( ( job = p._queue_stage2.pull() ) != nullptr ) {
+        applyConfiguration();
+
         popsift::ImageBase* img = job->getImg();
 
         private_init( img->getWidth(), img->getHeight() );
@@ -207,11 +329,9 @@ void PopSift::extractDownloadLoop( )
 
         bool log_to_file = ( _config.getLogMode() == popsift::Config::All );
         if( log_to_file ) {
-            int octaves = p._pyramid->getNumOctaves();
-
+            // int octaves = p._pyramid->getNumOctaves();
             // for( int o=0; o<octaves; o++ ) { p._pyramid->download_descriptors( _config, o ); }
-
-            int levels  = p._pyramid->getNumLevels();
+            // int levels  = p._pyramid->getNumLevels();
 
             p._pyramid->download_and_save_array( "pyramid" );
             p._pyramid->save_descriptors( _config, features, "pyramid" );
@@ -219,68 +339,94 @@ void PopSift::extractDownloadLoop( )
 
         job->setFeatures( features );
     }
+
+    private_uninit();
 }
 
 void PopSift::matchPrepareLoop( )
 {
+    cudaSetDevice(_device);
+    applyConfiguration(true);
+
     Pipe& p = _pipe;
 
     SiftJob* job;
-    while( ( job = p._queue_stage2.pull() ) != 0 ) {
-        popsift::ImageBase* img = job->getImg();
+    while( ( job = p._queue_stage2.pull() ) != nullptr ) {
+        popsift::FeaturesDev* features;
+        try
+        {
+            applyConfiguration();
 
-        private_init( img->getWidth(), img->getHeight() );
+            popsift::ImageBase* img = job->getImg();
 
-        p._pyramid->step1( _config, img );
-        p._unused.push( img ); // uploaded input image no longer needed, release for reuse
+            private_init(img->getWidth(), img->getHeight());
 
-        p._pyramid->step2( _config );
+            p._pyramid->step1(_config, img);
+            p._unused.push(img); // uploaded input image no longer needed, release for reuse
 
-        popsift::FeaturesDev* features = p._pyramid->clone_device_descriptors( _config );
+            p._pyramid->step2(_config);
 
-        cudaDeviceSynchronize();
+            features = p._pyramid->clone_device_descriptors(_config);
+            cudaDeviceSynchronize();
+        }
+        catch(const std::exception& e)
+        {
+            job->setError(std::current_exception());
+            job->setFeatures(nullptr);
+            break;
+        }
 
         job->setFeatures( features );
     }
+
+    private_uninit();
 }
 
 SiftJob::SiftJob( int w, int h, const unsigned char* imageData )
     : _w(w)
     , _h(h)
-    , _img(0)
+    , _img(nullptr)
 {
     _f = _p.get_future();
 
     _imageData = (unsigned char*)malloc( w*h );
-    if( _imageData != 0 ) {
+    if( _imageData != nullptr )
+    {
         memcpy( _imageData, imageData, w*h );
-    } else {
-        cerr << __FILE__ << ":" << __LINE__ << " Memory limitation" << endl
-             << "E    Failed to allocate memory for SiftJob" << endl;
-        exit( -1 );
+    }
+    else
+    {
+        stringstream ss;
+        ss << "Memory limitation" << endl
+           << "E    Failed to allocate memory for SiftJob";
+        POP_FATAL(ss.str());
     }
 }
 
 SiftJob::SiftJob( int w, int h, const float* imageData )
     : _w(w)
     , _h(h)
-    , _img(0)
+    , _img(nullptr)
 {
     _f = _p.get_future();
 
     _imageData = (unsigned char*)malloc( w*h*sizeof(float) );
-    if( _imageData != 0 ) {
+    if( _imageData != nullptr )
+    {
         memcpy( _imageData, imageData, w*h*sizeof(float) );
-    } else {
-        cerr << __FILE__ << ":" << __LINE__ << " Memory limitation" << endl
-             << "E    Failed to allocate memory for SiftJob" << endl;
-        exit( -1 );
+    }
+    else
+    {
+        stringstream ss;
+        ss << "Memory limitation" << endl
+           << "E    Failed to allocate memory for SiftJob";
+        POP_FATAL(ss.str());
     }
 }
 
 SiftJob::~SiftJob( )
 {
-    delete [] _imageData;
+    free( _imageData );
 }
 
 void SiftJob::setImg( popsift::ImageBase* img )
@@ -292,7 +438,7 @@ void SiftJob::setImg( popsift::ImageBase* img )
 
 popsift::ImageBase* SiftJob::getImg()
 {
-#ifdef USE_NVTX
+#if POPSIFT_IS_DEFINED(POPSIFT_USE_NVTX)
     _nvtx_id = nvtxRangeStartA( "inserting image" );
 #endif
     return _img;
@@ -301,7 +447,7 @@ popsift::ImageBase* SiftJob::getImg()
 void SiftJob::setFeatures( popsift::FeaturesBase* f )
 {
     _p.set_value( f );
-#ifdef USE_NVTX
+#if POPSIFT_IS_DEFINED(POPSIFT_USE_NVTX)
     nvtxRangeEnd( _nvtx_id );
 #endif
 }
@@ -323,6 +469,35 @@ popsift::FeaturesHost* SiftJob::getHost()
 
 popsift::FeaturesDev* SiftJob::getDev()
 {
-    return dynamic_cast<popsift::FeaturesDev*>( _f.get() );
+    popsift::FeaturesBase* features = _f.get();
+    if(this->_err != nullptr) {
+        std::rethrow_exception(this->_err);
+    }
+    return dynamic_cast<popsift::FeaturesDev*>(features);
 }
 
+void SiftJob::setError(std::exception_ptr ptr)
+{
+    this->_err = ptr;
+}
+
+void PopSift::Pipe::uninit()
+{
+    _queue_stage1.push( nullptr );
+    if(_thread_stage2 != nullptr)
+    {
+        _thread_stage2->join();
+        _thread_stage2.reset(nullptr);
+    }
+    if(_thread_stage1 != nullptr)
+    {
+        _thread_stage1->join();
+        _thread_stage1.reset(nullptr);
+    }
+
+    while( !_unused.empty() )
+    {
+        popsift::ImageBase* img = _unused.pull();
+        delete img;
+    }
+}
diff --git a/src/popsift/popsift.h b/src/popsift/popsift.h
index 9f410dfb..5654cc76 100755
--- a/src/popsift/popsift.h
+++ b/src/popsift/popsift.h
@@ -7,20 +7,24 @@
  */
 #pragma once
 
-#include <cuda_runtime.h>
-#include <vector>
-#include <stack>
-#include <queue>
-#include <future>
-#include <boost/thread/thread.hpp>
-#include <boost/thread/sync_queue.hpp>
-
+#include "common/sync_queue.h"
+#include "common/device_prop.h"
 #include "sift_conf.h"
+#include "sift_config.h"
 #include "sift_extremum.h"
 
+#include <cuda_runtime.h>
+
+#include <exception>
+#include <future>
+#include <queue>
+#include <stack>
+#include <stdexcept>
+#include <thread>
+#include <vector>
 
-#ifdef USE_NVTX
-#include <nvToolsExtCuda.h>
+#if POPSIFT_IS_DEFINED(POPSIFT_USE_NVTX)
+#include <nvtx3/nvToolsExtCuda.h>
 #else
 #define nvtxRangeStartA(a)
 #define nvtxRangeEnd(a)
@@ -45,21 +49,44 @@ class SiftJob
     int                 _h;
     unsigned char*      _imageData;
     popsift::ImageBase* _img;
-#ifdef USE_NVTX
+    std::exception_ptr _err;
+#if POPSIFT_IS_DEFINED(POPSIFT_USE_NVTX)
     nvtxRangeId_t       _nvtx_id;
 #endif
 
 public:
-    /** Constructor for byte images, value range 0..255 */
+
+    /**
+     * @brief Constructor for byte images, value range 0..255
+     * @param[in] w the width in pixels of the image
+     * @param[in] h the height in pixels of the image
+     * @param[in] imageData the image buffer
+     */
     SiftJob( int w, int h, const unsigned char* imageData );
 
-    /** Constructor for float images, value range [0..1[ */
+    /**
+     * @brief Constructor for float images, value range [0..1[
+     * @param[in] w the width in pixels of the image
+     * @param[in] h the height in pixels of the image
+     * @param[in] imageData the image buffer
+     */
     SiftJob( int w, int h, const float* imageData );
 
+    /**
+     * @brief Destructor releases all the resources.
+     */
     ~SiftJob( );
 
-    popsift::FeaturesHost* get();    // should be deprecated, same as getHost()
+    /**
+     * @deprecated
+     * @see getHost()
+     */
+    popsift::FeaturesHost* get();
     popsift::FeaturesBase* getBase();
+    /**
+     * @brief
+     * @return
+     */
     popsift::FeaturesHost* getHost();
     popsift::FeaturesDev*  getDev();
 
@@ -68,81 +95,197 @@ class SiftJob
 
     /** fulfill the promise */
     void setFeatures( popsift::FeaturesBase* f );
+
+    void setError(std::exception_ptr ptr);
 };
 
+/**
+ * @brief
+ */
 class PopSift
 {
     struct Pipe
     {
-        boost::thread*                         _thread_stage1;
-        boost::thread*                         _thread_stage2;
-        boost::sync_queue<SiftJob*>            _queue_stage1;
-        boost::sync_queue<SiftJob*>            _queue_stage2;
-        boost::sync_queue<popsift::ImageBase*> _unused;
-        popsift::ImageBase*                    _current;
-
-        popsift::Pyramid*                      _pyramid;
+        std::unique_ptr<std::thread>            _thread_stage1;
+        std::unique_ptr<std::thread>            _thread_stage2;
+        popsift::SyncQueue<SiftJob*>            _queue_stage1;
+        popsift::SyncQueue<SiftJob*>            _queue_stage2;
+        popsift::SyncQueue<popsift::ImageBase*> _unused;
+
+        popsift::Pyramid*                      _pyramid{nullptr};
+
+        /**
+         * @brief Release the allocated resources, if any.
+         */
+        void uninit();
     };
 
 public:
+
+    /**
+    * @brief Image modes
+    */
     enum ImageMode
     {
+        /// byte images, value range 0..255
         ByteImages,
+        /// float images, value range [0..1[
         FloatImages
     };
 
+    /**
+     * @brief Results for the allocation test.
+     */
+    enum AllocTest
+    {
+        /// the image dimensions are supported by this device's CUDA texture engine.
+        Ok,
+        /// the input image size exceeds the dimensions of the CUDA Texture used for loading.
+        ImageExceedsLinearTextureLimit,
+        /// the scaled input image exceeds the dimensions of the CUDA Surface used for the image pyramid.
+        ImageExceedsLayeredSurfaceLimit
+    };
+
 public:
-    /* We support more than 1 streams, but we support only one sigma and one
+
+    PopSift() = delete;
+    PopSift(const PopSift&) = delete;
+
+    /**
+     * @brief We support more than one stream, but we support only one sigma and one
      * level parameters.
      */
-    PopSift( ImageMode imode = ByteImages );
-    PopSift( const popsift::Config&          config,
-             popsift::Config::ProcessingMode mode  = popsift::Config::ExtractingMode,
-             ImageMode                       imode = ByteImages );
+    explicit PopSift( ImageMode imode = ByteImages, int device = 0 );
+
+    /**
+     * @brief
+     * @param config
+     * @param mode
+     * @param imode
+     */
+    explicit PopSift(const popsift::Config& config,
+                     popsift::Config::ProcessingMode mode = popsift::Config::ExtractingMode,
+                     ImageMode imode = ByteImages, int device = 0);
+
+    /**
+     * @brief Release all the resources.
+     */
     ~PopSift();
 
 public:
-    /** Provide the configuration if you used the PopSift default
-     *  constructor */
+    /**
+     * @brief Provide the configuration if you used the PopSift default
+     *  constructor
+     */
     bool configure( const popsift::Config& config, bool force = false );
 
+    /**
+     * @brief Release the resources.
+     */
     void uninit( );
 
-    /** Enqueue a byte image,  value range 0..255 */
+    /**
+     *  @brief Check whether the current CUDA device can support the image
+     *  resolution (width,height) with the current configuration
+     *  based on the card's texture engine.
+     *  The function does not check if there is sufficient available
+     *  memory.
+     *
+     *  The first part of the test depends on the parameters width and
+     *  height. It checks whether the image size is supported by CUDA
+     *  2D linear textures on this card. This is used to load the image
+     *  into the first level of the first octave.
+     *  For the second part of the test, two values of the configuration
+     *  are important:
+     *  "downsampling", because it determines the required texture size
+     *  after loading. The CUDA 2D layered texture must support the
+     *  scaled width and height.
+     *  "levels", because it determines the number of levels in each
+     *  octave. The CUDA 2D layered texture must support enough depth
+     *  for each level.
+     *
+     * @param[in] width  The width of the input image
+     * @param[in] height The height of the input image
+     * @return AllocTest::Ok if the image dimensions are supported by this device's
+     *         CUDA texture engine,
+     *         AllocTest::ImageExceedsLinearTextureLimit if the input image size
+     *         exceeds the dimensions of the CUDA Texture used for loading.
+     *         The input image must be scaled.
+     *         AllocTest::ImageExceedsLayeredSurfaceLimit if the scaled input
+     *         image exceeds the dimensions of the CUDA Surface used for the
+     *         image pyramid. The scaling factor must be changed to fit in.
+     * @remark  * If you want to call configure() before extracting features,
+     *           you should call configure() before testTextureFit().
+     * @remark  * The current CUDA device is determined by a call to
+     *           cudaGetDevice(), card properties are only read once.
+     * @see AllocTest
+     */
+    AllocTest testTextureFit( int width, int height );
+
+    /**
+     * @brief Create a warning string for an AllocTest error code.
+     */
+    std::string testTextureFitErrorString( AllocTest err, int w, int h );
+
+    /**
+     * @brief Enqueue a byte image,  value range [0,255].
+     * @param[in] w the width of the image.
+     * @param[in] h the height of the image.
+     * @param[in] imageData the image buffer.
+     * @return the associated job
+     * @see SiftJob
+     */
     SiftJob*  enqueue( int                  w,
                        int                  h,
                        const unsigned char* imageData );
 
-    /** Enqueue a float image,  value range 0..1 */
+    /**
+     * @brief Enqueue a float image,  value range [0,1].
+     * @param[in] w the width of the image.
+     * @param[in] h the height of the image.
+     * @param[in] imageData the image buffer.
+     * @return the associated job
+     * @see SiftJob
+     */
     SiftJob*  enqueue( int          w,
                        int          h,
                        const float* imageData );
 
-    /** deprecated */
+    /**
+     * @deprecated
+     */
     inline void uninit( int /*pipe*/ ) { uninit(); }
 
-    /** deprecated */
+    /**
+     * @deprecated
+     */
     inline bool init( int /*pipe*/, int w, int h ) {
         _last_init_w = w;
         _last_init_h = h;
         return true;
     }
 
-    /** deprecated */
+    /**
+     * @deprecated
+     */
     inline popsift::FeaturesBase* execute( int /*pipe*/, const unsigned char* imageData )
     {
         SiftJob* j = enqueue( _last_init_w, _last_init_h, imageData );
-        if( !j ) return 0;
+        if( !j ) return nullptr;
         popsift::FeaturesBase* f = j->getBase();
         delete j;
         return f;
     }
 
 private:
+    bool applyConfiguration( bool force = false );
+
     bool private_init( int w, int h );
+    bool private_uninit( );
+    void private_apply_scale_factor( int& w, int& h );
     void uploadImages( );
 
-    /* The following method are alternative worker functions for Jobs submitted by
+    /* The following methods are alternative worker functions for Jobs submitted by
      * a calling application. The choice of method is made by the mode parameter
      * in the PopSift constructor. */
 
@@ -161,8 +304,15 @@ class PopSift
      */
     popsift::Config _shadow_config;
 
-    int             _last_init_w; /* to support depreacted interface */
-    int             _last_init_h; /* to support depreacted interface */
+    int             _last_init_w{}; /* to support deprecated interface */
+    int             _last_init_h{}; /* to support deprecated interface */
     ImageMode       _image_mode;
+    int             _device;
+
+    /// whether the object is initialized
+    bool            _isInit{true};
+
+    // Device property collection runs when this object is created
+    popsift::cuda::device_prop_t   _device_properties;
 };
 
diff --git a/src/popsift/s_desc_grid.cu b/src/popsift/s_desc_grid.cu
index 2d911609..099c4709 100644
--- a/src/popsift/s_desc_grid.cu
+++ b/src/popsift/s_desc_grid.cu
@@ -5,14 +5,13 @@
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
  */
-#include <stdio.h>
-#include <iso646.h>
-
-#include "sift_constants.h"
-#include "s_gradiant.h"
-#include "s_desc_grid.h"
 #include "common/assist.h"
 #include "common/vec_macros.h"
+#include "s_desc_grid.h"
+#include "s_gradiant.h"
+#include "sift_constants.h"
+
+#include <cstdio>
 
 using namespace popsift;
 
@@ -108,12 +107,12 @@ void ext_desc_grid_sub( const int           ix,
 
     /* reduction here */
     for (int i = 0; i < 8; i++) {
-        // dpt[i] += __shfl_down( dpt[i], 16 );
-        dpt[i] += __shfl_down( dpt[i], 8, 16 );
-        dpt[i] += __shfl_down( dpt[i], 4, 16 );
-        dpt[i] += __shfl_down( dpt[i], 2, 16 );
-        dpt[i] += __shfl_down( dpt[i], 1, 16 );
-        dpt[i]  = __shfl     ( dpt[i], 0, 16 );
+        // dpt[i] += popsift::shuffle_down( dpt[i], 16 );
+        dpt[i] += popsift::shuffle_down( dpt[i], 8, 16 );
+        dpt[i] += popsift::shuffle_down( dpt[i], 4, 16 );
+        dpt[i] += popsift::shuffle_down( dpt[i], 2, 16 );
+        dpt[i] += popsift::shuffle_down( dpt[i], 1, 16 );
+        dpt[i]  = popsift::shuffle     ( dpt[i], 0, 16 );
     }
 
 
@@ -122,9 +121,7 @@ void ext_desc_grid_sub( const int           ix,
     }
 }
 
-__global__
-void ext_desc_grid( const int           octave,
-                    cudaTextureObject_t layer_tex )
+__global__ void ext_desc_grid(int octave, cudaTextureObject_t layer_tex)
 {
     const int   o_offset =  dct.ori_ps[octave] + blockIdx.x;
     const int   ix       = threadIdx.y;
diff --git a/src/popsift/s_desc_grid.h b/src/popsift/s_desc_grid.h
index b674e1b5..c0919806 100644
--- a/src/popsift/s_desc_grid.h
+++ b/src/popsift/s_desc_grid.h
@@ -6,20 +6,18 @@
  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
  */
 #pragma once
-#include "sift_pyramid.h"
-#include "sift_octave.h"
-#include "sift_extremum.h"
-#include "common/plane_2d.h"
 #include "common/debug_macros.h"
+#include "common/plane_2d.h"
+#include "sift_extremum.h"
+#include "sift_octave.h"
+#include "sift_pyramid.h"
 
 /*
  * We assume that this is started with
  * block = 16,4,4
  * grid  = nunmber of orientations
  */
-__global__
-void ext_desc_grid( const int           octave,
-                    cudaTextureObject_t layer_tex );
+__global__ void ext_desc_grid(int octave, cudaTextureObject_t layer_tex);
 
 namespace popsift
 {
diff --git a/src/popsift/s_desc_igrid.cu b/src/popsift/s_desc_igrid.cu
index d1ed60bd..9f77f12f 100644
--- a/src/popsift/s_desc_igrid.cu
+++ b/src/popsift/s_desc_igrid.cu
@@ -5,14 +5,13 @@
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
  */
-#include <stdio.h>
-#include <iso646.h>
-
-#include "sift_constants.h"
-#include "s_gradiant.h"
-#include "s_desc_igrid.h"
 #include "common/assist.h"
 #include "common/vec_macros.h"
+#include "s_desc_igrid.h"
+#include "s_gradiant.h"
+#include "sift_constants.h"
+
+#include <cstdio>
 
 using namespace popsift;
 
@@ -64,10 +63,10 @@ void ext_desc_igrid_sub( const float x, const float y, const int level,
 
     /* reduction here */
     for (int i = 0; i < 8; i++) {
-        dpt[i] += __shfl_xor( dpt[i], 1, 16 );
-        dpt[i] += __shfl_xor( dpt[i], 2, 16 );
-        dpt[i] += __shfl_xor( dpt[i], 4, 16 );
-        dpt[i] += __shfl_xor( dpt[i], 8, 16 );
+        dpt[i] += popsift::shuffle_xor( dpt[i], 1, 16 );
+        dpt[i] += popsift::shuffle_xor( dpt[i], 2, 16 );
+        dpt[i] += popsift::shuffle_xor( dpt[i], 4, 16 );
+        dpt[i] += popsift::shuffle_xor( dpt[i], 8, 16 );
     }
 
     if( threadIdx.x < 8 ) {
@@ -75,9 +74,7 @@ void ext_desc_igrid_sub( const float x, const float y, const int level,
     }
 }
 
-__global__
-void ext_desc_igrid( const int           octave,
-                     cudaTextureObject_t texLinear )
+__global__ void ext_desc_igrid(int octave, cudaTextureObject_t texLinear)
 {
     const int   num      = dct.ori_ct[octave];
 
diff --git a/src/popsift/s_desc_igrid.h b/src/popsift/s_desc_igrid.h
index 589c5f4e..8980a4bc 100644
--- a/src/popsift/s_desc_igrid.h
+++ b/src/popsift/s_desc_igrid.h
@@ -6,19 +6,17 @@
  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
  */
 #pragma once
-#include "sift_pyramid.h"
-#include "sift_octave.h"
-#include "sift_extremum.h"
 #include "common/debug_macros.h"
+#include "sift_extremum.h"
+#include "sift_octave.h"
+#include "sift_pyramid.h"
 
 /*
  * We assume that this is started with
  * block = 16,4,4 or with 32,4,4, depending on macros
  * grid  = nunmber of orientations
  */
-__global__
-void ext_desc_igrid( const int           octave,
-                     cudaTextureObject_t texLinear );
+__global__ void ext_desc_igrid(int octave, cudaTextureObject_t texLinear);
 
 namespace popsift
 {
diff --git a/src/popsift/s_desc_iloop.cu b/src/popsift/s_desc_iloop.cu
index 5fb9436c..84673a20 100644
--- a/src/popsift/s_desc_iloop.cu
+++ b/src/popsift/s_desc_iloop.cu
@@ -5,14 +5,13 @@
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
  */
-#include <stdio.h>
-#include <iso646.h>
-
-#include "sift_constants.h"
-#include "s_gradiant.h"
-#include "s_desc_iloop.h"
 #include "common/assist.h"
 #include "common/vec_macros.h"
+#include "s_desc_iloop.h"
+#include "s_gradiant.h"
+#include "sift_constants.h"
+
+#include <cstdio>
 
 using namespace popsift;
 
@@ -116,12 +115,12 @@ void ext_desc_iloop_sub( const float         ang,
 
     /* reduction here */
     for (int i = 0; i < 8; i++) {
-        dpt[i] += __shfl_down( dpt[i], 16 );
-        dpt[i] += __shfl_down( dpt[i], 8 );
-        dpt[i] += __shfl_down( dpt[i], 4 );
-        dpt[i] += __shfl_down( dpt[i], 2 );
-        dpt[i] += __shfl_down( dpt[i], 1 );
-        dpt[i]  = __shfl     ( dpt[i], 0 );
+        dpt[i] += popsift::shuffle_down( dpt[i], 16 );
+        dpt[i] += popsift::shuffle_down( dpt[i], 8 );
+        dpt[i] += popsift::shuffle_down( dpt[i], 4 );
+        dpt[i] += popsift::shuffle_down( dpt[i], 2 );
+        dpt[i] += popsift::shuffle_down( dpt[i], 1 );
+        dpt[i]  = popsift::shuffle     ( dpt[i], 0 );
     }
 
     if( threadIdx.x < 8 ) {
@@ -129,11 +128,7 @@ void ext_desc_iloop_sub( const float         ang,
     }
 }
 
-__global__
-void ext_desc_iloop( const int           octave,
-                     cudaTextureObject_t layer_tex,
-                     const int           w,
-                     const int           h )
+__global__ void ext_desc_iloop(int octave, cudaTextureObject_t layer_tex, int w, int h)
 {
     const int   o_offset =  dct.ori_ps[octave] + blockIdx.x;
     Descriptor* desc     = &dbuf.desc           [o_offset];
diff --git a/src/popsift/s_desc_iloop.h b/src/popsift/s_desc_iloop.h
index 643c85da..e69409b6 100644
--- a/src/popsift/s_desc_iloop.h
+++ b/src/popsift/s_desc_iloop.h
@@ -6,17 +6,13 @@
  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
  */
 #pragma once
-#include "sift_pyramid.h"
-#include "sift_octave.h"
-#include "sift_extremum.h"
-#include "common/plane_2d.h"
 #include "common/debug_macros.h"
+#include "common/plane_2d.h"
+#include "sift_extremum.h"
+#include "sift_octave.h"
+#include "sift_pyramid.h"
 
-__global__
-void ext_desc_iloop( const int           octave,
-                     cudaTextureObject_t layer_tex,
-                     const int           width,
-                     const int           height );
+__global__ void ext_desc_iloop(int octave, cudaTextureObject_t layer_tex, int width, int height);
 
 namespace popsift
 {
diff --git a/src/popsift/s_desc_loop.cu b/src/popsift/s_desc_loop.cu
index 55196b68..4c5f46c2 100644
--- a/src/popsift/s_desc_loop.cu
+++ b/src/popsift/s_desc_loop.cu
@@ -5,14 +5,13 @@
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
  */
-#include <stdio.h>
-#include <iso646.h>
-
-#include "sift_constants.h"
-#include "s_gradiant.h"
-#include "s_desc_loop.h"
 #include "common/assist.h"
 #include "common/vec_macros.h"
+#include "s_desc_loop.h"
+#include "s_gradiant.h"
+#include "sift_constants.h"
+
+#include <cstdio>
 
 using namespace popsift;
 
@@ -76,8 +75,10 @@ void ext_desc_loop_sub( const float         ang,
 
     float dpt[9] = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
 
-    for( int i = threadIdx.x; i < loops; i+=blockDim.x )
+    for( int i = threadIdx.x; popsift::any(i < loops); i+=blockDim.x )
     {
+        if( i >= loops ) continue;
+
         const int ii = i / wx + ymin;
         const int jj = i % wx + xmin;     
 
@@ -111,25 +112,25 @@ void ext_desc_loop_sub( const float         ang,
             const float wgt2 = do0;
 
             int fo  = fo0 % DESC_BINS;
-
+    
                 // maf: multiply-add
                 // _ru - round to positive infinity equiv to froundf since always >=0
             dpt[fo]   = __fmaf_ru( wgt1, wgt, dpt[fo] );   // dpt[fo]   += (wgt1*wgt);
             dpt[fo+1] = __fmaf_ru( wgt2, wgt, dpt[fo+1] ); // dpt[fo+1] += (wgt2*wgt);
         }
-        __syncthreads();
     }
+    __syncthreads();
 
     dpt[0] += dpt[8];
 
     /* reduction here */
     for (int i = 0; i < 8; i++) {
-        dpt[i] += __shfl_down( dpt[i], 16 );
-        dpt[i] += __shfl_down( dpt[i], 8 );
-        dpt[i] += __shfl_down( dpt[i], 4 );
-        dpt[i] += __shfl_down( dpt[i], 2 );
-        dpt[i] += __shfl_down( dpt[i], 1 );
-        dpt[i]  = __shfl     ( dpt[i], 0 );
+        dpt[i] += popsift::shuffle_down( dpt[i], 16 );
+        dpt[i] += popsift::shuffle_down( dpt[i], 8 );
+        dpt[i] += popsift::shuffle_down( dpt[i], 4 );
+        dpt[i] += popsift::shuffle_down( dpt[i], 2 );
+        dpt[i] += popsift::shuffle_down( dpt[i], 1 );
+        dpt[i]  = popsift::shuffle     ( dpt[i], 0 );
     }
 
     if( threadIdx.x < 8 ) {
@@ -137,11 +138,7 @@ void ext_desc_loop_sub( const float         ang,
     }
 }
 
-__global__
-void ext_desc_loop( const int           octave,
-                    cudaTextureObject_t layer_tex,
-                    const int           w,
-                    const int           h )
+__global__ void ext_desc_loop(int octave, cudaTextureObject_t layer_tex, int w, int h)
 {
     const int   o_offset =  dct.ori_ps[octave] + blockIdx.x;
     Descriptor* desc     = &dbuf.desc           [o_offset];
diff --git a/src/popsift/s_desc_loop.h b/src/popsift/s_desc_loop.h
index 8a4e756b..600db498 100644
--- a/src/popsift/s_desc_loop.h
+++ b/src/popsift/s_desc_loop.h
@@ -6,19 +6,15 @@
  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
  */
 #pragma once
-#include "sift_pyramid.h"
-#include "sift_octave.h"
-#include "sift_extremum.h"
-#include "common/plane_2d.h"
 #include "common/debug_macros.h"
+#include "common/plane_2d.h"
+#include "sift_extremum.h"
+#include "sift_octave.h"
+#include "sift_pyramid.h"
 
 #undef BLOCK_3_DIMS
 
-__global__
-void ext_desc_loop( const int           octave,
-                    cudaTextureObject_t layer_tex,
-                    const int           width,
-                    const int           height );
+__global__ void ext_desc_loop(int octave, cudaTextureObject_t layer_tex, int width, int height);
 
 namespace popsift
 {
diff --git a/src/popsift/s_desc_norm_l2.h b/src/popsift/s_desc_norm_l2.h
index 79f53f7a..b067d71f 100644
--- a/src/popsift/s_desc_norm_l2.h
+++ b/src/popsift/s_desc_norm_l2.h
@@ -6,7 +6,9 @@
  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
  */
 #pragma once
+#include "common/assist.h"
 #include "s_desc_normalize.h"
+#include "sift_config.h"
 
 using namespace popsift;
 using namespace std;
@@ -48,85 +50,61 @@ void NormalizeL2::normalize( const float* src_desc, float* dst_desc, const bool
     float4 descr;
     descr = ptr4[threadIdx.x];
 
-#if __CUDACC_VER__ >= 70500
-    // normf() is an elegant function: sqrt(sum_0^127{v^2})
-    // It exists from CUDA 7.5 but the trouble with CUB on the GTX 980 Ti forces
-    // us to with CUDA 7.0 right now
-
     float norm;
 
-    if( threadIdx.x == 0 ) {
-        norm = normf( 128, src_desc );
-    }
-    __syncthreads();
-    norm = __shfl( norm, 0 );
-
-    descr.x = min( descr.x, 0.2f*norm );
-    descr.y = min( descr.y, 0.2f*norm );
-    descr.z = min( descr.z, 0.2f*norm );
-    descr.w = min( descr.w, 0.2f*norm );
-
+    // 32 threads compute 4 squares each, then shuffle to perform an addition by
+    // reduction for the sum of 128 squares; the result ends up in thread 0
     norm = descr.x * descr.x
          + descr.y * descr.y
          + descr.z * descr.z
          + descr.w * descr.w;
-    norm += __shfl_down( norm, 16 );
-    norm += __shfl_down( norm,  8 );
-    norm += __shfl_down( norm,  4 );
-    norm += __shfl_down( norm,  2 );
-    norm += __shfl_down( norm,  1 );
-    if( threadIdx.x == 0 ) {
-        // norm = __fsqrt_rn( norm );
-        // norm = __fdividef( 512.0f, norm );
-        norm = __frsqrt_rn( norm ); // inverse square root
-        norm = scalbnf( norm, d_consts.norm_multi );
-    }
-#else
-    float norm;
+    norm += popsift::shuffle_down( norm, 16 );
+    norm += popsift::shuffle_down( norm,  8 );
+    norm += popsift::shuffle_down( norm,  4 );
+    norm += popsift::shuffle_down( norm,  2 );
+    norm += popsift::shuffle_down( norm,  1 );
 
-    norm = descr.x * descr.x
-         + descr.y * descr.y
-         + descr.z * descr.z
-         + descr.w * descr.w;
-    norm += __shfl_down( norm, 16 );
-    norm += __shfl_down( norm,  8 );
-    norm += __shfl_down( norm,  4 );
-    norm += __shfl_down( norm,  2 );
-    norm += __shfl_down( norm,  1 );
     if( threadIdx.x == 0 ) {
-        norm = __fsqrt_rn( norm );
+        // compute 1 / sqrt(sum) in round-to-nearest even mode in thread 0
+        norm = __frsqrt_rn( norm );
     }
-    norm = __shfl( norm,  0 );
 
-    descr.x = min( descr.x, 0.2f*norm );
-    descr.y = min( descr.y, 0.2f*norm );
-    descr.z = min( descr.z, 0.2f*norm );
-    descr.w = min( descr.w, 0.2f*norm );
+    // spread the inverted norm from thread 0 to all threads in the warp
+    norm = popsift::shuffle( norm,  0 );
+
+    // quasi-normalize all 128 floats
+    descr.x = min( descr.x*norm, 0.2f );
+    descr.y = min( descr.y*norm, 0.2f );
+    descr.z = min( descr.z*norm, 0.2f );
+    descr.w = min( descr.w*norm, 0.2f );
 
+    // Repeat the procedure, but also add a multiplier. E.g., if the user wants the
+    // descriptors as bytes rather than floats, multiply by 256 - or even by 512
+    // for better accuracy, which is OK because a point cannot be a keypoint if more
+    // than half of its gradient is in a single direction.
     norm = descr.x * descr.x
          + descr.y * descr.y
          + descr.z * descr.z
          + descr.w * descr.w;
-    norm += __shfl_down( norm, 16 );
-    norm += __shfl_down( norm,  8 );
-    norm += __shfl_down( norm,  4 );
-    norm += __shfl_down( norm,  2 );
-    norm += __shfl_down( norm,  1 );
+    norm += popsift::shuffle_down( norm, 16 );
+    norm += popsift::shuffle_down( norm,  8 );
+    norm += popsift::shuffle_down( norm,  4 );
+    norm += popsift::shuffle_down( norm,  2 );
+    norm += popsift::shuffle_down( norm,  1 );
+
     if( threadIdx.x == 0 ) {
-        // norm = __fsqrt_rn( norm );
-        // norm = __fdividef( 512.0f, norm );
         norm = __frsqrt_rn( norm ); // inverse square root
         norm = scalbnf( norm, d_consts.norm_multi );
     }
-#endif
-    norm = __shfl( norm,  0 );
+
+    norm = popsift::shuffle( norm,  0 );
 
     descr.x = descr.x * norm;
     descr.y = descr.y * norm;
     descr.z = descr.z * norm;
     descr.w = descr.w * norm;
 
-    if( not ignoreme ) {
+    if( ! ignoreme ) {
         float4* out4 = (float4*)dst_desc;
         out4[threadIdx.x] = descr;
     }
diff --git a/src/popsift/s_desc_norm_rs.h b/src/popsift/s_desc_norm_rs.h
index ef5d8fa4..3ab5b1fc 100644
--- a/src/popsift/s_desc_norm_rs.h
+++ b/src/popsift/s_desc_norm_rs.h
@@ -6,6 +6,7 @@
  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
  */
 #pragma once
+#include "common/assist.h"
 #include "s_desc_normalize.h"
 
 using namespace popsift;
@@ -15,20 +16,17 @@ class NormalizeRootSift
 {
 public:
     __device__ static inline
-    void normalize( float* features, const bool ignoreme );
+    void normalize( float* features, bool ignoreme );
 
     __device__ static inline
     void normalize_restrict( const float* __restrict__ src_desc,
                              float* __restrict__       dest_desc );
 
-    __device__ static inline
-    void normalize( const float* src_desc,
-                    float*       dest_desc,
-                    const  bool  ignoreme );
+    __device__ static inline void normalize(const float* src_desc, float* dest_desc, bool ignoreme);
 };
 
 __device__ inline
-void NormalizeRootSift::normalize( float* features, const bool ignoreme )
+void NormalizeRootSift::normalize( float* features, bool ignoreme )
 {
     normalize( features, features, ignoreme );
 }
@@ -41,7 +39,7 @@ void NormalizeRootSift::normalize_restrict( const float* __restrict__ src_desc,
 }
 
 __device__ inline
-void NormalizeRootSift::normalize( const float* src_desc, float* dst_desc, const bool ignoreme )
+void NormalizeRootSift::normalize( const float* src_desc, float* dst_desc, bool ignoreme )
 {
     const float4* ptr4 = (const float4*)src_desc;
 
@@ -50,13 +48,13 @@ void NormalizeRootSift::normalize( const float* src_desc, float* dst_desc, const
 
     float sum = descr.x + descr.y + descr.z + descr.w;
 
-    sum += __shfl_down( sum, 16 );
-    sum += __shfl_down( sum,  8 );
-    sum += __shfl_down( sum,  4 );
-    sum += __shfl_down( sum,  2 );
-    sum += __shfl_down( sum,  1 );
+    sum += popsift::shuffle_down( sum, 16 );
+    sum += popsift::shuffle_down( sum,  8 );
+    sum += popsift::shuffle_down( sum,  4 );
+    sum += popsift::shuffle_down( sum,  2 );
+    sum += popsift::shuffle_down( sum,  1 );
 
-    sum = __shfl( sum,  0 );
+    sum = popsift::shuffle( sum,  0 );
 
     float val;
     val = scalbnf( __fsqrt_rn( __fdividef( descr.x, sum ) ),
@@ -72,7 +70,7 @@ void NormalizeRootSift::normalize( const float* src_desc, float* dst_desc, const
                    d_consts.norm_multi );
     descr.w = val;
 
-    if( not ignoreme ) {
+    if( ! ignoreme ) {
         float4* out4 = (float4*)dst_desc;
         out4[threadIdx.x] = descr;
     }
diff --git a/src/popsift/s_desc_normalize.h b/src/popsift/s_desc_normalize.h
index 6f2dea2a..a87d0710 100644
--- a/src/popsift/s_desc_normalize.h
+++ b/src/popsift/s_desc_normalize.h
@@ -7,9 +7,9 @@
  */
 #pragma once
 
-#include "sift_extremum.h"
 #include "s_desc_norm_l2.h"
 #include "s_desc_norm_rs.h"
+#include "sift_extremum.h"
 
 template<class T>
 __global__
diff --git a/src/popsift/s_desc_notile.cu b/src/popsift/s_desc_notile.cu
index 689a033c..9ba8a927 100644
--- a/src/popsift/s_desc_notile.cu
+++ b/src/popsift/s_desc_notile.cu
@@ -5,15 +5,15 @@
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
  */
-#include <stdio.h>
-#include <iso646.h>
-#include <iostream>
-
-#include "sift_constants.h"
-#include "s_gradiant.h"
-#include "s_desc_notile.h"
 #include "common/assist.h"
 #include "common/vec_macros.h"
+#include "s_desc_notile.h"
+#include "s_gradiant.h"
+#include "sift_constants.h"
+
+#include <cstdio>
+#include <iostream>
+#include <stdexcept>
 
 //   1    -> 19.6 on 980 Ti
 //   2    -> 19.5 on 980 Ti
@@ -25,6 +25,7 @@
 
 using namespace popsift;
 
+__device__
 static const float stepbase =  - 2.5f + 1.0f / 16.0f;
 
 __device__ static inline
@@ -77,12 +78,12 @@ void ext_desc_notile_sub( const float x, const float y, const int level,
         }
     }
 
-    for( int i=0; i<8; i++ )
+    for( int i=0; i<8; ++i)
     {
-        dpt[i] += __shfl_down( dpt[i], 4, 8 ); // add n+4
-        dpt[i] += __shfl_down( dpt[i], 2, 8 ); // add n+2
-        dpt[i] += __shfl_down( dpt[i], 1, 8 ); // add n+1
-        dpt[i]  = __shfl     ( dpt[i], 0, 8 ); // move 0 to all
+        dpt[i] += popsift::shuffle_down( dpt[i], 4, 8 ); // add n+4
+        dpt[i] += popsift::shuffle_down( dpt[i], 2, 8 ); // add n+2
+        dpt[i] += popsift::shuffle_down( dpt[i], 1, 8 ); // add n+1
+        dpt[i]  = popsift::shuffle     ( dpt[i], 0, 8 ); // move 0 to all
     }
 
     __syncthreads();
@@ -130,7 +131,7 @@ void ext_desc_notile( const int           octave,
 namespace popsift
 {
 
-bool start_ext_desc_notile( const int octave, Octave& oct_obj )
+bool start_ext_desc_notile( int octave, Octave& oct_obj )
 {
     dim3 block;
     dim3 grid;
@@ -151,11 +152,7 @@ bool start_ext_desc_notile( const int octave, Octave& oct_obj )
           oct_obj.getDataTexLinear( ).tex );
     cudaDeviceSynchronize();
     cudaError_t err = cudaGetLastError( );
-    if( err != cudaSuccess ) {
-        std::cerr << __FILE__ << ":" << __LINE__ << std::endl
-                  << "    cudaGetLastError failed: " << cudaGetErrorString(err) << std::endl;
-        exit( -__LINE__ );
-    }
+    POP_CUDA_FATAL_TEST(err, "cudaGetLastError failed: ");
 
     POP_SYNC_CHK;
 
diff --git a/src/popsift/s_desc_notile.h b/src/popsift/s_desc_notile.h
index 929f06c3..93a91cc3 100644
--- a/src/popsift/s_desc_notile.h
+++ b/src/popsift/s_desc_notile.h
@@ -6,15 +6,14 @@
  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
  */
 #pragma once
-#include "sift_pyramid.h"
-#include "sift_octave.h"
-#include "sift_extremum.h"
 #include "common/debug_macros.h"
-
+#include "sift_extremum.h"
+#include "sift_octave.h"
+#include "sift_pyramid.h"
 
 namespace popsift
 {
 
-bool start_ext_desc_notile( const int octave, Octave& oct_obj );
+bool start_ext_desc_notile( int octave, Octave& oct_obj );
 
 }; // namespace popsift
diff --git a/src/popsift/s_extrema.cu b/src/popsift/s_extrema.cu
index 7d9e0d0c..5c1acc44 100644
--- a/src/popsift/s_extrema.cu
+++ b/src/popsift/s_extrema.cu
@@ -5,25 +5,25 @@
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
  */
+#include "common/assist.h"
+#include "common/clamp.h"
+#include "common/debug_macros.h"
+#include "s_solve.h"
+#include "sift_constants.h"
+#include "sift_pyramid.h"
+
 #include <cuda_runtime.h>
 #include <texture_fetch_functions.h>
-#include <stdio.h>
-#include <iso646.h>
 
-#include "sift_pyramid.h"
-#include "sift_constants.h"
-#include "s_solve.h"
-#include "common/debug_macros.h"
-#include "common/assist.h"
-#include "common/clamp.h"
+#include <cstdio>
 
 namespace popsift{
 
 template<int HEIGHT>
 __device__ static inline
-uint32_t extrema_count( int indicator, int* extrema_counter )
+uint32_t extrema_count( unsigned int indicator, int* extrema_counter )
 {
-    uint32_t mask = __ballot( indicator ); // bitfield of warps with results
+    uint32_t mask = popsift::ballot( indicator ); // bitfield of warps with results
 
     int ct = __popc( mask );          // horizontal reduce
 
@@ -34,7 +34,7 @@ uint32_t extrema_count( int indicator, int* extrema_counter )
         write_index = atomicAdd( extrema_counter, ct );
     }
     // broadcast from thread 0 to all threads in warp
-    write_index = __shfl( write_index, 0 );
+    write_index = popsift::shuffle( write_index, 0 );
 
     // this thread's offset: count only bits below the bit of the own
     // thread index; this provides the 0 result and every result up to ct
@@ -124,7 +124,7 @@ class ModeFunctions
 {
 public:
     inline __device__
-    bool first_contrast_ok( const float val ) const;
+    bool first_contrast_ok( float val ) const;
 
     /* refine
      * returns -1 : break loop and fail
@@ -132,14 +132,14 @@ public:
      *          1 : break loop and succeed
      */
     inline __device__
-    int refine( float3& d, int3& n, const int width, const int height, const int maxlevel, bool last_it );
+    int refine( float3& d, int3& n, int width, int height, int maxlevel, bool last_it );
 
     /*
      * returns true  : values after refine make sense
      *         false : they do not
      */
     inline __device__
-    bool verify( const float xn, const float yn, const float sn, const int width, const int height, const int maxlevel ) const;
+    bool verify( float xn, float yn, float sn, int width, int height, int maxlevel ) const;
 };
 
 template<>
@@ -147,13 +147,13 @@ class ModeFunctions<Config::OpenCV>
 {
 public:
     inline __device__
-    bool first_contrast_ok( const float val ) const
+    bool first_contrast_ok( float val ) const
     {
         return ( fabsf( val ) >= floorf( d_consts.threshold ) );
     }
 
     inline __device__
-    int refine( float3& d, int3& n, const int width, const int height, const int maxlevel, bool last_it ) const
+    int refine( float3& d, int3& n, int width, int height, int maxlevel, bool last_it ) const
     {
         // OpenCV mode is a special case because d remains unmodified.
         // Either we return 1, and n has not been modified.
@@ -187,7 +187,7 @@ public:
     }
 
     inline __device__
-    int verify( const float xn, const float yn, const float sn, const int width, const int height, const int maxlevel ) const
+    bool verify( float xn, float yn, float sn, int width, int height, int maxlevel ) const
     {
         return true;
     }
@@ -204,7 +204,7 @@ public:
     }
 
     inline __device__
-    int refine( float3& d, int3& n, const int width, const int height, const int maxlevel, bool last_it ) const
+    int refine( float3& d, int3& n, int width, int height, int maxlevel, bool last_it ) const
     {
         if( last_it ) return 0;
 
@@ -232,7 +232,7 @@ public:
     }
 
     inline __device__
-    int verify( const float xn, const float yn, const float sn, const int width, const int height, const int maxlevel ) const
+    bool verify( float xn, float yn, float sn, int width, int height, int maxlevel ) const
     {
         // reject if outside of image bounds or far outside DoG bounds
         return ( ( xn < 0.0f ||
@@ -256,7 +256,7 @@ public:
     }
 
     inline __device__
-    int refine( float3& d, int3& n, const int width, const int height, const int maxlevel, bool last_it ) const
+    int refine( float3& d, int3& n, int width, int height, int maxlevel, bool last_it ) const
     {
         if( last_it ) return 0;
 
@@ -284,7 +284,7 @@ public:
     }
 
     inline __device__
-    int verify( const float xn, const float yn, const float sn, const int width, const int height, const int maxlevel ) const
+    bool verify( float xn, float yn, float sn, int width, int height, int maxlevel ) const
     {
         // reject if outside of image bounds or far outside DoG bounds
         return ( ( xn < 0.0f ||
@@ -298,16 +298,15 @@ public:
 };
 
 template<int sift_mode>
-__device__ inline
-bool find_extrema_in_dog_sub( cudaTextureObject_t dog,
-                              int                 debug_octave,
-                              int                 width,
-                              int                 height,
-                              const uint32_t      maxlevel,
-                              const float         w_grid_divider,
-                              const float         h_grid_divider,
-                              const int           grid_width,
-                              InitialExtremum&    ec )
+__device__ inline bool find_extrema_in_dog_sub(cudaTextureObject_t dog,
+                                               int debug_octave,
+                                               int width,
+                                               int height,
+                                               uint32_t maxlevel,
+                                               float w_grid_divider,
+                                               float h_grid_divider,
+                                               int grid_width,
+                                               InitialExtremum& ec)
 {
     ec.xpos    = 0.0f;
     ec.ypos    = 0.0f;
@@ -342,9 +341,9 @@ bool find_extrema_in_dog_sub( cudaTextureObject_t dog,
     const float val = readTex( dog, x, y, level );
 
     ModeFunctions<sift_mode> f;
-    if( not f.first_contrast_ok( val ) ) return false;
+    if( ! f.first_contrast_ok( val ) ) return false;
 
-    if( not is_extremum( dog, x-1, y-1, level-1 ) ) {
+    if( ! is_extremum( dog, x-1, y-1, level-1 ) ) {
         // if( debug_octave==0 && level==2 && x==14 && y==73 ) printf("But I fail\n");
         return false;
     }
@@ -423,7 +422,7 @@ bool find_extrema_in_dog_sub( cudaTextureObject_t dog,
         b.y = -D.y;
         b.z = -D.z;
 
-        if( solve( A, b ) == false ) {
+        if(!solve(A, b)) {
             d.x = 0;
             d.y = 0;
             d.z = 0;
@@ -463,7 +462,7 @@ bool find_extrema_in_dog_sub( cudaTextureObject_t dog,
     const float yn      = n.y + d.y;
     const float sn      = n.z + d.z;
 
-    if( not f.verify( xn, yn, sn, width, height, maxlevel ) ) {
+    if( ! f.verify( xn, yn, sn, width, height, maxlevel ) ) {
         return false;
     }
 
@@ -506,9 +505,6 @@ bool find_extrema_in_dog_sub( cudaTextureObject_t dog,
 
 template<int HEIGHT, int sift_mode>
 __global__
-#ifdef NDEBUG
-__launch_bounds__(128,16)
-#endif
 void find_extrema_in_dog( cudaTextureObject_t dog,
                           int                 octave,
                           int                 width,
diff --git a/src/popsift/s_filtergrid.cu b/src/popsift/s_filtergrid.cu
index 37238252..bfe2e64e 100644
--- a/src/popsift/s_filtergrid.cu
+++ b/src/popsift/s_filtergrid.cu
@@ -5,27 +5,30 @@
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
  */
-#include "sift_pyramid.h"
+#include "sift_config.h"
 #include "sift_extremum.h"
+#include "sift_pyramid.h"
 
-#ifdef USE_NVTX
-#include <nvToolsExtCuda.h>
+#if POPSIFT_IS_DEFINED(POPSIFT_USE_NVTX)
+#include <nvtx3/nvToolsExtCuda.h>
 #else
 #define nvtxRangePushA(a)
 #define nvtxRangePop()
 #endif
 
-using namespace std;
-
-#if (__CUDACC_VER__ >= 80000) && not defined(DISABLE_GRID_FILTER)
+#if ! POPSIFT_IS_DEFINED(POPSIFT_DISABLE_GRID_FILTER)
 
+#include <thrust/copy.h>
+#include <thrust/count.h>
 #include <thrust/device_vector.h>
+#include <thrust/execution_policy.h>
+#include <thrust/host_vector.h>
+#include <thrust/iterator/constant_iterator.h>
+#include <thrust/iterator/discard_iterator.h>
 #include <thrust/sequence.h>
-#include <thrust/copy.h>
+#include <thrust/sort.h>
 #include <thrust/transform.h>
 #include <thrust/transform_scan.h>
-#include <thrust/sort.h>
-#include <thrust/execution_policy.h>
 
 namespace popsift
 {
@@ -67,22 +70,10 @@ struct FunctionExtractCell
     }
 };
 
-struct FunctionReversePosition
-{
-    const int _total;
-    FunctionReversePosition( int total ) : _total(total) { }
-
-    __host__ __device__
-    inline int operator()(int val) const
-    {
-        return _total - val - 1;
-    }
-};
-
 struct FunctionIsAbove
 {
     int _limit;
-    FunctionIsAbove( int limit ) : _limit(limit) { }
+    explicit FunctionIsAbove( int limit ) : _limit(limit) { }
 
     __host__ __device__
     inline bool operator()( int val ) const
@@ -239,12 +230,14 @@ int Pyramid::extrema_filter_grid( const Config& conf, int ext_total )
     // inclusive prefix sum
     thrust::inclusive_scan( h_cell_counts.begin(), h_cell_counts.end(), cell_count_prefix_sums.begin() );
 
-    FunctionReversePosition fun_reverse_pos( n );
+    thrust::host_vector<int> h_reverse_index(n);
+    thrust::sequence( h_reverse_index.begin(), h_reverse_index.end(),
+                      n-1,
+                      -1 );
 
     // sumup[i] = prefix sum[i] + sum( cell[i] copied into remaining cells )
     thrust::transform( h_cell_counts.begin(), h_cell_counts.end(),
-                       thrust::make_transform_iterator( thrust::make_counting_iterator<int>(0),
-                                                        fun_reverse_pos ),
+                       h_reverse_index.begin(),
                        cell_count_sumup.begin(),
                        thrust::multiplies<int>() );
     thrust::transform( cell_count_sumup.begin(), cell_count_sumup.end(),
@@ -332,7 +325,7 @@ int Pyramid::extrema_filter_grid( const Config& conf, int ext_total )
 }
 }; // namespace popsift
 
-#else // (__CUDACC_VER__ >= 80000) && not defined(DISABLE_GRID_FILTER)
+#else // not defined(DISABLE_GRID_FILTER)
 
 namespace popsift
 {
@@ -344,5 +337,5 @@ int Pyramid::extrema_filter_grid( const Config& conf, int ext_total )
 }
 }; // namespace popsift
 
-#endif // (__CUDACC_VER__ >= 80000) && not defined(DISABLE_GRID_FILTER)
+#endif // not defined(DISABLE_GRID_FILTER)
 
diff --git a/src/popsift/s_gradiant.h b/src/popsift/s_gradiant.h
index adc912d2..aaec9e2d 100644
--- a/src/popsift/s_gradiant.h
+++ b/src/popsift/s_gradiant.h
@@ -7,13 +7,13 @@
  */
 #pragma once
 
-#include <stdio.h>
-#include <inttypes.h>
-
-#include "common/plane_2d.h"
 #include "common/assist.h"
+#include "common/plane_2d.h"
 #include "sift_constants.h"
 
+#include <cinttypes>
+#include <cstdio>
+
 namespace popsift
 {
 /*
diff --git a/src/popsift/s_image.cu b/src/popsift/s_image.cu
index 8970b5cf..a966dd39 100755
--- a/src/popsift/s_image.cu
+++ b/src/popsift/s_image.cu
@@ -5,16 +5,18 @@
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
  */
+#include "common/assist.h"
+#include "common/debug_macros.h"
 #include "s_image.h"
-#include <iostream>
+#include "sift_config.h"
+
+#include <cassert>
+#include <cstdio>
 #include <fstream>
-#include "common/debug_macros.h"
-#include "common/assist.h"
-#include <stdio.h>
-#include <assert.h>
+#include <iostream>
 
-#ifdef USE_NVTX
-#include <nvToolsExtCuda.h>
+#if POPSIFT_IS_DEFINED(POPSIFT_USE_NVTX)
+#include <nvtx3/nvToolsExtCuda.h>
 #else
 #define nvtxRangePushA(a)
 #define nvtxRangePop()
@@ -40,10 +42,6 @@ ImageBase::ImageBase( int w, int h )
 {
 }
 
-ImageBase::~ImageBase( )
-{
-}
-
 /*************************************************************
  * Image
  *************************************************************/
@@ -74,7 +72,7 @@ void Image::load( void* input )
      * is in CUDA-allocated pinned host memory, which makes the H2D copy
      * much faster.
      */
-    memcpy( _input_image_h.data, input, _w*_h );
+    memcpy( _input_image_h.data, input, _w*_h ); // assume that host Plane2D has no pitch
     _input_image_h.memcpyToDevice( _input_image_d );
 }
 
@@ -94,8 +92,8 @@ void Image::resetDimensions( int w, int h )
     _h = h;
 
     if( w <= _max_w && h <= _max_h ) {
-        _input_image_h.resetDimensions( w, h );
-        _input_image_d.resetDimensions( w, h );
+        _input_image_h.resetDimensionsHost( w, h );
+        _input_image_d.resetDimensionsDev( w, h );
 
         destroyTexture( );
         createTexture( );
@@ -108,8 +106,8 @@ void Image::resetDimensions( int w, int h )
         _input_image_d.freeDev( );
         _input_image_h.allocHost( _max_w, _max_h, popsift::CudaAllocated );
         _input_image_d.allocDev(  _max_w, _max_h );
-        _input_image_h.resetDimensions( w, h );
-        _input_image_d.resetDimensions( w, h );
+        _input_image_h.resetDimensionsHost( w, h );
+        _input_image_d.resetDimensionsDev( w, h );
 
         destroyTexture( );
         createTexture( );
@@ -159,7 +157,7 @@ void Image::createTexture( )
     _input_image_resDesc.res.pitch2D.desc.z       = 0;
     _input_image_resDesc.res.pitch2D.desc.w       = 0;
     assert( _input_image_d.elemSize() == 1 );
-    _input_image_resDesc.res.pitch2D.pitchInBytes = _input_image_d.step;
+    _input_image_resDesc.res.pitch2D.pitchInBytes = _input_image_d.getPitchInBytes();
     _input_image_resDesc.res.pitch2D.width        = _input_image_d.getCols();
     _input_image_resDesc.res.pitch2D.height       = _input_image_d.getRows();
 
@@ -198,7 +196,7 @@ void ImageFloat::load( void* input )
      * is in CUDA-allocated pinned host memory, which makes the H2D copy
      * much faster.
      */
-    memcpy( _input_image_h.data, input, _w*_h*sizeof(float) );
+    memcpy( _input_image_h.data, input, _w*_h*sizeof(float) ); // assume that host Plane2D has no pitch
     _input_image_h.memcpyToDevice( _input_image_d );
 }
 
@@ -218,8 +216,8 @@ void ImageFloat::resetDimensions( int w, int h )
     _h = h;
 
     if( w <= _max_w && h <= _max_h ) {
-        _input_image_h.resetDimensions( w, h );
-        _input_image_d.resetDimensions( w, h );
+        _input_image_h.resetDimensionsHost( w, h );
+        _input_image_d.resetDimensionsDev( w, h );
 
         destroyTexture( );
         createTexture( );
@@ -232,8 +230,8 @@ void ImageFloat::resetDimensions( int w, int h )
         _input_image_d.freeDev( );
         _input_image_h.allocHost( _max_w, _max_h, popsift::CudaAllocated );
         _input_image_d.allocDev(  _max_w, _max_h );
-        _input_image_h.resetDimensions( w, h );
-        _input_image_d.resetDimensions( w, h );
+        _input_image_h.resetDimensionsHost( w, h );
+        _input_image_d.resetDimensionsDev( w, h );
 
         destroyTexture( );
         createTexture( );
@@ -283,7 +281,7 @@ void ImageFloat::createTexture( )
     _input_image_resDesc.res.pitch2D.desc.z       = 0;
     _input_image_resDesc.res.pitch2D.desc.w       = 0;
     assert( _input_image_d.elemSize() == 4 );
-    _input_image_resDesc.res.pitch2D.pitchInBytes = _input_image_d.step; /* the step in Plane2D is in bytes */
+    _input_image_resDesc.res.pitch2D.pitchInBytes = _input_image_d.getPitchInBytes();
     _input_image_resDesc.res.pitch2D.width        = _input_image_d.getCols();
     _input_image_resDesc.res.pitch2D.height       = _input_image_d.getRows();
 
diff --git a/src/popsift/s_image.h b/src/popsift/s_image.h
index dec15a29..64d8c576 100755
--- a/src/popsift/s_image.h
+++ b/src/popsift/s_image.h
@@ -7,10 +7,11 @@
  */
 #pragma once
 
-#include <stdint.h>
 #include "common/plane_2d.h"
 #include "sift_conf.h"
 
+#include <cstdint>
+
 namespace popsift {
 
 /*************************************************************
@@ -24,9 +25,9 @@ struct ImageBase
     /** Create a device-sided buffer of the given dimensions */
     ImageBase( int w, int h );
 
-    virtual ~ImageBase( );
+    virtual ~ImageBase( ) = default;
 
-    /** Reallocation that takes care of pitch/step when new dimensions
+    /** Reallocation that takes care of pitch when new dimensions
      *  are smaller and actually reallocation when they are bigger.
      */
     virtual void resetDimensions( int w, int h ) = 0;
@@ -58,9 +59,9 @@ struct ImageBase
     int _max_h; // allocated height of image
 
     /* Texture information for input image on device */
-    cudaTextureObject_t _input_image_tex;
-    cudaTextureDesc     _input_image_texDesc;
-    cudaResourceDesc    _input_image_resDesc;
+    cudaTextureObject_t _input_image_tex{};
+    cudaTextureDesc     _input_image_texDesc{};
+    cudaResourceDesc    _input_image_resDesc{};
 };
 
 /*************************************************************
@@ -74,12 +75,12 @@ struct Image : public ImageBase
     /** Create a device-sided buffer of the given dimensions */
     Image( int w, int h );
 
-    virtual ~Image( );
+    ~Image( ) override;
 
-    /** Reallocation that takes care of pitch/step when new dimensions
+    /** Reallocation that takes care of pitch when new dimensions
      *  are smaller and actually reallocation when they are bigger.
      */
-    virtual void resetDimensions( int w, int h );
+    void resetDimensions( int w, int h ) override;
 
     /* This loading function copies all image data to a local
      * buffer that is pinned in memory. We should offer two
@@ -87,12 +88,12 @@ struct Image : public ImageBase
      * if the image is already uploaded, and one that takes
      * an image in pinned memory.
      */
-    virtual void load( void* input );
+    void load( void* input ) override;
 
 private:
-    void allocate( int w, int h );
-    void createTexture( );
-    void destroyTexture( );
+    void allocate( int w, int h ) override;
+    void createTexture( ) override;
+    void destroyTexture( ) override;
 
 private:
     /* 2D plane holding input image on host for uploading
@@ -114,12 +115,12 @@ struct ImageFloat : public ImageBase
     /** Create a device-sided buffer of the given dimensions */
     ImageFloat( int w, int h );
 
-    virtual ~ImageFloat( );
+    ~ImageFloat( ) override;
 
-    /** Reallocation that takes care of pitch/step when new dimensions
+    /** Reallocation that takes care of pitch when new dimensions
      *  are smaller and actually reallocation when they are bigger.
      */
-    virtual void resetDimensions( int w, int h );
+    void resetDimensions( int w, int h ) override;
 
     /* This loading function copies all image data to a local
      * buffer that is pinned in memory. We should offer two
@@ -127,12 +128,12 @@ struct ImageFloat : public ImageBase
      * if the image is already uploaded, and one that takes
      * an image in pinned memory.
      */
-    virtual void load( void* input );
+    void load( void* input ) override;
 
 private:
-    void allocate( int w, int h );
-    void createTexture( );
-    void destroyTexture( );
+    void allocate( int w, int h ) override;
+    void createTexture( ) override;
+    void destroyTexture( ) override;
 
 private:
     /* 2D plane holding input image on host for uploading
diff --git a/src/popsift/s_orientation.cu b/src/popsift/s_orientation.cu
index 5929a537..b34aaaa1 100644
--- a/src/popsift/s_orientation.cu
+++ b/src/popsift/s_orientation.cu
@@ -5,20 +5,21 @@
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
  */
-#include <math.h>
-#include <stdio.h>
-#include <inttypes.h>
-
 #include "common/assist.h"
-#include "sift_pyramid.h"
-#include "sift_constants.h"
-#include "s_gradiant.h"
+#include "common/debug_macros.h"
 #include "common/excl_blk_prefix_sum.h"
 #include "common/warp_bitonic_sort.h"
-#include "common/debug_macros.h"
+#include "s_gradiant.h"
+#include "sift_config.h"
+#include "sift_constants.h"
+#include "sift_pyramid.h"
 
-#ifdef USE_NVTX
-#include <nvToolsExtCuda.h>
+#include <cinttypes>
+#include <cmath>
+#include <cstdio>
+
+#if POPSIFT_IS_DEFINED(POPSIFT_USE_NVTX)
+#include <nvtx3/nvToolsExtCuda.h>
 #else
 #define nvtxRangePushA(a)
 #define nvtxRangePop()
@@ -51,6 +52,21 @@ inline float compute_angle( int bin, float hc, float hn, float hp )
     return th;
 }
 
+/*
+ * Histogram smoothing helper: mean of src[bin] and its two wrap-around neighbours
+ */
+template<int D>
+__device__
+inline static float smoothe( const float* const src, const int bin )
+{
+    const int prev = (bin == 0) ? ORI_NBINS-1 : bin-1;
+    const int next = (bin == ORI_NBINS-1) ? 0 : bin+1;
+
+    const float f  = ( src[prev] + src[bin] + src[next] ) / 3.0f;
+
+    return f;
+}
+
 /*
  * Compute the keypoint orientations for each extremum
  * using 16 threads for each of them.
@@ -65,15 +81,18 @@ void ori_par( const int           octave,
 {
     const int extremum_index  = blockIdx.x * blockDim.y;
 
-    if( extremum_index >= dct.ext_ct[octave] ) return; // a few trailing warps
+    if( popsift::all( extremum_index >= dct.ext_ct[octave] ) ) return; // a few trailing warps
 
     const int              iext_off =  dobuf.i_ext_off[octave][extremum_index];
     const InitialExtremum* iext     = &dobuf.i_ext_dat[octave][iext_off];
 
-    __shared__ float hist   [ORI_NBINS];
-    __shared__ float sm_hist[ORI_NBINS];
+    __shared__ float hist         [64];
+    __shared__ float sm_hist      [64];
+    __shared__ float refined_angle[64];
+    __shared__ float yval         [64];
 
-    for( int i = threadIdx.x; i < ORI_NBINS; i += blockDim.x )  hist[i] = 0.0f;
+    hist[threadIdx.x+ 0] = 0.0f;
+    hist[threadIdx.x+32] = 0.0f;
 
     /* keypoint fractional geometry */
     const float x     = iext->xpos;
@@ -82,11 +101,11 @@ void ori_par( const int           octave,
     const float sig   = iext->sigma;
 
     /* orientation histogram radius */
-    float  sigw = ORI_WINFACTOR * sig;
-    int32_t rad  = (int)roundf((3.0f * sigw));
+    const float  sigw = ORI_WINFACTOR * sig;
+    const int32_t rad  = (int)roundf((3.0f * sigw));
 
-    float factor = __fdividef( -0.5f, (sigw * sigw) );
-    int sq_thres  = rad * rad;
+    const float factor = __fdividef( -0.5f, (sigw * sigw) );
+    const int sq_thres  = rad * rad;
 
     // int xmin = max(1,     (int)floor(x - rad));
     // int xmax = min(w - 2, (int)floor(x + rad));
@@ -101,7 +120,8 @@ void ori_par( const int           octave,
     int hy = ymax - ymin + 1;
     int loops = wx * hy;
 
-    for( int i = threadIdx.x; ::__any(i < loops); i += blockDim.x )
+    __syncthreads();
+    for( int i = threadIdx.x; popsift::any(i < loops); i += blockDim.x )
     {
         if( i < loops ) {
             int yy = i / wx + ymin;
@@ -120,7 +140,8 @@ void ori_par( const int           octave,
             float dy = yy - y;
 
             int sq_dist  = dx * dx + dy * dy;
-            if (sq_dist <= sq_thres) {
+            if (sq_dist <= sq_thres)
+            {
                 float weight = grad * expf(sq_dist * factor);
 
                 // int bidx = (int)rintf( __fdividef( ORI_NBINS * (theta + M_PI), M_PI2 ) );
@@ -129,33 +150,31 @@ void ori_par( const int           octave,
                 if( bidx > ORI_NBINS ) {
                     printf("Crashing: bin %d theta %f :-)\n", bidx, theta);
                 }
+                if( bidx < 0 ) {
+                    printf("Crashing: bin %d theta %f :-)\n", bidx, theta);
+                }
 
                 bidx = (bidx == ORI_NBINS) ? 0 : bidx;
 
                 atomicAdd( &hist[bidx], weight );
             }
         }
-        __syncthreads();
     }
+    __syncthreads();
 
 #ifdef WITH_VLFEAT_SMOOTHING
-    for( int i=0; i<3; i++ ) {
-        for( int bin = threadIdx.x; bin < ORI_NBINS; bin += blockDim.x ) {
-            int prev = bin == 0 ? ORI_NBINS-1 : bin-1;
-            int next = bin == ORI_NBINS-1 ? 0 : bin+1;
-            sm_hist[bin] = ( hist[prev] + hist[bin] + hist[next] ) / 3.0f;
-        }
+    for( int i=0; i<3 ; i++ )
+    {
+        sm_hist[threadIdx.x+ 0] = smoothe<0>( hist, threadIdx.x+ 0 );
+        sm_hist[threadIdx.x+32] = smoothe<1>( hist, threadIdx.x+32 );
         __syncthreads();
-        for( int bin = threadIdx.x; bin < ORI_NBINS; bin += blockDim.x ) {
-            int prev = bin == 0 ? ORI_NBINS-1 : bin-1;
-            int next = bin == ORI_NBINS-1 ? 0 : bin+1;
-            hist[bin] = ( sm_hist[prev] + sm_hist[bin] + sm_hist[next] ) / 3.0f;
-        }
+        hist[threadIdx.x+ 0]    = smoothe<2>( sm_hist, threadIdx.x+ 0 );
+        hist[threadIdx.x+32]    = smoothe<3>( sm_hist, threadIdx.x+32 );
         __syncthreads();
     }
-    for( int bin = threadIdx.x; bin < ORI_NBINS; bin += blockDim.x ) {
-        sm_hist[bin] = hist[bin];
-    }
+
+    sm_hist[threadIdx.x+ 0] = hist[threadIdx.x+ 0];
+    sm_hist[threadIdx.x+32] = hist[threadIdx.x+32];
     __syncthreads();
 #else // not WITH_VLFEAT_SMOOTHING
     for( int bin = threadIdx.x; bin < ORI_NBINS; bin += blockDim.x ) {
@@ -176,10 +195,8 @@ void ori_par( const int           octave,
 
     // sub-cell refinement of the histogram cell index, yielding the angle
     // not necessary to initialize, every cell is computed
-    __shared__ float refined_angle[64];
-    __shared__ float yval         [64];
 
-    for( int bin = threadIdx.x; ::__any( bin < ORI_NBINS ); bin += blockDim.x ) {
+    for( int bin = threadIdx.x; popsift::any( bin < ORI_NBINS ); bin += blockDim.x ) {
         const int prev = bin == 0 ? ORI_NBINS-1 : bin-1;
         const int next = bin == ORI_NBINS-1 ? 0 : bin+1;
 
@@ -202,6 +219,7 @@ void ori_par( const int           octave,
         refined_angle[bin] = predicate ? prev + newbin : -1;
         yval[bin]          = predicate ?  -(num*num) / (4.0f * denB) + sm_hist[prev] : -INFINITY;
     }
+    __syncthreads();
 
     int2 best_index = make_int2( threadIdx.x, threadIdx.x + 32 );
 
@@ -212,7 +230,7 @@ void ori_par( const int           octave,
     // All threads retrieve the yval of thread 0, the largest
     // of all yvals.
     const float best_val = yval[best_index.x];
-    const float yval_ref = 0.8f * __shfl( best_val, 0 );
+    const float yval_ref = 0.8f * popsift::shuffle( best_val, 0 );
     const bool  valid    = ( best_val >= yval_ref );
     bool        written  = false;
 
@@ -229,7 +247,7 @@ void ori_par( const int           octave,
         }
     }
 
-    int angles = __popc( __ballot( written ) );
+    int angles = __popc( popsift::ballot( written ) );
     if( threadIdx.x == 0 ) {
         ext->xpos    = iext->xpos;
         ext->ypos    = iext->ypos;
@@ -247,7 +265,7 @@ class ExtremaRead
     const Extremum* const _oris;
 public:
     inline __device__
-    ExtremaRead( const Extremum* const d_oris ) : _oris( d_oris ) { }
+    explicit ExtremaRead( const Extremum* const d_oris ) : _oris( d_oris ) { }
 
     inline __device__
     int get( int n ) const { return _oris[n].num_ori; }
@@ -258,7 +276,7 @@ class ExtremaWrt
     Extremum* _oris;
 public:
     inline __device__
-    ExtremaWrt( Extremum* d_oris ) : _oris( d_oris ) { }
+    explicit ExtremaWrt( Extremum* d_oris ) : _oris( d_oris ) { }
 
     inline __device__
     void set( int n, int value ) { _oris[n].idx_ori = value; }
@@ -269,7 +287,7 @@ class ExtremaTot
     int& _extrema_counter;
 public:
     inline __device__
-    ExtremaTot( int& extrema_counter ) : _extrema_counter( extrema_counter ) { }
+    explicit ExtremaTot( int& extrema_counter ) : _extrema_counter( extrema_counter ) { }
 
     inline __device__
     void set( int value ) { _extrema_counter = value; }
@@ -346,15 +364,14 @@ void ori_prefix_sum( const int total_ext_ct, const int num_octaves )
 __host__
 void Pyramid::orientation( const Config& conf )
 {
-    nvtxRangePushA( "reading extrema count" );
     readDescCountersFromDevice( );
-    nvtxRangePop( );
 
-    nvtxRangePushA( "filtering grid" );
     int ext_total = 0;
-    for( int o=0; o<MAX_OCTAVES; o++ ) {
-        if( hct.ext_ct[o] > 0 ) {
-            ext_total += hct.ext_ct[o];
+    for(int o : hct.ext_ct)
+    {
+        if( o > 0 )
+        {
+            ext_total += o;
         }
     }
 
@@ -364,11 +381,8 @@ void Pyramid::orientation( const Config& conf )
     {
         ext_total = extrema_filter_grid( conf, ext_total );
     }
-    nvtxRangePop( );
 
-    nvtxRangePushA( "reallocating extrema arrays" );
     reallocExtrema( ext_total );
-    nvtxRangePop( );
 
     int ext_ct_prefix_sum = 0;
     for( int octave=0; octave<_num_octaves; octave++ ) {
@@ -397,7 +411,7 @@ void Pyramid::orientation( const Config& conf )
             grid.x  = num;
 
             ori_par
-                <<<grid,block,0,oct_str>>>
+                <<<grid,block,4*64*sizeof(float),oct_str>>>
                 ( octave,
                   hct.ext_ps[octave],
                   oct_obj.getDataTexPoint( ),
diff --git a/src/popsift/s_pyramid_build.cu b/src/popsift/s_pyramid_build.cu
index 4e28dbd2..8873ca5c 100755
--- a/src/popsift/s_pyramid_build.cu
+++ b/src/popsift/s_pyramid_build.cu
@@ -5,18 +5,18 @@
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
  */
-#include "sift_pyramid.h"
-#include "sift_constants.h"
-#include "gauss_filter.h"
-#include "common/debug_macros.h"
 #include "common/assist.h"
 #include "common/clamp.h"
+#include "common/debug_macros.h"
+#include "gauss_filter.h"
 #include "s_pyramid_build_aa.h"
 #include "s_pyramid_build_ai.h"
 #include "s_pyramid_build_ra.h"
+#include "sift_constants.h"
+#include "sift_pyramid.h"
 
+#include <cstdio>
 #include <iostream>
-#include <stdio.h>
 
 /* It makes no sense whatsoever to change this value */
 #define PREV_LEVEL 3
@@ -499,10 +499,8 @@ void Pyramid::build_pyramid( const Config& conf, ImageBase* base )
         } else if( conf.getScalingMode() == Config::ScaleDirect ) {
             GaussTableChoice useGauss = ( conf.getGaussMode() == Config::VLFeat_Relative ) ? Interpolated_FromPrevious
                                                                                            : NotInterpolated_FromPrevious;
-            for( int level=0; level<_levels; level++ ) {
-                const int width  = oct_obj.getWidth();
-                const int height = oct_obj.getHeight();
-
+            for( int level=0; level<_levels; level++ )
+            {
                 if( level == 0 )
                 {
                     horiz_from_input_image( conf, base, octave, stream );
diff --git a/src/popsift/s_pyramid_build_aa.cu b/src/popsift/s_pyramid_build_aa.cu
index 36720b49..c026a8b7 100755
--- a/src/popsift/s_pyramid_build_aa.cu
+++ b/src/popsift/s_pyramid_build_aa.cu
@@ -5,19 +5,16 @@
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
  */
+#include "common/assist.h"
+#include "gauss_filter.h"
 #include "s_pyramid_build_aa.h"
 #include "sift_constants.h"
-#include "gauss_filter.h"
-#include "common/assist.h"
 
 namespace popsift {
 namespace gauss {
 namespace absoluteSource {
 
-__global__
-void horiz( cudaTextureObject_t src_point_texture,
-            cudaSurfaceObject_t dst_data,
-            const int           dst_level )
+__global__ void horiz(cudaTextureObject_t src_point_texture, cudaSurfaceObject_t dst_data, int dst_level)
 {
     const int    src_level = dst_level - 1;
     const int    span      =  d_gauss.inc.span[dst_level];
@@ -39,11 +36,11 @@ void horiz( cudaTextureObject_t src_point_texture,
     int shiftval = 0;
     for( int offset=span-1; offset>0; offset-- ) {
         shiftval += 1;
-        const float D1 = __shfl_down( A, shiftval );
-        const float D2 = __shfl_up  ( C, span - shiftval );
+        const float D1 = popsift::shuffle_down( A, shiftval );
+        const float D2 = popsift::shuffle_up  ( C, span - shiftval );
         const float D  = threadIdx.x < (32 - shiftval) ? D1 : D2;
-        const float E1 = __shfl_up  ( B, shiftval );
-        const float E2 = __shfl_down( C, span - shiftval );
+        const float E1 = popsift::shuffle_up  ( B, shiftval );
+        const float E2 = popsift::shuffle_down( C, span - shiftval );
         const float E  = threadIdx.x > shiftval        ? E1 : E2;
         g = filter[offset];
         out += ( D + E ) * g;
@@ -52,10 +49,7 @@ void horiz( cudaTextureObject_t src_point_texture,
     surf2DLayeredwrite( out, dst_data, off_x*4, off_y, dst_level, cudaBoundaryModeZero );
 }
 
-__global__
-void vert( cudaTextureObject_t src_point_texture,
-           cudaSurfaceObject_t dst_data,
-           const int           dst_level )
+__global__ void vert(cudaTextureObject_t src_point_texture, cudaSurfaceObject_t dst_data, int dst_level)
 {
     const int    span   =  d_gauss.inc.span[dst_level];
     const float* filter = &d_gauss.inc.filter[dst_level*GAUSS_ALIGN];
@@ -91,10 +85,7 @@ void vert( cudaTextureObject_t src_point_texture,
     surf2DLayeredwrite( out, dst_data, idx*4, idy, dst_level, cudaBoundaryModeZero );
 }
 
-__global__
-void vert_abs0( cudaTextureObject_t src_point_texture,
-           cudaSurfaceObject_t dst_data,
-           const int           dst_level )
+__global__ void vert_abs0(cudaTextureObject_t src_point_texture, cudaSurfaceObject_t dst_data, int dst_level)
 {
     const int    span   =  d_gauss.abs_o0.span[dst_level];
     const float* filter = &d_gauss.abs_o0.filter[dst_level*GAUSS_ALIGN];
@@ -130,11 +121,10 @@ void vert_abs0( cudaTextureObject_t src_point_texture,
     surf2DLayeredwrite( out, dst_data, idx*4, idy, dst_level, cudaBoundaryModeZero );
 }
 
-__global__
-void vert_all_abs0( cudaTextureObject_t src_point_texture,
-                    cudaSurfaceObject_t dst_data,
-                    const int           start_level,
-                    const int           max_level )
+__global__ void vert_all_abs0(cudaTextureObject_t src_point_texture,
+                              cudaSurfaceObject_t dst_data,
+                              int start_level,
+                              int max_level)
 {
     const int block_x = blockIdx.x * blockDim.x;
     const int block_y = blockIdx.y * blockDim.y;
diff --git a/src/popsift/s_pyramid_build_aa.h b/src/popsift/s_pyramid_build_aa.h
index 5a0baf19..4d3423cf 100755
--- a/src/popsift/s_pyramid_build_aa.h
+++ b/src/popsift/s_pyramid_build_aa.h
@@ -11,26 +11,16 @@ namespace popsift {
 namespace gauss {
 namespace absoluteSource {
 
-__global__
-void horiz( cudaTextureObject_t src_point_texture,
-            cudaSurfaceObject_t dst_data,
-            const int           dst_level );
+__global__ void horiz(cudaTextureObject_t src_point_texture, cudaSurfaceObject_t dst_data, int dst_level);
 
-__global__
-void vert( cudaTextureObject_t src_point_texture,
-           cudaSurfaceObject_t dst_data,
-           const int           dst_level );
+__global__ void vert(cudaTextureObject_t src_point_texture, cudaSurfaceObject_t dst_data, int dst_level);
 
-__global__
-void vert_abs0( cudaTextureObject_t src_point_texture,
-           cudaSurfaceObject_t dst_data,
-           const int           dst_level );
+__global__ void vert_abs0(cudaTextureObject_t src_point_texture, cudaSurfaceObject_t dst_data, int dst_level);
 
-__global__
-void vert_all_abs0( cudaTextureObject_t src_point_texture,
-                    cudaSurfaceObject_t dst_data,
-                    const int           start_level,
-                    const int           max_level );
+__global__ void vert_all_abs0(cudaTextureObject_t src_point_texture,
+                              cudaSurfaceObject_t dst_data,
+                              int start_level,
+                              int max_level);
 
 } // namespace absoluteSource
 } // namespace gauss
diff --git a/src/popsift/s_pyramid_build_ai.cu b/src/popsift/s_pyramid_build_ai.cu
index ff379d48..c16d636e 100755
--- a/src/popsift/s_pyramid_build_ai.cu
+++ b/src/popsift/s_pyramid_build_ai.cu
@@ -5,19 +5,16 @@
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
  */
+#include "common/assist.h"
+#include "gauss_filter.h"
 #include "s_pyramid_build_aa.h"
 #include "sift_constants.h"
-#include "gauss_filter.h"
-#include "common/assist.h"
 
 namespace popsift {
 namespace gauss {
 namespace absoluteSourceInterpolated {
 
-__global__
-void horiz( cudaTextureObject_t src_linear_tex,
-            cudaSurfaceObject_t dst_data,
-            const int           dst_level )
+__global__ void horiz(cudaTextureObject_t src_linear_tex, cudaSurfaceObject_t dst_data, int dst_level)
 {
     const int    src_level = dst_level - 1;
     const int    span      =  d_gauss.inc.i_span[dst_level];
@@ -43,10 +40,7 @@ void horiz( cudaTextureObject_t src_linear_tex,
     surf2DLayeredwrite( out, dst_data, off_x*4, blockIdx.y, dst_level, cudaBoundaryModeZero );
 }
 
-__global__
-void vert( cudaTextureObject_t src_linear_tex,
-           cudaSurfaceObject_t dst_data,
-           const int           dst_level )
+__global__ void vert(cudaTextureObject_t src_linear_tex, cudaSurfaceObject_t dst_data, int dst_level)
 {
     const int    span   =  d_gauss.inc.i_span[dst_level];
     const float* filter = &d_gauss.inc.i_filter[dst_level*GAUSS_ALIGN];
@@ -74,10 +68,7 @@ void vert( cudaTextureObject_t src_linear_tex,
     surf2DLayeredwrite( out, dst_data, (block_x+idx)*4, block_y+idy, dst_level, cudaBoundaryModeZero );
 }
 
-__global__
-void vert_abs0( cudaTextureObject_t src_linear_tex,
-                cudaSurfaceObject_t dst_data,
-                const int           dst_level )
+__global__ void vert_abs0(cudaTextureObject_t src_linear_tex, cudaSurfaceObject_t dst_data, int dst_level)
 {
     const int    span   =  d_gauss.abs_o0.i_span[dst_level];
     const float* filter = &d_gauss.abs_o0.i_filter[dst_level*GAUSS_ALIGN];
@@ -105,11 +96,10 @@ void vert_abs0( cudaTextureObject_t src_linear_tex,
     surf2DLayeredwrite( out, dst_data, (block_x+idx)*4, block_y+idy, dst_level, cudaBoundaryModeZero );
 }
 
-__global__
-void vert_all_abs0( cudaTextureObject_t src_linear_tex,
-                    cudaSurfaceObject_t dst_data,
-                    const int           start_level,
-                    const int           max_level )
+__global__ void vert_all_abs0(cudaTextureObject_t src_linear_tex,
+                              cudaSurfaceObject_t dst_data,
+                              int start_level,
+                              int max_level)
 {
     const int block_x = blockIdx.x * blockDim.y;
     const int block_y = blockIdx.y * blockDim.x;
diff --git a/src/popsift/s_pyramid_build_ai.h b/src/popsift/s_pyramid_build_ai.h
index 62230f9d..d3431fe7 100755
--- a/src/popsift/s_pyramid_build_ai.h
+++ b/src/popsift/s_pyramid_build_ai.h
@@ -11,26 +11,16 @@ namespace popsift {
 namespace gauss {
 namespace absoluteSourceInterpolated {
 
-__global__
-void horiz( cudaTextureObject_t src_linear_tex,
-            cudaSurfaceObject_t dst_data,
-            const int           dst_level );
+__global__ void horiz(cudaTextureObject_t src_linear_tex, cudaSurfaceObject_t dst_data, int dst_level);
 
-__global__
-void vert( cudaTextureObject_t src_linear_tex,
-           cudaSurfaceObject_t dst_data,
-           const int           dst_level );
+__global__ void vert(cudaTextureObject_t src_linear_tex, cudaSurfaceObject_t dst_data, int dst_level);
 
-__global__
-void vert_abs0( cudaTextureObject_t src_linear_tex,
-           cudaSurfaceObject_t dst_data,
-           const int           dst_level );
+__global__ void vert_abs0(cudaTextureObject_t src_linear_tex, cudaSurfaceObject_t dst_data, int dst_level);
 
-__global__
-void vert_all_abs0( cudaTextureObject_t src_linear_tex,
-                    cudaSurfaceObject_t dst_data,
-                    const int           start_level,
-                    const int           max_level );
+__global__ void vert_all_abs0(cudaTextureObject_t src_linear_tex,
+                              cudaSurfaceObject_t dst_data,
+                              int start_level,
+                              int max_level);
 
 } // namespace absoluteSourceInterpolated
 } // namespace gauss
diff --git a/src/popsift/s_pyramid_build_ra.cu b/src/popsift/s_pyramid_build_ra.cu
index db76f218..2b32e62c 100755
--- a/src/popsift/s_pyramid_build_ra.cu
+++ b/src/popsift/s_pyramid_build_ra.cu
@@ -5,10 +5,10 @@
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
  */
+#include "common/assist.h"
+#include "gauss_filter.h"
 #include "s_pyramid_build_ra.h"
 #include "sift_constants.h"
-#include "gauss_filter.h"
-#include "common/assist.h"
 
 namespace popsift {
 namespace gauss {
diff --git a/src/popsift/s_pyramid_build_ra.h b/src/popsift/s_pyramid_build_ra.h
index 7ff33519..0b628bc4 100755
--- a/src/popsift/s_pyramid_build_ra.h
+++ b/src/popsift/s_pyramid_build_ra.h
@@ -11,30 +11,23 @@ namespace popsift {
 namespace gauss {
 namespace normalizedSource {
 
-__global__
-void horiz( cudaTextureObject_t src_data,
-            cudaSurfaceObject_t dst_data,
-            const int           dst_w,
-            const int           dst_h,
-            int                 octave,
-            float               shift );
+__global__ void horiz(cudaTextureObject_t src_data,
+                      cudaSurfaceObject_t dst_data,
+                      int dst_w,
+                      int dst_h,
+                      int octave,
+                      float shift);
 
-__global__
-void horiz_level( cudaTextureObject_t src_linear_tex,
-                  cudaSurfaceObject_t dst_data,
-                  int                 dst_w,
-                  int                 dst_h,
-                  int                 /* octave - must be 0 */,
-                  int                 level,
-                  float               shift );
+__global__ void horiz_level(cudaTextureObject_t src_linear_tex,
+                            cudaSurfaceObject_t dst_data,
+                            int dst_w,
+                            int dst_h,
+                            int /* octave - must be 0 */,
+                            int level,
+                            float shift);
 
-__global__
-void horiz_all( cudaTextureObject_t src_linear_tex,
-                cudaSurfaceObject_t dst_data,
-                int                 dst_w,
-                int                 dst_h,
-                float               shift,
-                const int           max_level );
+__global__ void horiz_all(
+  cudaTextureObject_t src_linear_tex, cudaSurfaceObject_t dst_data, int dst_w, int dst_h, float shift, int max_level);
 
 } // namespace normalizedSource
 } // namespace gauss
diff --git a/src/popsift/s_pyramid_fixed.cu b/src/popsift/s_pyramid_fixed.cu
index 87816be0..9e3d52aa 100755
--- a/src/popsift/s_pyramid_fixed.cu
+++ b/src/popsift/s_pyramid_fixed.cu
@@ -13,7 +13,7 @@
 #include "common/clamp.h"
 
 #include <iostream>
-#include <stdio.h>
+#include <cstdio>
 
 namespace popsift {
 
@@ -33,11 +33,11 @@ inline float octave_fixed_horiz( float fval, const float* filter )
     float out = fval * filter[0];
     #pragma unroll
     for( int i=1; i<=SHIFT; i++ ) {
-        float val  = __shfl_up( fval, i ) + __shfl_down( fval, i );
+        float val  = popsift::shuffle_up( fval, i ) + popsift::shuffle_down( fval, i );
         out += val * filter[i];
     }
 
-    fval = __shfl_down( out, SHIFT );
+    fval = popsift::shuffle_down( out, SHIFT );
 
     return fval;
 }
diff --git a/src/popsift/s_solve.h b/src/popsift/s_solve.h
index c6edd5b9..b367f38e 100755
--- a/src/popsift/s_solve.h
+++ b/src/popsift/s_solve.h
@@ -17,7 +17,8 @@
 #undef USE_GAUSSIAN_ELIMINATION
 
 #include <cuda_runtime.h>
-#include <stdio.h>
+
+#include <cstdio>
 
 #ifndef USE_GAUSSIAN_ELIMINATION
 
diff --git a/src/popsift/sift_conf.cu b/src/popsift/sift_conf.cu
index 783eb402..251f58ff 100644
--- a/src/popsift/sift_conf.cu
+++ b/src/popsift/sift_conf.cu
@@ -5,9 +5,10 @@
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
  */
-#include <iostream>
-#include "sift_conf.h"
 #include "common/debug_macros.h"
+#include "sift_conf.h"
+
+#include <iostream>
 
 using namespace std;
 
@@ -121,7 +122,7 @@ const char* Config::getGaussModeUsage( )
 
 bool Config::getCanFilterExtrema() const
 {
-#if __CUDACC_VER__ >= 80000
+#if __CUDACC_VER_MAJOR__ >= 8
     return true;
 #else
     return false;
diff --git a/src/popsift/sift_conf.h b/src/popsift/sift_conf.h
index fd26ac7c..583a958c 100644
--- a/src/popsift/sift_conf.h
+++ b/src/popsift/sift_conf.h
@@ -8,21 +8,35 @@
 #pragma once
 
 #include <string>
-#include <iso646.h>
 
 #define MAX_OCTAVES   20
 #define MAX_LEVELS    10
 
 #undef USE_DOG_TEX_LINEAR
 
-namespace popsift
-{
+#ifdef _MSC_VER // DEPRECATED(func): declare func so that the compiler warns when it is used
+#define DEPRECATED(func) __declspec(deprecated) func
+#elif defined(__GNUC__) || defined(__clang__)
+#define DEPRECATED(func) func __attribute__ ((deprecated))
+#else // unknown compiler: no deprecation attribute available, keep the plain declaration
+#define DEPRECATED(func) func
+#endif
+namespace popsift {
 
+/**
+ * @brief Struct containing the parameters that control the extraction algorithm
+ */
 struct Config
 {
-    Config( );
+    Config();
 
-    enum GaussMode {
+    /**
+     * @brief The way the Gaussian mode is computed.
+     *
+     * Each setting makes it possible to mimic and reproduce the behaviour of other Sift implementations.
+     */
+    enum GaussMode
+    {
         VLFeat_Compute,
         VLFeat_Relative,
         VLFeat_Relative_All,
@@ -31,68 +45,145 @@ struct Config
         Fixed15
     };
 
-    enum SiftMode {
+    /**
+     * @brief General setting to reproduce the results of other Sift implementations.
+     */
+    enum SiftMode
+    {
+        /// Popsift implementation
         PopSift,
+        /// OpenCV implementation
         OpenCV,
+        /// VLFeat implementation
         VLFeat,
+        /// Default implementation is PopSift
         Default = PopSift
     };
 
-    enum LogMode {
+    /**
+     * @brief The logging mode.
+     */
+    enum LogMode
+    {
         None,
         All
     };
 
-    enum ScalingMode {
+    /**
+     * @brief The scaling mode.
+     */
+    enum ScalingMode
+    {
         ScaleDirect,
-        ScaleDefault // Indirect - only working method
+        /// Indirect - only working method
+        ScaleDefault
     };
 
-    /* Modes for descriptor extraction: */
-    enum DescMode {
-        Loop,        // scan horizontal, extract valid points
-        ILoop,       // scan horizontal, extract valid points, interpolate with tex engine
-        Grid,        // scan in rotated mode, round pixel address
-        IGrid,       // scan in rotated mode, interpolate with tex engine
-        NoTile       // variant of IGrid, no duplicate gradiant fetching
+    /**
+     * @brief Modes for descriptor extraction.
+     */
+    enum DescMode
+    {
+        /// scan horizontal, extract valid points
+        Loop,
+        /// scan horizontal, extract valid points, interpolate with tex engine
+        ILoop,
+        /// scan in rotated mode, round pixel address
+        Grid,
+        /// scan in rotated mode, interpolate with tex engine
+        IGrid,
+        /// variant of IGrid, no duplicate gradient fetching
+        NoTile
     };
 
-    enum NormMode {
-        RootSift,   // The L1-inspired norm, gives better matching results
-        Classic     // The L2-inspired norm, all descriptors on a hypersphere
+    /**
+     * @brief Type of norm to use for matching.
+     */
+    enum NormMode
+    {
+        /// The L1-inspired norm, gives better matching results ("RootSift")
+        RootSift,
+        /// The L2-inspired norm, all descriptors on a hypersphere ("classic")
+        Classic
     };
 
-    /* To reduce time used in descriptor extraction, some extrema can be filtered
+    /**
+     * @brief Filtering strategy.
+     * 
+     * To reduce time used in descriptor extraction, some extrema can be filtered
      * immediately after finding them. It is possible to keep those with the largest
      * scale (LargestScaleFirst), smallest scale (SmallestScaleFirst), or a random
      * selection. Note that largest and smallest give a stable result, random does not.
      */
     enum GridFilterMode {
+        /// keep a random selection
         RandomScale,
+        /// keep those with the largest scale
         LargestScaleFirst,
+        /// keep those with the smallest scale
         SmallestScaleFirst
     };
 
-    /* A parameter for the PopSift constructor. Determines which data is kept in
-     * the Job data structure after processing, which is downloaded to the host,
-     * which is invalidated.
+    /**
+     * @brief Processing mode. 
+     * 
+     * Determines which data is kept in the Job data structure after processing, which one is downloaded to the host,
+     * which one is invalidated.
      */
     enum ProcessingMode {
         ExtractingMode,
         MatchingMode
     };
 
+    /**
+     * @brief Set the Gaussian mode from string.
+     * @param[in] m The string version of the GaussMode
+     * @see GaussMode
+     */
     void setGaussMode( const std::string& m );
+    /**
+     * @brief Set the Gaussian mode.
+     * @param[in] m The Gaussian mode to use.
+     */
     void setGaussMode( GaussMode m );
+
+    /**
+     * @brief Set the Sift mode.
+     * @param[in] m The Sift mode
+     * @see SiftMode
+     */
     void setMode( SiftMode m );
+
+    /**
+     * @brief Set the log mode.
+     * @param mode The log mode.
+     * @see LogMode
+     */
     void setLogMode( LogMode mode = All );
     void setScalingMode( ScalingMode mode = ScaleDefault );
+
+    /**
+     * @brief Enable/disable verbose mode.
+     * @param[in] on Whether to display additional information.
+     */
     void setVerbose( bool on = true );
+
+    /**
+     * @brief Set the descriptor mode by string.
+     * @param[in] byname The string containing the descriptor mode.
+     * @see DescMode
+     */
     void setDescMode( const std::string& byname );
+
+    /**
+    * @brief Set the descriptor mode.
+    * @param[in] mode The descriptor mode.
+    * @see DescMode
+    */
     void setDescMode( DescMode mode = Loop );
 
-    void setGaussGroup( int groupsize );
-    int  getGaussGroup( ) const;
+//    void setGaussGroup( int groupsize );
+//    int  getGaussGroup( ) const;
 
     void setDownsampling( float v );
     void setOctaves( int v );
@@ -101,10 +192,9 @@ struct Config
     void setEdgeLimit( float v );
     void setThreshold( float v );
     void setInitialBlur( float blur );
-    void setMaxExtreme( int m );
+//    void setMaxExtreme( int m );
     void setPrintGaussTables( );
-    void setDPOrientation( bool on );
-    void setMaxExtrema( int extrema );
+//    void setDPOrientation( bool on );
     void setFilterMaxExtrema( int extrema );
     void setFilterGridSize( int sz );
     void setFilterSorting( const std::string& direction );
@@ -113,64 +203,80 @@ struct Config
     bool  hasInitialBlur( ) const;
     float getInitialBlur( ) const;
 
-    // computes the actual peak threshold depending on the threshold
-    // parameter and the non-augmented number of levels
+    /// computes the actual peak threshold depending on the threshold
+    /// parameter and the non-augmented number of levels
     float getPeakThreshold() const;
 
-    // print Gauss spans and tables?
+    /// print Gauss spans and tables?
     bool ifPrintGaussTables() const;
 
-    // What Gauss filter scan is desired?
+    /// What Gauss filter scan is desired?
     GaussMode getGaussMode( ) const;
 
-    // Call this from the constructor.
+    /// Call this from the constructor.
     static GaussMode getGaussModeDefault( );
 
+
     // Helper functions for the main program's usage string.
+    /**
+     * @brief Get a message with the strings to use for setting the values of \p GaussMode
+     * @return  A message with the list of strings
+     */
     static const char* getGaussModeUsage( );
 
-    // get the SIFT mode for more detailed sub-modes
+    /**
+     * @brief Get the SIFT mode for more detailed sub-modes
+     * @return The SiftMode
+     * @see SiftMode
+     */
     SiftMode getSiftMode() const;
 
-    // find out if we should print logging info or not
+    /// find out if we should print logging info or not
     LogMode getLogMode() const;
 
-    // The number of octaves is chosen freely. If not specified,
-    // it is: log_2( min(x,y) ) - 3 - start_sampling
+    /// The number of octaves is chosen freely. If not specified,
+    /// it is: log_2( min(x,y) ) - 3 - start_sampling
     int      octaves;
 
-    // The number of levels per octave. This is actually the
-    // number of inner DoG levels where we can search for
-    // feature points. The number of ...
-    //
-    // This is the non-augmented number of levels, meaning
-    // the this is not the number of gauss-filtered picture
-    // layers (which is levels+3), but the number of DoG
-    // layers in which we can search for extrema.
+    /// The number of levels per octave. This is actually the
+    /// number of inner DoG levels where we can search for
+    /// feature points. The number of ...
+    ///
+    /// This is the non-augmented number of levels, meaning
+    /// that this is not the number of gauss-filtered picture
+    /// layers (which is levels+3), but the number of DoG
+    /// layers in which we can search for extrema.
     int      levels;
     float    sigma;
 
-    // default edge_limit 16.0f from Celebrandil
-    // default edge_limit 10.0f from Bemap
+    /// default edge_limit 16.0f from Celebrandil
+    /// default edge_limit 10.0f from Bemap
     float    _edge_limit;
 
     /** Functions related to descriptor normalization: L2-like or RootSift
      */
     void               setNormMode( NormMode m );
     void               setNormMode( const std::string& m );
-    void               setNormNode( const std::string& m );
-    void               setUseRootSift( bool on ) __attribute__ ((deprecated));
+    /**
+     * @brief Set the normalization mode.
+     * @param[in] on Use RootSift (\p true) or the L2-norm (\p false).
+     * @deprecated
+     * @see NormMode
+     */
+    DEPRECATED(void    setUseRootSift( bool on ));
     bool               getUseRootSift( ) const;
     NormMode           getNormMode( NormMode m ) const;
     static NormMode    getNormModeDefault( ); // Call this from the constructor.
     static const char* getNormModeUsage( );  // Helper functions for the main program's usage string.
 
-    /** Functions related to descriptor normalization: multiply with a power of 2
+    /**
+     * @brief Functions related to descriptor normalization: multiply with a power of 2
      */
     int  getNormalizationMultiplier( ) const;
     void setNormalizationMultiplier( int mul );
 
-    /* The input image is stretched by 2^upscale_factor
+    /**
+     * @brief The input image is stretched by 2^upscale_factor
      * before processing. The factor 1 is default.
      */
     inline float getUpscaleFactor( ) const {
@@ -181,126 +287,124 @@ struct Config
         return _max_extrema;
     }
 
-    /* Filtering extrema is only possible when CUDA version is >= 8.0
-     * The reason is that we use Thrust. This allows runtime testing.
-     *
-     * Note: re-writing the filtering code is possible, either older
-     *       Thrust semantics, CUDA CUB or doing everything from scratch.
+    /**
+     * Have we enabled filtering? This is a compile time decision.
+     * The reason is that we use Thrust, which increases compile time
+     * considerably and can be deactivated at the CMake level when
+     * you work on something else.
      */
     bool getCanFilterExtrema() const;
 
-    /* Set the approximate number of extrema whose orientation and descriptor
+    /**
+     * Set the approximate number of extrema whose orientation and descriptor
      * should be computed. Default is -1, which sets the hard limit defined
      * by "number of octaves * getMaxExtrema()".
      */
-    int getFilterMaxExtrema( ) const {
-        return _filter_max_extrema;
-    }
+    int getFilterMaxExtrema() const { return _filter_max_extrema; }
 
-    /* To avoid that grid filtering happens only in a tiny piece of an image,
+    /**
+     * @brief Get the grid size for filtering.
+     *
+     * To avoid that grid filtering happens only in a tiny piece of an image,
      * the image is split into getFilterGridSize() X getFilterGridSize() tiles
      * and we allow getFilterMaxExtrema() / getFilterGridSize() extrema in
      * each tile.
      */
-    int getFilterGridSize( ) const {
-        return _filter_grid_size;
-    }
+    int getFilterGridSize() const { return _filter_grid_size; }
 
-    /* See enum GridFilterMode */
-    GridFilterMode getFilterSorting() const {
-        return _grid_filter_mode;
-    }
+    /**
+     * @brief Get the filtering mode.
+     * @return the filtering mode.
+     * @see GridFilterMode
+     */
+    GridFilterMode getFilterSorting() const { return _grid_filter_mode; }
 
-    // check if we use direct downscaling from input image
-    // for all octaves
-    inline ScalingMode getScalingMode() const {
-        return _scaling_mode;
-    }
+    /**
+     * @brief Get the scaling mode.
+     * @return the scaling mode.
+     * @see ScalingMode
+     */
+    inline ScalingMode getScalingMode() const { return _scaling_mode; }
 
-    inline DescMode getDescMode() const {
-        return _desc_mode;
-    }
+    /**
+     * @brief Get the descriptor extraction mode
+     * @return the descriptor extraction mode
+     * @see DescMode
+     */
+    inline DescMode getDescMode() const { return _desc_mode; }
 
     bool equal( const Config& other ) const;
 
 private:
-    // default threshold 0.0 default of vlFeat
-    // default threshold 5.0 / 256.0
-    // default threshold 15.0 / 256.0 - it seems our DoG is really small ???
-    // default threshold 5.0 from Celebrandil, not happening in our data
-    // default threshold 0.04 / (_levels-3.0) / 2.0f * 255
-    //                   from Bemap -> 1.69 (makes no sense)
+    /// default threshold 0.0 default of vlFeat
+    /// default threshold 5.0 / 256.0
+    /// default threshold 15.0 / 256.0 - it seems our DoG is really small ???
+    /// default threshold 5.0 from Celebrandil, not happening in our data
+    /// default threshold 0.04 / (_levels-3.0) / 2.0f * 255
+    ///                   from Bemap -> 1.69 (makes no sense)
     float    _threshold;
 
-    // determine the image format of the first octave
-    // relative to the input image's size (x,y) as follows:
-    // (x / 2^start_sampling, y / 2^start_sampling )
+    /// determine the image format of the first octave
+    /// relative to the input image's size (x,y) as follows:
+    /// (x / 2^start_sampling, y / 2^start_sampling )
     float    _upscale_factor;
 
-    // default LogMode::None
+    /// default LogMode::None
     LogMode  _log_mode;
 
-    // default: ScalingMode::DownscaledOctaves
+    /// default: ScalingMode::DownscaledOctaves
     ScalingMode _scaling_mode;
 
-    // default: DescMode::Loop
+    /// default: DescMode::Loop
     DescMode    _desc_mode;
 
-    // default: RandomScale
+    /// default: RandomScale
     GridFilterMode _grid_filter_mode;
 
 public:
     bool     verbose;
 
 private:
-    /* The number of initial extrema that can be discovered in an octave.
-     * This parameter changes memory requirements.
-     */
+    /// The number of initial extrema that can be discovered in an octave.
+    /// This parameter changes memory requirements.
     int _max_extrema;
 
-    /* The maximum number of extrema that are returned. There may be
-     * several descriptors for each extremum.
-     */
+    /// The maximum number of extrema that are returned. There may be
+    /// several descriptors for each extremum.
     int _filter_max_extrema;
 
-    // Used to achieve an approximation of _max_entrema
-    // Subdivide the image in this number of vertical and horizontal tiles,
-    // i.e. the grid is actually _grid_size X _grid_size tiles.
-    // default: 1
+    /// Used to achieve an approximation of _filter_max_extrema
+    /// Subdivide the image in this number of vertical and horizontal tiles,
+    /// i.e. the grid is actually _grid_size X _grid_size tiles.
+    /// default: 1
     int  _filter_grid_size;
 
-    /* Modes are computation according to VLFeat or OpenCV,
-     * or fixed size. Default is VLFeat mode.
-     */
+    /// Modes are computation according to VLFeat or OpenCV,
+    /// or fixed size. Default is VLFeat mode.
     GaussMode _gauss_mode;
 
-    /* Modes are PopSift, OpenCV and VLFeat.
-     * Default is currently identical to PopSift.
-     */
+    /// Modes are PopSift, OpenCV and VLFeat.
+    /// Default is currently identical to PopSift.
     SiftMode _sift_mode;
 
-    /* VLFeat code assumes that an initial input image is partially blurred.
-     * This changes the blur computation for the very first level of the first
-     * octave, turning it into a special case.
-     */
+    /// VLFeat code assumes that an initial input image is partially blurred.
+    /// This changes the blur computation for the very first level of the first
+    /// octave, turning it into a special case.
     bool  _assume_initial_blur;
     float _initial_blur;
 
-    /* OpenMVG requires a normalization named rootSift, the
-     * classical L2-inspired mode is also supported.
-     */
+    /// OpenMVG requires a normalization named rootSift, the
+    /// classical L2-inspired mode is also supported.
     NormMode _normalization_mode;
 
-    /* SIFT descriptors are normalized in a final step.
-     * The values of the descriptor can also be multiplied
-     * by a power of 2 if required.
-     * Specify the exponent.
-     */
+    /// SIFT descriptors are normalized in a final step.
+    /// The values of the descriptor can also be multiplied
+    /// by a power of 2 if required.
+    /// Specify the exponent.
     int _normalization_multiplier;
 
-    /* Call the debug functions in gauss_filter.cu to print Gauss
-     * filter width and Gauss tables in use.
-     */
+    /// Call the debug functions in gauss_filter.cu to print Gauss
+    /// filter width and Gauss tables in use.
     bool _print_gauss_tables;
 };
 
@@ -311,7 +415,7 @@ inline bool operator==( const Config& l, const Config& r )
 
 inline bool operator!=( const Config& l, const Config& r )
 {
-    return not l.equal( r );
+    return ! l.equal( r );
 }
 
 }; // namespace popsift
diff --git a/src/popsift/sift_constants.cu b/src/popsift/sift_constants.cu
index 11eda7fb..e8e4f356 100755
--- a/src/popsift/sift_constants.cu
+++ b/src/popsift/sift_constants.cu
@@ -5,18 +5,18 @@
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
  */
+#include "common/debug_macros.h"
+#include "sift_constants.h"
+
 #include <cuda_runtime.h>
 
 #include <iostream>
 
-#include "sift_constants.h"
-#include "common/debug_macros.h"
-
 using namespace std;
 
 namespace popsift {
 
-ConstInfo                         h_consts;
+thread_local            ConstInfo h_consts;
 __device__ __constant__ ConstInfo d_consts;
 
 void init_constants( float sigma0, int levels, float threshold, float edge_limit, int max_extrema, int normalization_multiplier )
diff --git a/src/popsift/sift_constants.h b/src/popsift/sift_constants.h
index 71b24ab7..883515a7 100755
--- a/src/popsift/sift_constants.h
+++ b/src/popsift/sift_constants.h
@@ -7,26 +7,35 @@
  */
 #pragma once
 
+#include <cuda_runtime.h>
+
 #ifndef INF
 #define INF               (1<<29)
 #endif
 #ifndef NINF
 #define NINF              (-INF)
 #endif
-#ifdef M_PI
-#undef M_PI
-// #define M_PI  3.14159265358979323846f
-#endif
-__device__ static const
-float M_PI = 3.14159265358979323846f;
-#ifdef M_PI2
-#undef M_PI2
-// #define M_PI2 (2.0F * M_PI)
-#endif
-__device__ static const
-float M_PI2 = 2.0f * 3.14159265358979323846f;
 
-#define M_4RPI               (4.0f / M_PI)
+#undef USE_CONSTANT_PI
+#ifdef USE_CONSTANT_PI
+  #ifdef M_PI
+    #undef M_PI
+  #endif
+  __device__ static const float M_PI = 3.14159265358979323846f;
+  #ifdef M_PI2
+    #undef M_PI2
+  #endif
+  __device__ static const float M_PI2 = 2.0f * 3.14159265358979323846f;
+  #define M_4RPI               (4.0f / M_PI)
+#else
+  #ifndef M_PI
+    #define M_PI 3.14159265358979323846f
+  #endif
+  #ifndef M_PI2
+    #define M_PI2 (2.0F * M_PI)
+  #endif
+  #define M_4RPI               (4.0f / M_PI)
+#endif
 
 #define DESC_MIN_FLOAT               1E-15F
 
@@ -66,7 +75,7 @@ struct ConstInfo
     float desc_tile[16];
 };
 
-extern                         ConstInfo h_consts;
+extern thread_local            ConstInfo h_consts;
 extern __device__ __constant__ ConstInfo d_consts;
 
 
diff --git a/src/popsift/sift_desc.cu b/src/popsift/sift_desc.cu
index a7ba09ac..f533df35 100644
--- a/src/popsift/sift_desc.cu
+++ b/src/popsift/sift_desc.cu
@@ -5,24 +5,24 @@
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
  */
-#include <iostream>
-#include <stdio.h>
-#include <iso646.h>
-
-#include "sift_pyramid.h"
-#include "sift_constants.h"
-#include "s_gradiant.h"
-#include "s_desc_normalize.h"
-#include "s_desc_loop.h"
-#include "s_desc_iloop.h"
+#include "common/assist.h"
+#include "common/debug_macros.h"
 #include "s_desc_grid.h"
 #include "s_desc_igrid.h"
+#include "s_desc_iloop.h"
+#include "s_desc_loop.h"
+#include "s_desc_normalize.h"
 #include "s_desc_notile.h"
-#include "common/assist.h"
-#include "common/debug_macros.h"
+#include "s_gradiant.h"
+#include "sift_config.h"
+#include "sift_constants.h"
+#include "sift_pyramid.h"
+
+#include <cstdio>
+#include <iostream>
 
-#ifdef USE_NVTX
-#include <nvToolsExtCuda.h>
+#if POPSIFT_IS_DEFINED(POPSIFT_USE_NVTX)
+#include <nvtx3/nvToolsExtCuda.h>
 #else
 #define nvtxRangePushA(a)
 #define nvtxRangePop()
diff --git a/src/popsift/sift_extremum.h b/src/popsift/sift_extremum.h
index 087dbff8..0363a02e 100755
--- a/src/popsift/sift_extremum.h
+++ b/src/popsift/sift_extremum.h
@@ -7,14 +7,15 @@
  */
 #pragma once
 
+#include "sift_constants.h"
+
 #include <iostream>
 #include <vector>
 
-#include "sift_constants.h"
-
 namespace popsift {
 
-/* This is an internal data structure.
+/**
+ * @brief This is an internal data structure.
  * Separated from the final Extremum data structure to implement
  * grid filtering in a space-efficient manner. In grid filtering,
  * extrema are first found, after that some may be discarded in
@@ -25,14 +26,20 @@ struct InitialExtremum
 {
     float xpos;
     float ypos;
-    int   lpos;  // extremum refined into this level
-    float sigma; // scale;
-    int   cell;  // index into the grid for grid-based extrema filtering
-    bool  ignore; // true if this extremum has been filtered
-    int   write_index; // if any initial extrema are ignored, new index for Extremum
+    /// extremum refined into this level
+    int   lpos;
+    /// scale
+    float sigma;
+    /// index into the grid for grid-based extrema filtering
+    int   cell;
+    /// true if this extremum has been filtered
+    bool  ignore;
+    /// if any initial extrema are ignored, new index for Extremum
+    int   write_index;
 };
 
-/* This is an internal data structure.
+/**
+ * @brief This is an internal data structure.
  * For performance reasons, it would be appropriate to split
  * the first 4 values from the rest of this structure. Right
  * now, descriptor computation is a bigger concern.
@@ -41,16 +48,22 @@ struct Extremum
 {
     float xpos;
     float ypos;
-    int   lpos;  // extremum refined into this level
-    float sigma; // scale;
+    /// extremum refined into this level
+    int   lpos;
+    /// scale
+    float sigma;
 
-    int   octave;  // belonging to this octave
-    int   num_ori; // number of this extremum's orientations
-    int   idx_ori; // exclusive prefix sum of the layer's orientations
+    /// belonging to this octave
+    int   octave;
+    /// number of this extremum's orientations
+    int   num_ori;
+    /// exclusive prefix sum of the layer's orientations
+    int   idx_ori;
     float orientation[ORIENTATION_MAX_COUNT];
 };
 
-/* This is a data structure that is returned to a calling program.
+/**
+ * @brief This is a data structure that is returned to a calling program.
  * This is the SIFT descriptor itself.
  */
 struct Descriptor
diff --git a/src/popsift/sift_octave.cu b/src/popsift/sift_octave.cu
index 64fbc4a5..430bc298 100755
--- a/src/popsift/sift_octave.cu
+++ b/src/popsift/sift_octave.cu
@@ -5,23 +5,24 @@
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 */
-#include <sstream>
+
+#include "common/clamp.h"
+#include "common/debug_macros.h"
+#include "common/write_plane_2d.h"
+#include "sift_constants.h"
+#include "sift_octave.h"
+#include "sift_pyramid.h"
+
 #include <sys/stat.h>
+
+#include <new> // for placement new
+#include <sstream>
 #ifdef _WIN32
 #include <direct.h>
 #define stat _stat
 #define mkdir(name, mode) _mkdir(name)
 #endif
 
-#include <new> // for placement new
-
-#include "sift_pyramid.h"
-#include "sift_constants.h"
-#include "common/debug_macros.h"
-#include "common/clamp.h"
-#include "common/write_plane_2d.h"
-#include "sift_octave.h"
-
 using namespace std;
 
 namespace popsift {
@@ -219,14 +220,14 @@ void Octave::alloc_data_tex()
 {
     cudaError_t err;
 
-    cudaResourceDesc res_desc;
+    cudaResourceDesc res_desc{};
     res_desc.resType = cudaResourceTypeArray;
     res_desc.res.array.array = _data;
 
     err = cudaCreateSurfaceObject(&_data_surf, &res_desc);
     POP_CUDA_FATAL_TEST(err, "Could not create Blur data surface: ");
 
-    cudaTextureDesc      tex_desc;
+    cudaTextureDesc      tex_desc{};
 
     memset(&tex_desc, 0, sizeof(cudaTextureDesc));
     tex_desc.normalizedCoords = 0; // addressed (x,y) in [width,height]
@@ -236,7 +237,7 @@ void Octave::alloc_data_tex()
     tex_desc.readMode         = cudaReadModeElementType; // read as float
     tex_desc.filterMode       = cudaFilterModePoint; // no interpolation
 
-    err = cudaCreateTextureObject( &_data_tex_point, &res_desc, &tex_desc, 0 );
+    err = cudaCreateTextureObject( &_data_tex_point, &res_desc, &tex_desc, nullptr );
     POP_CUDA_FATAL_TEST(err, "Could not create Blur data point texture: ");
 
     memset(&tex_desc, 0, sizeof(cudaTextureDesc));
@@ -247,7 +248,7 @@ void Octave::alloc_data_tex()
     tex_desc.readMode         = cudaReadModeElementType; // read as float
     tex_desc.filterMode       = cudaFilterModeLinear; // no interpolation
 
-    err = cudaCreateTextureObject( &_data_tex_linear.tex, &res_desc, &tex_desc, 0 );
+    err = cudaCreateTextureObject( &_data_tex_linear.tex, &res_desc, &tex_desc, nullptr );
     POP_CUDA_FATAL_TEST(err, "Could not create Blur data point texture: ");
 }
 
@@ -298,14 +299,14 @@ void Octave::alloc_interm_tex()
 {
     cudaError_t err;
 
-    cudaResourceDesc res_desc;
+    cudaResourceDesc res_desc{};
     res_desc.resType = cudaResourceTypeArray;
     res_desc.res.array.array = _intm;
 
     err = cudaCreateSurfaceObject(&_intm_surf, &res_desc);
     POP_CUDA_FATAL_TEST(err, "Could not create Blur intermediate surface: ");
 
-    cudaTextureDesc      tex_desc;
+    cudaTextureDesc      tex_desc{};
 
     memset(&tex_desc, 0, sizeof(cudaTextureDesc));
     tex_desc.normalizedCoords = 0; // addressed (x,y) in [width,height]
@@ -315,12 +316,12 @@ void Octave::alloc_interm_tex()
     tex_desc.readMode         = cudaReadModeElementType; // read as float
     tex_desc.filterMode       = cudaFilterModePoint; // no interpolation
 
-    err = cudaCreateTextureObject( &_intm_tex_point, &res_desc, &tex_desc, 0 );
+    err = cudaCreateTextureObject( &_intm_tex_point, &res_desc, &tex_desc, nullptr );
     POP_CUDA_FATAL_TEST(err, "Could not create Blur intermediate point texture: ");
 
     tex_desc.filterMode       = cudaFilterModeLinear; // no interpolation
 
-    err = cudaCreateTextureObject( &_intm_tex_linear.tex, &res_desc, &tex_desc, 0 );
+    err = cudaCreateTextureObject( &_intm_tex_linear.tex, &res_desc, &tex_desc, nullptr );
     POP_CUDA_FATAL_TEST(err, "Could not create Blur intermediate point texture: ");
 }
 
@@ -371,14 +372,14 @@ void Octave::alloc_dog_tex()
 {
         cudaError_t err;
 
-        cudaResourceDesc dog_res_desc;
+        cudaResourceDesc dog_res_desc{};
         dog_res_desc.resType = cudaResourceTypeArray;
         dog_res_desc.res.array.array = _dog_3d;
 
         err = cudaCreateSurfaceObject(&_dog_3d_surf, &dog_res_desc);
         POP_CUDA_FATAL_TEST(err, "Could not create DoG surface: ");
 
-        cudaTextureDesc      dog_tex_desc;
+        cudaTextureDesc      dog_tex_desc{};
         memset(&dog_tex_desc, 0, sizeof(cudaTextureDesc));
         dog_tex_desc.normalizedCoords = 0; // addressed (x,y) in [width,height]
         dog_tex_desc.addressMode[0] = cudaAddressModeClamp;
diff --git a/src/popsift/sift_octave.h b/src/popsift/sift_octave.h
index a0122bdf..fc2ad13b 100755
--- a/src/popsift/sift_octave.h
+++ b/src/popsift/sift_octave.h
@@ -7,13 +7,13 @@
  */
 #pragma once
 
-#include <iostream>
-#include <vector>
-
 #include "s_image.h"
 #include "sift_conf.h"
-#include "sift_extremum.h"
 #include "sift_constants.h"
+#include "sift_extremum.h"
+
+#include <iostream>
+#include <vector>
 
 namespace popsift {
 
@@ -24,44 +24,44 @@ struct LinearTexture
 
 class Octave
 {
-    int   _w;
-    int   _h;
-    int   _max_w;
-    int   _max_h;
-    float _w_grid_divider;
-    float _h_grid_divider;
-    int   _debug_octave_id;
-    int   _levels;
-    int   _gauss_group;
-
-    cudaArray_t           _data;
-    cudaChannelFormatDesc _data_desc;
-    cudaExtent            _data_ext;
-    cudaSurfaceObject_t   _data_surf;
-    cudaTextureObject_t   _data_tex_point;
-    LinearTexture         _data_tex_linear;
-
-    cudaArray_t           _intm;
-    cudaChannelFormatDesc _intm_desc;
-    cudaExtent            _intm_ext;
-    cudaSurfaceObject_t   _intm_surf;
-    cudaTextureObject_t   _intm_tex_point;
-    LinearTexture         _intm_tex_linear;
-
-    cudaArray_t           _dog_3d;
-    cudaChannelFormatDesc _dog_3d_desc;
-    cudaExtent            _dog_3d_ext;
-    cudaSurfaceObject_t   _dog_3d_surf;
-    cudaTextureObject_t   _dog_3d_tex_point;
-    LinearTexture         _dog_3d_tex_linear;
+    int   _w{};
+    int   _h{};
+    int   _max_w{};
+    int   _max_h{};
+    float _w_grid_divider{};
+    float _h_grid_divider{};
+    int   _debug_octave_id{};
+    int   _levels{};
+    int   _gauss_group{};
+
+    cudaArray_t           _data{};
+    cudaChannelFormatDesc _data_desc{};
+    cudaExtent            _data_ext{};
+    cudaSurfaceObject_t   _data_surf{};
+    cudaTextureObject_t   _data_tex_point{};
+    LinearTexture         _data_tex_linear{};
+
+    cudaArray_t           _intm{};
+    cudaChannelFormatDesc _intm_desc{};
+    cudaExtent            _intm_ext{};
+    cudaSurfaceObject_t   _intm_surf{};
+    cudaTextureObject_t   _intm_tex_point{};
+    LinearTexture         _intm_tex_linear{};
+
+    cudaArray_t           _dog_3d{};
+    cudaChannelFormatDesc _dog_3d_desc{};
+    cudaExtent            _dog_3d_ext{};
+    cudaSurfaceObject_t   _dog_3d_surf{};
+    cudaTextureObject_t   _dog_3d_tex_point{};
+    LinearTexture         _dog_3d_tex_linear{};
 
     // one CUDA stream per level
     // consider whether some of them can be removed
-    cudaStream_t _stream;
-    cudaEvent_t  _scale_done;
-    cudaEvent_t  _extrema_done;
-    cudaEvent_t  _ori_done;
-    cudaEvent_t  _desc_done;
+    cudaStream_t _stream{};
+    cudaEvent_t  _scale_done{};
+    cudaEvent_t  _extrema_done{};
+    cudaEvent_t  _ori_done{};
+    cudaEvent_t  _desc_done{};
 
 public:
     Octave( );
@@ -105,19 +105,19 @@ class Octave
     inline LinearTexture getIntermDataTexLinear( ) {
         return _intm_tex_linear;
     }
-    inline cudaTextureObject_t getIntermDataTexPoint( ) {
+    inline cudaTextureObject_t getIntermDataTexPoint( ) const {
         return _intm_tex_point;
     }
     inline LinearTexture getDataTexLinear( ) {
         return _data_tex_linear;
     }
-    inline cudaTextureObject_t getDataTexPoint( ) {
+    inline cudaTextureObject_t getDataTexPoint( ) const {
         return _data_tex_point;
     }
-    inline cudaSurfaceObject_t getDataSurface( ) {
+    inline cudaSurfaceObject_t getDataSurface( ) const {
         return _data_surf;
     }
-    inline cudaSurfaceObject_t getIntermediateSurface( ) {
+    inline cudaSurfaceObject_t getIntermediateSurface( ) const {
         return _intm_surf;
     }
         
@@ -131,10 +131,14 @@ class Octave
         return _dog_3d_tex_linear;
     }
 
-    /**
-     * alloc() - allocates all GPU memories for one octave
-     * @param width in floats, not bytes!!!
-     */
+    /**
+     * @brief Allocates all GPU memories for one octave.
+     * @param conf
+     * @param width in floats
+     * @param height
+     * @param levels
+     * @param gauss_group
+     */
     void alloc( const Config& conf,
                 int           width,
                 int           height,
diff --git a/src/popsift/sift_pyramid.cu b/src/popsift/sift_pyramid.cu
old mode 100755
new mode 100644
index 0079b109..c03b0d61
--- a/src/popsift/sift_pyramid.cu
+++ b/src/popsift/sift_pyramid.cu
@@ -5,25 +5,28 @@
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 */
-#include <iostream>
+
+#include "common/assist.h"
+#include "common/debug_macros.h"
+#include "sift_config.h"
+#include "sift_extremum.h"
+#include "sift_pyramid.h"
+
+#include <sys/stat.h>
+
+#include <cstdio>
 #include <fstream>
+#include <iostream>
 #include <sstream>
 #include <vector>
-#include <stdio.h>
-#include <sys/stat.h>
 #ifdef _WIN32
 #include <direct.h>
 #define stat _stat
 #define mkdir(path, perm) _mkdir(path)
 #endif
 
-#include "sift_pyramid.h"
-#include "sift_extremum.h"
-#include "common/debug_macros.h"
-#include "common/assist.h"
-
-#ifdef USE_NVTX
-#include <nvToolsExtCuda.h>
+#if POPSIFT_IS_DEFINED(POPSIFT_USE_NVTX)
+#include <nvtx3/nvToolsExtCuda.h>
 #else
 #define nvtxRangePushA(a)
 #define nvtxRangePop()
@@ -35,18 +38,15 @@ using namespace std;
 
 namespace popsift {
 
-__device__
-ExtremaCounters dct;
-ExtremaCounters hct;
+__device__ ExtremaCounters   dct;
+thread_local ExtremaCounters hct;
 
-__device__
-ExtremaBuffers  dbuf;
-ExtremaBuffers  dbuf_shadow; // just for managing memories
-ExtremaBuffers  hbuf;
+__device__ ExtremaBuffers   dbuf;
+thread_local ExtremaBuffers dbuf_shadow; // just for managing memories
+thread_local ExtremaBuffers hbuf;
 
-__device__
-DevBuffers      dobuf;
-DevBuffers      dobuf_shadow; // just for managing memories
+__device__ DevBuffers       dobuf;
+thread_local DevBuffers     dobuf_shadow; // just for managing memories
 
 __global__
     void py_print_corner_float(float* img, uint32_t pitch, uint32_t height, uint32_t level)
@@ -141,8 +141,8 @@ Pyramid::Pyramid( const Config& config,
         dobuf_shadow.i_ext_off[o] = dobuf_shadow.i_ext_off[0] + (o*h_consts.max_extrema);
     }
     for (int o = _num_octaves; o<MAX_OCTAVES; o++) {
-        dobuf_shadow.i_ext_dat[o] = 0;
-        dobuf_shadow.i_ext_off[o] = 0;
+        dobuf_shadow.i_ext_dat[o] = nullptr;
+        dobuf_shadow.i_ext_off[o] = nullptr;
     }
 
     sz = h_consts.max_extrema;
@@ -212,6 +212,7 @@ Pyramid::~Pyramid()
 {
     cudaStreamDestroy( _download_stream );
 
+    cudaFree(     _d_extrema_num_blocks );
     cudaFree(     dobuf_shadow.i_ext_dat[0] );
     cudaFree(     dobuf_shadow.i_ext_off[0] );
     cudaFree(     dobuf_shadow.features );
@@ -273,7 +274,7 @@ void prep_features( Descriptor* descriptor_base, int up_fac )
         fet.orientation[ori] = ext.orientation[ori];
     }
     for( ; ori<ORIENTATION_MAX_COUNT; ori++ ) {
-        fet.desc[ori]        = 0;
+        fet.desc[ori]        = nullptr;
         fet.orientation[ori] = 0;
     }
 }
@@ -287,7 +288,7 @@ FeaturesHost* Pyramid::get_descriptors( const Config& conf )
     nvtxRangePushA( "download descriptors" );
     FeaturesHost* features = new FeaturesHost( hct.ext_total, hct.ori_total );
 
-    if( hct.ext_total == 0 )
+    if( hct.ext_total == 0 || hct.ori_total == 0 )
     {
         nvtxRangePop();
         return features;
@@ -432,8 +433,9 @@ void Pyramid::writeDescriptor( const Config& conf, ostream& ostr, FeaturesHost*
                      << 1.0f / (sigma * sigma) << " ";
 
             if (really) {
-                for (int i = 0; i<128; i++) {
-                    ostr << desc.features[i] << " ";
+                for (float feature : desc.features)
+                {
+                    ostr << feature << " ";
                 }
             }
             ostr << endl;
diff --git a/src/popsift/sift_pyramid.h b/src/popsift/sift_pyramid.h
index 791fcc91..837fc3b1 100755
--- a/src/popsift/sift_pyramid.h
+++ b/src/popsift/sift_pyramid.h
@@ -7,16 +7,15 @@
  */
 #pragma once
 
-#include <iostream>
-#include <vector>
-
-#include "sift_conf.h"
-#include "sift_constants.h"
 #include "features.h"
-
 #include "s_image.h"
+#include "sift_conf.h"
+#include "sift_constants.h"
 #include "sift_octave.h"
 
+#include <iostream>
+#include <vector>
+
 namespace popsift {
 
 struct ExtremaCounters
@@ -51,13 +50,13 @@ struct DevBuffers
     Feature*         features;
 };
 
-extern            ExtremaCounters hct;
-extern __device__ ExtremaCounters dct;
-extern            ExtremaBuffers  hbuf;
-extern __device__ ExtremaBuffers  dbuf;
-extern            ExtremaBuffers  dbuf_shadow; // just for managing memories
-extern __device__ DevBuffers      dobuf;
-extern            DevBuffers      dobuf_shadow; // just for managing memories
+extern thread_local ExtremaCounters hct;
+extern __device__   ExtremaCounters dct;
+extern thread_local ExtremaBuffers  hbuf;
+extern __device__   ExtremaBuffers  dbuf;
+extern thread_local ExtremaBuffers  dbuf_shadow; // just for managing memories
+extern __device__   DevBuffers      dobuf;
+extern thread_local DevBuffers      dobuf_shadow; // just for managing memories
 
 class Pyramid
 {
@@ -152,9 +151,6 @@ class Pyramid
 
     void descriptors( const Config& conf );
 
-    void debug_out_floats  ( float* data, uint32_t pitch, uint32_t height );
-    void debug_out_floats_t( float* data, uint32_t pitch, uint32_t height );
-
     void readDescCountersFromDevice( );
     void readDescCountersFromDevice( cudaStream_t s );
     void writeDescCountersToDevice( );
@@ -164,11 +160,6 @@ class Pyramid
 
     void clone_device_descriptors_sub( const Config& conf, FeaturesDev* features );
 
-private:
-    // debug
-    void print_tables_host( );
-
-public:
 };
 
 } // namespace popsift
diff --git a/testScripts/CMakeLists.txt b/testScripts/CMakeLists.txt
new file mode 100755
index 00000000..411a8e30
--- /dev/null
+++ b/testScripts/CMakeLists.txt
@@ -0,0 +1,28 @@
+configure_file( ${CMAKE_CURRENT_SOURCE_DIR}/downloadOxfordDataset.sh.in
+                ${CMAKE_CURRENT_BINARY_DIR}/downloadOxfordDataset.sh )
+
+configure_file( ${CMAKE_CURRENT_SOURCE_DIR}/testOxfordDataset.sh.in
+                ${CMAKE_CURRENT_BINARY_DIR}/testOxfordDataset.sh )
+
+configure_file( ${CMAKE_CURRENT_SOURCE_DIR}/TEST.sh.in
+                ${CMAKE_CURRENT_BINARY_DIR}/TEST.sh )
+
+add_custom_target(
+	prepare-test
+ 	COMMAND /bin/bash ${CMAKE_CURRENT_BINARY_DIR}/downloadOxfordDataset.sh
+ 	DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/downloadOxfordDataset.sh
+)
+
+add_custom_target(
+	run-test
+ 	COMMAND /bin/bash ${CMAKE_CURRENT_BINARY_DIR}/testOxfordDataset.sh
+ 	DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/testOxfordDataset.sh
+	DEPENDS popsift-demo
+)
+
+add_custom_target(
+	run-test-boat
+ 	COMMAND /bin/bash ${CMAKE_CURRENT_BINARY_DIR}/testOxfordDataset.sh boat
+ 	DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/testOxfordDataset.sh
+	DEPENDS popsift-demo
+)
diff --git a/testScripts/TEST.sh b/testScripts/TEST.sh.in
similarity index 79%
rename from testScripts/TEST.sh
rename to testScripts/TEST.sh.in
index 817d62f8..943f45eb 100755
--- a/testScripts/TEST.sh
+++ b/testScripts/TEST.sh.in
@@ -1,8 +1,8 @@
 #!/bin/bash
 
-# IMAGE=../../popsift-samples/sample/big_set/boat/img2.ppm
-IMAGE=../../popsift-samples/sample/big_set/boat/img3.ppm
-# IMAGE=./test-17x17.pgm
+IMAGE=@PopSift_TESTFILE_PATH@/boat/img3.ppm
+
+POPSIFT_DEMO_BIN=@CMAKE_RUNTIME_OUTPUT_DIRECTORY@/popsift-demo
 
 LOG=--log
 # LOG=
@@ -16,13 +16,13 @@ FILTER="--filter-max-extrema=2000 --filter-grid=2 --filter-sort=down"
 
 PARAMS="$LOG $GAUSS_MODE $SCALING $FILTER --popsift-mode --octaves=8 --threshold=0.04 --edge-threshold=10.0 --initial-blur=0.5"
 
-for mode in loop ; do
-# for mode in loop grid igrid notile ; do
+# for mode in loop ; do
+for mode in loop grid igrid notile ; do
 # for mode in igrid notile ; do
   echo "MODE: $mode"
-  echo "./popsift-demo $PARAMS --desc-mode=$mode --write-as-uchar --norm-multi=9 -i $IMAGE"
-  ./popsift-demo $PARAMS --desc-mode=$mode --write-as-uchar --norm-multi=9 -i $IMAGE
-  # ./popsift-demo $LOG --popsift-mode --desc-mode=$mode --octaves=8 --threshold=0.04 --edge-threshold=10.0 --initial-blur=0.5 --write-as-uchar --norm-multi=9 -i $IMAGE
+  echo "$POPSIFT_DEMO_BIN $PARAMS --desc-mode=$mode --write-as-uchar --norm-multi=9 -i $IMAGE"
+  $POPSIFT_DEMO_BIN $PARAMS --desc-mode=$mode --write-as-uchar --norm-multi=9 -i $IMAGE
+  # $POPSIFT_DEMO_BIN $LOG --popsift-mode --desc-mode=$mode --octaves=8 --threshold=0.04 --edge-threshold=10.0 --initial-blur=0.5 --write-as-uchar --norm-multi=9 -i $IMAGE
   sort -n output-features.txt > UML
   echo 128  >  output-features-$mode.txt
   wc -l UML >> output-features-$mode.txt
@@ -71,4 +71,3 @@ echo -n "grid vs notile:  "
 echo -n "igrid vs notile: "
 ~/GIT/github/popsift-samples/playground/build/compare-descfiles \
 	-q output-features-igrid.txt output-features-notile.txt
-
diff --git a/testScripts/downloadOxfordDataset.sh.in b/testScripts/downloadOxfordDataset.sh.in
new file mode 100644
index 00000000..66c1630e
--- /dev/null
+++ b/testScripts/downloadOxfordDataset.sh.in
@@ -0,0 +1,45 @@
+#!/bin/bash
+
+cd @CMAKE_CURRENT_SOURCE_DIR@
+if [ ! -f reference.tgz ]
+then
+  echo -n "Fetching reference values. "
+  wget http://heim.ifi.uio.no/griff/LADIO/files/reference.tgz
+  echo "Done."
+fi
+
+echo "Making directory @CMAKE_SOURCE_DIR@/oxford"
+mkdir -p @CMAKE_SOURCE_DIR@/oxford
+echo "Changing to directory @CMAKE_SOURCE_DIR@/oxford"
+cd @CMAKE_SOURCE_DIR@/oxford
+
+for dataset in boat bikes trees graf wall bark leuven ubc
+do
+  if [ ! -d $dataset ]
+  then
+    echo "Directory $dataset does not exist - creating"
+    mkdir $dataset
+  else
+    echo -n "Directory $dataset exists. "
+  fi
+  cd $dataset
+  if [ ! -f img1.pgm ] &&  [ ! -f img1.ppm ]
+  then
+    echo "Image img1 in $dataset does not exist"
+    if [ ! -r $dataset.tar.gz ]
+    then
+      echo "tarfile does not exist, downloading"
+      wget http://www.robots.ox.ac.uk/~vgg/research/affine/det_eval_files/$dataset.tar.gz
+    else
+      echo -n "Tarfile exists. "
+    fi
+    echo "Unpacking tarfile."
+    tar zxf $dataset.tar.gz
+    rm -f $dataset.tar.gz
+  else
+    echo "File img1.pgm exists."
+    rm -f $dataset.tar.gz
+  fi
+  cd ..
+done
+
diff --git a/testScripts/testOxfordDataset.sh.in b/testScripts/testOxfordDataset.sh.in
new file mode 100644
index 00000000..eec048db
--- /dev/null
+++ b/testScripts/testOxfordDataset.sh.in
@@ -0,0 +1,176 @@
+#!/bin/bash
+#
+# Run popsift-demo on the Oxford datasets and compare what it produces with
+# the reference output unpacked from reference.tgz.
+#
+# Usage: pass dataset names as arguments to restrict the run; without
+# arguments all eight datasets are processed.
+#
+# Shell variables are deliberately written as $name, never with braces, so
+# that CMake cannot mistake them for its own variable references when it
+# configures this template.
+
+cd "@CMAKE_BINARY_DIR@" || exit 1
+
+if [ ! -d reference ]
+then
+  echo "Extracting reference images."
+  # Without the reference data every later comparison is meaningless, so a
+  # failed extraction ends the run.
+  if ! tar zxf "@CMAKE_CURRENT_SOURCE_DIR@/reference.tgz"
+  then
+    echo "Extracting @CMAKE_CURRENT_SOURCE_DIR@/reference.tgz failed."
+    exit 1
+  fi
+  echo "Done."
+fi
+
+echo "Making directory @CMAKE_BINARY_DIR@/oxford"
+mkdir -p "@CMAKE_BINARY_DIR@/oxford"
+echo "Changing to directory @CMAKE_BINARY_DIR@/oxford"
+# The loops below create and delete files relative to this directory.
+cd "@CMAKE_BINARY_DIR@/oxford" || exit 1
+
+echo "The command line is $*"
+
+if [ $# -gt 0 ]
+then
+  dataset_list=$*
+else
+  dataset_list="boat bikes trees graf wall bark leuven ubc"
+fi
+
+# Run popsift-demo on every image of every requested dataset and collect the
+# results in <dataset>/output-<img>. Images whose output directory already
+# exists are skipped.
+for dataset in $dataset_list; do
+  echo "Looking for dataset $dataset"
+  mkdir -p "$dataset"
+  # Do not carry on in the wrong directory: the clean-up at the end of the
+  # inner loop uses rm -rf with relative paths.
+  cd "$dataset" || exit 1
+  for img in img1 img2 img3 img4 img5 img6; do
+    if [ -f "@PopSift_TESTFILE_PATH@/$dataset/$img.pgm" ]; then
+      imgfile="@PopSift_TESTFILE_PATH@/$dataset/$img.pgm"
+    elif [ -f "@PopSift_TESTFILE_PATH@/$dataset/$img.ppm" ]; then
+      imgfile="@PopSift_TESTFILE_PATH@/$dataset/$img.ppm"
+    else
+      continue
+    fi
+    echo "Looking for image file $imgfile"
+    if [ -d "output-$img" ]; then
+      echo "Directory output-$img exists. Skipping."
+      continue
+    fi
+    if ! "@CMAKE_RUNTIME_OUTPUT_DIRECTORY@/popsift-demo" --log --gauss-mode vlfeat --desc-mode loop --popsift-mode --root-sift --downsampling -1 -i "$imgfile"; then
+      echo "Running popsift on $imgfile failed."
+      echo "Stopping."
+      # A bare "exit" would return the status of the echo above, i.e. success.
+      exit 1
+    fi
+    rm -rf "output-$img"
+    mkdir -p "output-$img/dir-octave"
+    mkdir -p "output-$img/dir-dog"
+
+    echo -n "Moving ... "
+    echo -n "Image pyramid. "
+    mv dir-octave/* "output-$img/dir-octave/"
+    echo    "DoG pyramid. "
+    mv dir-dog/* "output-$img/dir-dog/"
+
+    echo -n "Sorting ... "
+    echo -n "Keypoints with descriptors. "
+    sort -n output-features.txt > "output-$img/features.txt"
+    echo -n "Keypoints. "
+    sort -n dir-fpt/desc-pyramid.txt > "output-$img/keypoints.txt"
+    echo    "Descriptors. "
+    sort -n dir-desc/desc-pyramid.txt > "output-$img/descriptors.txt"
+    # Clean up the per-image working files and directories.
+    rm -rf dir-desc dir-dog dir-dog-dump dir-dog-txt dir-fpt dir-octave dir-octave-dump output-features.txt
+  done
+  cd ..
+done
+
+# Compare the files in one sub-directory of the current image's output with
+# their reference counterparts. Prints one line per file that differs, or
+# "OK. " when none does. cmp is run without -s, so it also prints its own
+# message for each mismatch. When the sub-directory cannot be entered this
+# is reported and nothing is compared.
+#   $1  label used in the messages ("pyramid" or "DoG")
+#   $2  sub-directory name ("dir-octave" or "dir-dog")
+# Reads $dataset and $img, which are set by the loops that call it.
+compare_image_dir()
+{
+  # Without this guard the loop below would walk the wrong directory.
+  if ! cd "@CMAKE_BINARY_DIR@/oxford/$dataset/output-$img/$2"; then
+    echo "$1 compare: cannot enter $dataset/output-$img/$2"
+    return
+  fi
+
+  # Becomes 1 as soon as one file differs from its reference.
+  bad_image_matches=0
+  for i in *; do
+    if ! cmp "@CMAKE_BINARY_DIR@/reference/$dataset/output-$img/$2/$i" "$i"; then
+      # Finish the progress line before the first report.
+      if [ $bad_image_matches == 0 ]; then echo ""; fi
+      echo "$1 compare: @CMAKE_BINARY_DIR@/reference/$dataset/output-$img/$2/$i and $i differ"
+      bad_image_matches=1
+    fi
+  done
+  if [ $bad_image_matches == 0 ]; then echo -n "OK. "; fi
+
+  # Return to the oxford directory.
+  cd "@CMAKE_BINARY_DIR@/oxford" || exit 1
+}
+
+# Compare the image and DoG pyramids of every output directory with the
+# reference data.
+for dataset in $dataset_list; do
+  for img in img1 img2 img3 img4 img5 img6; do
+    # Images without an output directory were not processed; skip them.
+    if [ -d "@CMAKE_BINARY_DIR@/oxford/$dataset/output-$img" ]; then
+      echo -n "Compare $dataset $img with reference. "
+
+      echo -n "Image pyramid "
+      compare_image_dir pyramid dir-octave
+
+      echo -n "DoG pyramid "
+      compare_image_dir DoG dir-dog
+
+      echo ""
+    fi
+  done
+done
+
+# Compare the sorted feature, keypoint and descriptor files of every output
+# directory with the reference data.
+for dataset in $dataset_list; do
+  for img in img1 img2 img3 img4 img5 img6; do
+    dir_prefix="@CMAKE_BINARY_DIR@/oxford/$dataset/output-$img"
+    ref_prefix="@CMAKE_BINARY_DIR@/reference/$dataset/output-$img"
+    # Images without an output directory were not processed; skip them.
+    if [ -d "$dir_prefix" ]; then
+      echo -n "Compare $dataset $img with reference. "
+
+      # cmp exits with 1 when the files differ and with a larger value when it
+      # cannot read one of them; either way the result is reported as BAD.
+      echo -n "Features "
+      if ! cmp -s "$ref_prefix/features.txt" "$dir_prefix/features.txt"; then
+        echo "BAD. "
+        echo cmp "$ref_prefix/features.txt $dir_prefix/features.txt"
+        # Number of lines of diff output, as a rough measure of the mismatch.
+        diff "$ref_prefix/features.txt" "$dir_prefix/features.txt" | wc -l
+      else
+        echo -n "OK. "
+      fi
+
+      echo -n "Keypoints "
+      if ! cmp -s "$ref_prefix/keypoints.txt" "$dir_prefix/keypoints.txt"; then echo -n "BAD. "; else echo -n "OK. "; fi
+
+      echo -n "Descriptors "
+      if ! cmp -s "$ref_prefix/descriptors.txt" "$dir_prefix/descriptors.txt"; then echo -n "BAD. "; else echo -n "OK. "; fi
+
+      echo ""
+    fi
+  done
+done