Implement cg_streaming via USM #69

Open · wants to merge 23 commits into base: develop
Commits (23)
235cdca · Add cg_streaming enum class case. (breyerml, Oct 2, 2024)
0006b9a · Add device_ptr flag to enable shared/managed memory allocations. (breyerml, Oct 2, 2024)
3508e15 · Allocate kernel matrix using shared memory for cg_streaming. (breyerml, Oct 2, 2024)
bf19526 · Use USM allocations in BLAS kernel and slightly change API. (breyerml, Oct 7, 2024)
e403c62 · Remove USM related if in copy functions. (breyerml, Oct 7, 2024)
7663860 · Use variable to specify whether USM allocations should be used. (breyerml, Oct 7, 2024)
cd6deea · Add solver_type::automatic handling for cg_streaming. (breyerml, Oct 7, 2024)
2dc7881 · Only use USM for the kernel matrix. (breyerml, Oct 7, 2024)
55ad721 · Improve automatic solver_type handling. (breyerml, Oct 7, 2024)
dad3561 · Implement cg_streaming via USM allocations in SYCL. (breyerml, Oct 7, 2024)
f29c792 · Implement cg_streaming via USM allocations in HIP. (breyerml, Oct 7, 2024)
c53ea42 · For OpenMP and stdpar, cg_streaming is equal to cg_explicit. (breyerml, Oct 7, 2024)
f41aa35 · Implement cg_streaming via USM allocations in OpenCL (using some ugly… (breyerml, Oct 7, 2024)
b5894e0 · Only call get_variant() where necessary. (breyerml, Oct 7, 2024)
ed9b633 · Add and improve error check. (breyerml, Oct 7, 2024)
d850275 · Use cg_explicit as maximum allocation size constraint. (breyerml, Oct 8, 2024)
ed2e2a8 · Improve output by mentioning the maximum guaranteed allocation size. (breyerml, Oct 8, 2024)
a34b620 · Throw an exception if clSVMAlloc failed. (breyerml, Oct 8, 2024)
9fcdd7f · Rewrite OpenCL context logic to also support cg_streaming with multip… (breyerml, Oct 8, 2024)
1dd509c · Use the correct OpenCL functions to perform SVM pointer operations an… (breyerml, Oct 9, 2024)
570ba77 · Fix usage of undefined type alias in assertion message. (breyerml, Oct 9, 2024)
38c27fe · Update tests to support USM device_ptr and the cg_streaming solver. (breyerml, Oct 9, 2024)
91b75b3 · Add missing data set size contribution. (breyerml, Oct 14, 2024)
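
Taken together, these commits add a cg_streaming solver that places the explicitly assembled kernel matrix in USM (unified shared memory) so it can migrate between host and device on demand instead of being limited by device memory. A minimal sketch of the enum extension from commit 235cdca; the exact enumerator order in plssvm/solver_types.hpp is an assumption, only cg_streaming is new:

// Hedged sketch of the extended solver enumeration (order assumed).
enum class solver_type {
    automatic,     // pick a solver based on the available device memory
    cg_explicit,   // kernel matrix fully materialized in device memory
    cg_streaming,  // kernel matrix in USM memory, migrated on demand (new)
    cg_implicit    // kernel matrix values recomputed on the fly
};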
3 changes: 2 additions & 1 deletion include/plssvm/backends/CUDA/csvm.hpp
@@ -22,6 +22,7 @@
 #include "plssvm/detail/memory_size.hpp" // plssvm::detail::memory_size
 #include "plssvm/detail/type_traits.hpp" // PLSSVM_REQUIRES
 #include "plssvm/parameter.hpp" // plssvm::parameter
+#include "plssvm/solver_types.hpp" // plssvm::solver_type
 #include "plssvm/target_platforms.hpp" // plssvm::target_platform
 
 #include <cstddef> // std::size_t
@@ -152,7 +153,7 @@ class csvm : public ::plssvm::detail::gpu_csvm<detail::device_ptr, int, detail::
 /**
  * @copydoc plssvm::detail::gpu_csvm::run_assemble_kernel_matrix_explicit
  */
-[[nodiscard]] device_ptr_type run_assemble_kernel_matrix_explicit(std::size_t device_id, const ::plssvm::detail::execution_range &exec, const parameter &params, const device_ptr_type &data_d, const device_ptr_type &q_red_d, real_type QA_cost) const final;
+[[nodiscard]] device_ptr_type run_assemble_kernel_matrix_explicit(std::size_t device_id, const ::plssvm::detail::execution_range &exec, const parameter &params, bool use_usm_allocations, const device_ptr_type &data_d, const device_ptr_type &q_red_d, real_type QA_cost) const final;
 /**
  * @copydoc plssvm::detail::gpu_csvm::run_blas_level_3_kernel_explicit
  */
10 changes: 7 additions & 3 deletions include/plssvm/backends/CUDA/detail/device_ptr.cuh
@@ -32,6 +32,7 @@ class device_ptr : public ::plssvm::detail::gpu_device_ptr<T, int, T *, device_p
 using base_type::data_;
 using base_type::queue_;
 using base_type::shape_;
+using base_type::use_usm_allocations_;
 
 public:
 // Be able to use overloaded base class functions.
@@ -58,24 +59,27 @@
  * @brief Allocates `size * sizeof(T)` bytes on the device with ID @p device.
  * @param[in] size the number of elements represented by the device_ptr
  * @param[in] device the associated CUDA device
+ * @param[in] use_usm_allocations if `true` use USM allocations
  * @throws plssvm::cuda::backend_exception if the given device ID is smaller than 0 or greater or equal than the available number of devices
  */
-explicit device_ptr(size_type size, queue_type device);
+device_ptr(size_type size, queue_type device, bool use_usm_allocations = false);
 /**
  * @brief Allocates `shape.x * shape.y * sizeof(T)` bytes on the device with ID @p device.
  * @param[in] shape the number of elements represented by the device_ptr
  * @param[in] device the associated CUDA device
+ * @param[in] use_usm_allocations if `true` use USM allocations
  * @throws plssvm::cuda::backend_exception if the given device ID is smaller than 0 or greater or equal than the available number of devices
  */
-explicit device_ptr(plssvm::shape shape, queue_type device);
+device_ptr(plssvm::shape shape, queue_type device, bool use_usm_allocations = false);
 /**
  * @brief Allocates `(shape.x + padding.x) * (shape.y + padding.y) * sizeof(T)` bytes on the device with ID @p device.
  * @param[in] shape the number of elements represented by the device_ptr
  * @param[in] padding the number of padding elements added to the extent values
  * @param[in] device the associated CUDA device
+ * @param[in] use_usm_allocations if `true` use USM allocations
  * @throws plssvm::cuda::backend_exception if the given device ID is smaller than 0 or greater or equal than the available number of devices
  */
-device_ptr(plssvm::shape shape, plssvm::shape padding, queue_type device);
+device_ptr(plssvm::shape shape, plssvm::shape padding, queue_type device, bool use_usm_allocations = false);
 
 /**
  * @copydoc plssvm::detail::gpu_device_ptr::gpu_device_ptr(const plssvm::detail::gpu_device_ptr &)
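
The flag's effect in the CUDA backend is easiest to see as an allocation switch. A minimal sketch, assuming the matching device_ptr.cu implementation branches between managed and device memory (the helper name is hypothetical):

#include <cuda_runtime.h>

#include <cstddef>

// Hypothetical helper: managed (USM) memory can migrate between host and
// device on demand, plain device memory cannot.
template <typename T>
T *allocate_device_memory(const std::size_t count, const bool use_usm_allocations) {
    T *ptr = nullptr;
    if (use_usm_allocations) {
        cudaMallocManaged(reinterpret_cast<void **>(&ptr), count * sizeof(T));  // unified shared memory
    } else {
        cudaMalloc(reinterpret_cast<void **>(&ptr), count * sizeof(T));  // device-only memory
    }
    return ptr;
}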
2 changes: 1 addition & 1 deletion include/plssvm/backends/HIP/csvm.hpp
@@ -156,7 +156,7 @@ class csvm : public ::plssvm::detail::gpu_csvm<detail::device_ptr, int, detail::
 /**
  * @copydoc plssvm::detail::gpu_csvm::run_assemble_kernel_matrix_explicit
  */
-[[nodiscard]] device_ptr_type run_assemble_kernel_matrix_explicit(std::size_t device_id, const ::plssvm::detail::execution_range &exec, const parameter &params, const device_ptr_type &data_d, const device_ptr_type &q_red_d, real_type QA_cost) const final;
+[[nodiscard]] device_ptr_type run_assemble_kernel_matrix_explicit(std::size_t device_id, const ::plssvm::detail::execution_range &exec, const parameter &params, bool use_usm_allocations, const device_ptr_type &data_d, const device_ptr_type &q_red_d, real_type QA_cost) const final;
 /**
  * @copydoc plssvm::detail::gpu_csvm::run_blas_level_3_kernel_explicit
  */
12 changes: 8 additions & 4 deletions include/plssvm/backends/HIP/detail/device_ptr.hip.hpp
@@ -32,6 +32,7 @@ class device_ptr : public ::plssvm::detail::gpu_device_ptr<T, int, T *, device_p
 using base_type::data_;
 using base_type::queue_;
 using base_type::shape_;
+using base_type::use_usm_allocations_;
 
 public:
 // Be able to use overloaded base class functions.
@@ -58,24 +59,27 @@
  * @brief Allocates `size * sizeof(T)` bytes on the device with ID @p device.
  * @param[in] size the number of elements represented by the device_ptr
  * @param[in] device the associated HIP device
+ * @param[in] use_usm_allocations if `true` use USM allocations
  * @throws plssvm::hip::backend_exception if the given device ID is smaller than 0 or greater or equal than the available number of devices
  */
-explicit device_ptr(size_type size, queue_type device);
+explicit device_ptr(size_type size, queue_type device, bool use_usm_allocations = false);
 /**
  * @brief Allocates `shape.x * shape.y * sizeof(T)` bytes on the device with ID @p device.
  * @param[in] shape the number of elements represented by the device_ptr
  * @param[in] device the associated HIP device
+ * @param[in] use_usm_allocations if `true` use USM allocations
  * @throws plssvm::hip::backend_exception if the given device ID is smaller than 0 or greater or equal than the available number of devices
  */
-explicit device_ptr(plssvm::shape shape, queue_type device);
+explicit device_ptr(plssvm::shape shape, queue_type device, bool use_usm_allocations = false);
 /**
  * @brief Allocates `(shape.x + padding.x) * (shape.y + padding.y) * sizeof(T)` bytes on the device with ID @p device.
  * @param[in] shape the number of elements represented by the device_ptr
  * @param[in] padding the number of padding elements added to the extent values
- * @param[in] device the associated CUDA device
+ * @param[in] device the associated HIP device
+ * @param[in] use_usm_allocations if `true` use USM allocations
  * @throws plssvm::cuda::backend_exception if the given device ID is smaller than 0 or greater or equal than the available number of devices
  */
-device_ptr(plssvm::shape shape, plssvm::shape padding, queue_type device);
+device_ptr(plssvm::shape shape, plssvm::shape padding, queue_type device, bool use_usm_allocations = false);
 
 /**
  * @copydoc plssvm::detail::gpu_device_ptr::gpu_device_ptr(const plssvm::detail::gpu_device_ptr &)
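
The HIP backend plausibly mirrors the CUDA switch one-to-one, with hipMallocManaged taking the place of cudaMallocManaged; a sketch under that assumption:

// Hypothetical HIP analogue of the CUDA allocation branch sketched above.
T *ptr = nullptr;
if (use_usm_allocations) {
    hipMallocManaged(reinterpret_cast<void **>(&ptr), count * sizeof(T));  // unified shared memory
} else {
    hipMalloc(reinterpret_cast<void **>(&ptr), count * sizeof(T));  // device-only memory
}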
2 changes: 1 addition & 1 deletion include/plssvm/backends/OpenCL/csvm.hpp
@@ -155,7 +155,7 @@ class csvm : public ::plssvm::detail::gpu_csvm<detail::device_ptr, detail::comma
 /**
  * @copydoc plssvm::detail::gpu_csvm::run_assemble_kernel_matrix_explicit
  */
-[[nodiscard]] device_ptr_type run_assemble_kernel_matrix_explicit(std::size_t device_id, const ::plssvm::detail::execution_range &exec, const parameter &params, const device_ptr_type &data_d, const device_ptr_type &q_red_d, real_type QA_cost) const final;
+[[nodiscard]] device_ptr_type run_assemble_kernel_matrix_explicit(std::size_t device_id, const ::plssvm::detail::execution_range &exec, const parameter &params, bool use_usm_allocations, const device_ptr_type &data_d, const device_ptr_type &q_red_d, real_type QA_cost) const final;
 /**
  * @copydoc plssvm::detail::gpu_csvm::run_blas_level_3_kernel_explicit
  */
11 changes: 5 additions & 6 deletions include/plssvm/backends/OpenCL/detail/context.hpp
@@ -15,13 +15,12 @@
 
 #include "CL/cl.h" // cl_context, cl_platform_id, cl_device_id
 
-#include <vector> // std::vector
-
 namespace plssvm::opencl::detail {
 
 /**
  * @brief RAII wrapper class around a cl_context.
- * @details Also contains the associated platform and a list of all associated devices.
+ * @details Also contains the associated platform and device.
+ * @note Each context is guaranteed to only contain a single device, i.e., on multi-device system, one context for each device is created.
  */
 class context {
 public:
@@ -35,7 +34,7 @@ class context {
  * @param[in] platform the OpenCL platform associated with this OpenCL context
  * @param[in] devices the list of devices associated with this OpenCL cl_context
  */
-context(cl_context device_context, cl_platform_id platform, std::vector<cl_device_id> devices);
+context(cl_context device_context, cl_platform_id platform, cl_device_id device);
 
 /**
  * @brief Delete copy-constructor to make context a move only type.
@@ -78,8 +77,8 @@ class context {
 cl_context device_context{};
 /// The OpenCL platform associated with this context.
 cl_platform_id platform{};
-/// All devices associated with this context.
-std::vector<cl_device_id> devices{};
+/// The device associated with this context.
+cl_device_id device{};
 };
 
 } // namespace plssvm::opencl::detail
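
The rewrite from commit 9fcdd7f replaces one context spanning all devices of a platform with one context per device, which is what makes per-device SVM allocations manageable on multi-device systems. A sketch of the per-device creation this implies; devices_on_platform and platform are placeholder names, and error handling is elided:

#include <CL/cl.h>

#include <vector>

// Hypothetical sketch: create one cl_context per cl_device_id, matching the
// single-device guarantee documented in the class above.
std::vector<context> contexts;
for (const cl_device_id device : devices_on_platform) {
    cl_int err = CL_SUCCESS;
    cl_context ctx = clCreateContext(nullptr, 1, &device, nullptr, nullptr, &err);
    contexts.emplace_back(ctx, platform, device);
}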
15 changes: 10 additions & 5 deletions include/plssvm/backends/OpenCL/detail/device_ptr.hpp
@@ -20,6 +20,7 @@
 #include "CL/cl.h" // cl_mem
 
 #include <cstddef> // std::size_t
+#include <variant> // std::variant
 
 namespace plssvm::opencl::detail {
 
@@ -28,13 +29,14 @@
  * @tparam T the type of the kernel pointer to wrap
  */
 template <typename T>
-class device_ptr : public ::plssvm::detail::gpu_device_ptr<T, const command_queue *, cl_mem, device_ptr<T>> {
+class device_ptr : public ::plssvm::detail::gpu_device_ptr<T, const command_queue *, std::variant<cl_mem, T*>, device_ptr<T>> {
 /// The template base type of the OpenCL device_ptr class.
-using base_type = ::plssvm::detail::gpu_device_ptr<T, const command_queue *, cl_mem, device_ptr<T>>;
+using base_type = ::plssvm::detail::gpu_device_ptr<T, const command_queue *, std::variant<cl_mem, T*>, device_ptr<T>>;
 
 using base_type::data_;
 using base_type::queue_;
 using base_type::shape_;
+using base_type::use_usm_allocations_;
 
 public:
 // Be able to use overloaded base class functions.
@@ -60,21 +62,24 @@ class device_ptr : public ::plssvm::detail::gpu_device_ptr<T, const command_queu
  * @brief Allocates `size * sizeof(T)` bytes on the device associated with @p queue.
  * @param[in] size the number of elements represented by the device_ptr
  * @param[in] queue the associated command queue
+ * @param[in] use_usm_allocations if `true` use USM allocations
  */
-device_ptr(size_type size, const command_queue &queue);
+device_ptr(size_type size, const command_queue &queue, bool use_usm_allocations = false);
 /**
  * @brief Allocates `shape.x * shape.y * sizeof(T)` bytes on the device associated with @p queue.
  * @param[in] shape the number of elements represented by the device_ptr
  * @param[in] queue the associated command queue
+ * @param[in] use_usm_allocations if `true` use USM allocations
  */
-device_ptr(plssvm::shape shape, const command_queue &queue);
+device_ptr(plssvm::shape shape, const command_queue &queue, bool use_usm_allocations = false);
 /**
  * @brief Allocates `(shape.x + padding.x) * (shape.y + padding.y) * sizeof(T)` bytes on the device associated with @p queue.
  * @param[in] shape the number of elements represented by the device_ptr
  * @param[in] padding the number of padding elements added to the extent values
  * @param[in] queue the associated command queue
+ * @param[in] use_usm_allocations if `true` use USM allocations
  */
-device_ptr(plssvm::shape shape, plssvm::shape padding, const command_queue &queue);
+device_ptr(plssvm::shape shape, plssvm::shape padding, const command_queue &queue, bool use_usm_allocations = false);
 
 /**
  * @copydoc plssvm::detail::gpu_device_ptr::gpu_device_ptr(const plssvm::detail::gpu_device_ptr &)
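
Since a cl_mem buffer handle and a raw SVM pointer are different kinds of objects, the data member becomes a std::variant. Allocation then plausibly branches between clCreateBuffer and clSVMAlloc; ctx, bytes, and use_usm_allocations are placeholders:

#include <CL/cl.h>

#include <variant>

// Hypothetical allocation branch matching the variant type above: a cl_mem
// buffer object normally, a raw SVM pointer for cg_streaming.
std::variant<cl_mem, float *> data;
if (use_usm_allocations) {
    data = static_cast<float *>(clSVMAlloc(ctx, CL_MEM_READ_WRITE, bytes, 0));
} else {
    cl_int err = CL_SUCCESS;
    data = clCreateBuffer(ctx, CL_MEM_READ_WRITE, bytes, nullptr, &err);
}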
17 changes: 15 additions & 2 deletions include/plssvm/backends/OpenCL/detail/utility.hpp
@@ -20,17 +20,20 @@
 #include "plssvm/backends/OpenCL/detail/kernel.hpp" // plssvm::opencl::detail::compute_kernel_name
 #include "plssvm/backends/OpenCL/exceptions.hpp" // plssvm::opencl::backend_exception
 #include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT
+#include "plssvm/detail/type_list.hpp" // plssvm::detail::{remove_cvref_t, is_variant_v}
+#include "plssvm/detail/utility.hpp" // plssvm::detail::visit_overload
 #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type
 #include "plssvm/target_platforms.hpp" // plssvm::target_platform
 
-#include "CL/cl.h" // cl_uint, cl_int, clSetKernelArg, clEnqueueNDRangeKernel, clFinish
+#include "CL/cl.h" // cl_uint, cl_int, clSetKernelArg, clSetKernelArgSVMPointer, clEnqueueNDRangeKernel, clFinish
 
 #include "fmt/format.h" // fmt::format
 
 #include <cstddef> // std::size_t
 #include <string> // std::string
 #include <string_view> // std::string_view
 #include <utility> // std::forward, std::pair
+#include <variant> // std::variant, std::visit
 #include <vector> // std::vector
 
 /**
@@ -141,7 +144,17 @@ inline void set_kernel_args(cl_kernel kernel, Args... args) {
 cl_uint i = 0;
 // iterate over parameter pack and set OpenCL kernel
 ([&](auto &arg) {
-    const error_code ec = clSetKernelArg(kernel, i++, sizeof(decltype(arg)), &arg);
+    error_code ec{};
+    // check if we have to set a variant value
+    if constexpr (::plssvm::detail::is_variant_v<::plssvm::detail::remove_cvref_t<decltype(arg)>>) {
+        std::visit(::plssvm::detail::visit_overload{
+                       [&](cl_mem &kernel_arg) { ec = clSetKernelArg(kernel, i++, sizeof(decltype(kernel_arg)), &kernel_arg); },
+                       [&](auto &kernel_arg) { ec = clSetKernelArgSVMPointer(kernel, i++, kernel_arg); } },
+                   arg);
+    } else {
+        // set kernel argument normally
+        ec = clSetKernelArg(kernel, i++, sizeof(decltype(arg)), &arg);
+    }
     PLSSVM_OPENCL_ERROR_CHECK(ec, fmt::format("error setting OpenCL kernel argument {}", i - 1))
 }(args),
 ...);
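
The visit_overload helper used above is presumably the standard overload-set idiom; a minimal self-contained sketch of that pattern:

// Inherit from several lambdas and pull in all their call operators, so
// std::visit can dispatch on the variant's active alternative.
template <typename... Ts>
struct visit_overload : Ts... {
    using Ts::operator()...;
};
// deduction guide (required in C++17, implicit in C++20)
template <typename... Ts>
visit_overload(Ts...) -> visit_overload<Ts...>;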
2 changes: 1 addition & 1 deletion include/plssvm/backends/SYCL/AdaptiveCpp/csvm.hpp
@@ -168,7 +168,7 @@ class csvm : public ::plssvm::detail::gpu_csvm<detail::device_ptr, detail::queue
 /**
  * @copydoc plssvm::detail::gpu_csvm::run_assemble_kernel_matrix_explicit
  */
-[[nodiscard]] device_ptr_type run_assemble_kernel_matrix_explicit(std::size_t device_id, const ::plssvm::detail::execution_range &exec, const parameter &params, const device_ptr_type &data_d, const device_ptr_type &q_red_d, real_type QA_cost) const final;
+[[nodiscard]] device_ptr_type run_assemble_kernel_matrix_explicit(std::size_t device_id, const ::plssvm::detail::execution_range &exec, const parameter &params, bool use_usm_allocations, const device_ptr_type &data_d, const device_ptr_type &q_red_d, real_type QA_cost) const final;
 /**
  * @copydoc plssvm::detail::gpu_csvm::run_blas_level_3_kernel_explicit
  */
10 changes: 7 additions & 3 deletions include/plssvm/backends/SYCL/AdaptiveCpp/detail/device_ptr.hpp
@@ -33,6 +33,7 @@ class device_ptr : public ::plssvm::detail::gpu_device_ptr<T, queue, T *, device
 using base_type::data_;
 using base_type::queue_;
 using base_type::shape_;
+using base_type::use_usm_allocations_;
 
 public:
 // Be able to use overloaded base class functions.
@@ -58,21 +59,24 @@
  * @brief Allocates `size * sizeof(T)` bytes on the device associated with @p q.
  * @param[in] size the number of elements represented by the device_ptr
  * @param[in] q the associated SYCL queue
+ * @param[in] use_usm_allocations if `true` use USM allocations
  */
-device_ptr(size_type size, const queue &q);
+device_ptr(size_type size, const queue &q, bool use_usm_allocations = false);
 /**
  * @brief Allocates `shape.x * shape.y * sizeof(T)` bytes on the device associated with @p q.
  * @param[in] shape the number of elements represented by the device_ptr
  * @param[in] q the associated SYCL queue
+ * @param[in] use_usm_allocations if `true` use USM allocations
  */
-device_ptr(plssvm::shape shape, const queue &q);
+device_ptr(plssvm::shape shape, const queue &q, bool use_usm_allocations = false);
 /**
  * @brief Allocates `(shape.x + padding.x) * (shape.y + padding.y) * sizeof(T)` bytes on the device associated with @p q.
  * @param[in] shape the number of elements represented by the device_ptr
  * @param[in] padding the number of padding elements added to the extent values
  * @param[in] q the associated SYCL queue
+ * @param[in] use_usm_allocations if `true` use USM allocations
  */
-device_ptr(plssvm::shape shape, plssvm::shape padding, const queue &q);
+device_ptr(plssvm::shape shape, plssvm::shape padding, const queue &q, bool use_usm_allocations = false);
 
 /**
  * @copydoc plssvm::detail::gpu_device_ptr::gpu_device_ptr(const plssvm::detail::gpu_device_ptr &)
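
In SYCL, the flag plausibly selects between device and shared USM; a sketch under that assumption:

#include <sycl/sycl.hpp>

#include <cstddef>

// Hypothetical SYCL analogue of the allocation switch: shared USM migrates
// between host and device, device USM stays resident on the device.
template <typename T>
T *allocate_device_memory(const std::size_t count, sycl::queue &q, const bool use_usm_allocations) {
    return use_usm_allocations
               ? sycl::malloc_shared<T>(count, q)   // USM shared allocation
               : sycl::malloc_device<T>(count, q);  // USM device allocation
}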
2 changes: 1 addition & 1 deletion include/plssvm/backends/SYCL/DPCPP/csvm.hpp
@@ -168,7 +168,7 @@ class csvm : public ::plssvm::detail::gpu_csvm<detail::device_ptr, detail::queue
 /**
  * @copydoc plssvm::detail::gpu_csvm::run_assemble_kernel_matrix_explicit
  */
-[[nodiscard]] device_ptr_type run_assemble_kernel_matrix_explicit(std::size_t device_id, const ::plssvm::detail::execution_range &exec, const parameter &params, const device_ptr_type &data_d, const device_ptr_type &q_red_d, real_type QA_cost) const final;
+[[nodiscard]] device_ptr_type run_assemble_kernel_matrix_explicit(std::size_t device_id, const ::plssvm::detail::execution_range &exec, const parameter &params, bool use_usm_allocations, const device_ptr_type &data_d, const device_ptr_type &q_red_d, real_type QA_cost) const final;
 /**
  * @copydoc plssvm::detail::gpu_csvm::run_blas_level_3_kernel_explicit
  */
10 changes: 7 additions & 3 deletions include/plssvm/backends/SYCL/DPCPP/detail/device_ptr.hpp
@@ -33,6 +33,7 @@ class device_ptr : public ::plssvm::detail::gpu_device_ptr<T, queue, T *, device
 using base_type::data_;
 using base_type::queue_;
 using base_type::shape_;
+using base_type::use_usm_allocations_;
 
 public:
 // Be able to use overloaded base class functions.
@@ -58,21 +59,24 @@
  * @brief Allocates `size * sizeof(T)` bytes on the device associated with @p q.
  * @param[in] size the number of elements represented by the device_ptr
  * @param[in] q the associated SYCL queue
+ * @param[in] use_usm_allocations if `true` use USM allocations
  */
-device_ptr(size_type size, const queue &q);
+device_ptr(size_type size, const queue &q, bool use_usm_allocations = false);
 /**
  * @brief Allocates `shape.x * shape.y * sizeof(T)` bytes on the device associated with @p q.
  * @param[in] shape the number of elements represented by the device_ptr
  * @param[in] q the associated SYCL queue
+ * @param[in] use_usm_allocations if `true` use USM allocations
  */
-device_ptr(plssvm::shape shape, const queue &q);
+device_ptr(plssvm::shape shape, const queue &q, bool use_usm_allocations = false);
 /**
  * @brief Allocates `(shape.x + padding.x) * (shape.y + padding.y) * sizeof(T)` bytes on the device associated with @p q.
  * @param[in] shape the number of elements represented by the device_ptr
  * @param[in] padding the number of padding elements added to the extent values
  * @param[in] q the associated SYCL queue
+ * @param[in] use_usm_allocations if `true` use USM allocations
  */
-device_ptr(plssvm::shape shape, plssvm::shape padding, const queue &q);
+device_ptr(plssvm::shape shape, plssvm::shape padding, const queue &q, bool use_usm_allocations = false);
 
 /**
  * @copydoc plssvm::detail::gpu_device_ptr::gpu_device_ptr(const plssvm::detail::gpu_device_ptr &)
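
Finally, a hypothetical call site for the extended constructors; the dimensions and the queue q are illustrative values, not taken from this PR:

// Device-only allocation, as before (the new flag defaults to false).
device_ptr<float> explicit_matrix{ plssvm::shape{ 4096, 4096 }, q };
// USM allocation, as used by the cg_streaming solver for the kernel matrix.
device_ptr<float> streaming_matrix{ plssvm::shape{ 4096, 4096 }, q, /*use_usm_allocations=*/true };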