From af2e8843a2395f20edcee6cb09047be62628c3eb Mon Sep 17 00:00:00 2001
From: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>
Date: Mon, 25 Nov 2024 17:08:54 +0900
Subject: [PATCH 01/38] feat: moved the cuda pointcloud preprocessor and
 organized from a personal repository

Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>
---
 .../CMakeLists.txt                            | 100 +++
 .../README.md                                 |  20 +
 .../cuda_pointcloud_preprocessor.param.yaml   |  22 +
 .../docs/cuda-organized-pointcloud-adapter.md |  41 ++
 .../docs/cuda-pointcloud-preprocessor.md      |  46 ++
 ...cuda_organized_pointcloud_adapter_node.hpp |  77 ++
 .../cuda_pointcloud_preprocessor.hpp          | 154 ++++
 .../cuda_pointcloud_preprocessor_node.hpp     | 114 +++
 .../point_types.hpp                           |  50 ++
 .../cuda_pointcloud_preprocessor.launch.xml   |  16 +
 .../package.xml                               |  32 +
 .../cuda_pointcloud_preprocessor.schema.json  | 140 ++++
 ...cuda_organized_pointcloud_adapter_node.cpp | 203 +++++
 .../cuda_pointcloud_preprocessor.cu           | 692 ++++++++++++++++++
 .../cuda_pointcloud_preprocessor_node.cpp     | 253 +++++++
 15 files changed, 1960 insertions(+)
 create mode 100644 sensing/autoware_cuda_pointcloud_preprocessor/CMakeLists.txt
 create mode 100644 sensing/autoware_cuda_pointcloud_preprocessor/README.md
 create mode 100644 sensing/autoware_cuda_pointcloud_preprocessor/config/cuda_pointcloud_preprocessor.param.yaml
 create mode 100644 sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-organized-pointcloud-adapter.md
 create mode 100644 sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-pointcloud-preprocessor.md
 create mode 100644 sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.hpp
 create mode 100644 sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp
 create mode 100644 sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp
 create mode 100644 sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/point_types.hpp
 create mode 100644 sensing/autoware_cuda_pointcloud_preprocessor/launch/cuda_pointcloud_preprocessor.launch.xml
 create mode 100644 sensing/autoware_cuda_pointcloud_preprocessor/package.xml
 create mode 100644 sensing/autoware_cuda_pointcloud_preprocessor/schema/cuda_pointcloud_preprocessor.schema.json
 create mode 100644 sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.cpp
 create mode 100644 sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu
 create mode 100644 sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp

diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/CMakeLists.txt b/sensing/autoware_cuda_pointcloud_preprocessor/CMakeLists.txt
new file mode 100644
index 0000000000000..06b1f5d67540f
--- /dev/null
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/CMakeLists.txt
@@ -0,0 +1,100 @@
+cmake_minimum_required(VERSION 3.14)
+project(autoware_cuda_pointcloud_preprocessor)
+
+find_package(ament_cmake_auto REQUIRED)
+find_package(CUDA)
+find_package(PCL REQUIRED)
+
+if(NOT ${CUDA_FOUND})
+  message(WARNING "cuda was not found, so the autoware_cuda_pointcloud_preprocessor package will not be built.")
+  return()
+endif()
+
+ament_auto_find_build_dependencies()
+
+# Default to C++17
+if (NOT CMAKE_CXX_STANDARD)
+    set(CMAKE_CXX_STANDARD 17)
+endif ()
+
+if (CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+    add_compile_options(-Wall -Wextra -Wpedantic -Wunused-function)
+endif ()
+
+if(BUILD_TESTING)
+    find_package(ament_lint_auto REQUIRED)
+    ament_lint_auto_find_test_dependencies()
+endif()
+
+include_directories(
+    include
+    SYSTEM
+    ${CUDA_INCLUDE_DIRS}
+)
+
+list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr -diag-suppress 20012")
+
+cuda_add_library(cuda_pointcloud_preprocessor_lib SHARED
+  src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu
+)
+
+target_link_libraries(cuda_pointcloud_preprocessor_lib
+  ${PCL_LIBRARIES}
+)
+
+target_include_directories(cuda_pointcloud_preprocessor_lib SYSTEM PRIVATE
+  ${autoware_point_types_INCLUDE_DIRS}
+  ${cuda_blackboard_INCLUDE_DIRS}
+  ${geometry_msgs_INCLUDE_DIRS}
+  ${nebula_common_INCLUDE_DIRS}
+  ${rclcpp_INCLUDE_DIRS}
+  ${rcl_interfaces_INCLUDE_DIRS}
+  ${sensor_msgs_INCLUDE_DIRS}
+  ${tf2_INCLUDE_DIRS}
+)
+
+# Targets
+ament_auto_add_library(cuda_organized_pointcloud_adapter SHARED
+  src/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.cpp
+)
+
+target_link_libraries(cuda_organized_pointcloud_adapter
+  ${CUDA_LIBRARIES}
+)
+
+ament_auto_add_library(cuda_pointcloud_preprocessor SHARED
+  src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp
+)
+
+target_link_libraries(cuda_pointcloud_preprocessor
+  ${CUDA_LIBRARIES}
+  cuda_pointcloud_preprocessor_lib
+)
+
+rclcpp_components_register_node(cuda_organized_pointcloud_adapter
+  PLUGIN "autoware::cuda_organized_pointcloud_adapter::CudaOrganizedPointcloudAdapterNode"
+  EXECUTABLE cuda_organized_pointcloud_adapter_node
+)
+
+rclcpp_components_register_node(cuda_pointcloud_preprocessor
+  PLUGIN "autoware::cuda_pointcloud_preprocessor::CudaPointcloudPreprocessorNode"
+  EXECUTABLE cuda_pointcloud_preprocessor_node
+)
+
+install(DIRECTORY launch config
+  DESTINATION share/${PROJECT_NAME}
+)
+
+ament_auto_package()
+
+# Set ROS_DISTRO macros
+set(ROS_DISTRO $ENV{ROS_DISTRO})
+if(${ROS_DISTRO} STREQUAL "rolling")
+    add_compile_definitions(ROS_DISTRO_ROLLING)
+elseif(${ROS_DISTRO} STREQUAL "foxy")
+    add_compile_definitions(ROS_DISTRO_FOXY)
+elseif(${ROS_DISTRO} STREQUAL "galactic")
+    add_compile_definitions(ROS_DISTRO_GALACTIC)
+elseif(${ROS_DISTRO} STREQUAL "humble")
+    add_compile_definitions(ROS_DISTRO_HUMBLE)
+endif()
diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/README.md b/sensing/autoware_cuda_pointcloud_preprocessor/README.md
new file mode 100644
index 0000000000000..d15082b69a661
--- /dev/null
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/README.md
@@ -0,0 +1,20 @@
+# autoware_cuda_pointcloud_preprocessor
+
+## Purpose
+
+The pointcloud preprocessing implemented in `autoware_pointcloud_preprocessor` has been thoroughly tested in autoware. However, the latency it introduces does not scale well with modern LiDAR devices due to the high number of points they introduce.
+
+To alleviate this issue, this package reimplements most of the pipeline presented in `autoware_pointcloud_preprocessor` leveraging the use of GPGPUs. In particular, this package makes use of CUDA to provide accelerated versions of the already establishes implementations, while also maintaining compatibility with normal ROS nodes/topics.
+
+## Inner-workings / Algorithms
+
+A detailed description of each filter's algorithm is available in the following links.
+
+| Filter Name                       | Description                                                                                                                                  | Detail                                            |
+| --------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------- |
+| cuda_organized_pointcloud_adapter | Organizes a pointcloud per ring/channel, so that the memory layout allows parallel processing in cuda                                        | [link](docs/cuda-organized-pointcloud-adapter.md) |
+| cuda_pointcloud_preprocessor      | Implements the cropping, distortion correction, and outlier filtering (ring-based) of the `autoware_pointcloud_preprocessor`'s cpu versions. | [link](docs/cuda-pointcloud-preprocessor.md)      |
+
+## (Optional) Future extensions / Unimplemented parts
+
+The subsample filters implemented in `autoware_pointcloud_preprocessor` will have similar counterparts in this package.
diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/config/cuda_pointcloud_preprocessor.param.yaml b/sensing/autoware_cuda_pointcloud_preprocessor/config/cuda_pointcloud_preprocessor.param.yaml
new file mode 100644
index 0000000000000..22358cfadf2a7
--- /dev/null
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/config/cuda_pointcloud_preprocessor.param.yaml
@@ -0,0 +1,22 @@
+/**:
+  ros__parameters:
+
+    base_frame: base_link
+    use_imu: true
+    use_3d_distortion_correction: false
+    distance_ratio: 1.03
+    object_length_threshold: 0.1
+    num_points_threshold: 4
+
+    self_crop.min_x: 1.0
+    self_crop.min_y: 1.0
+    self_crop.min_z: 1.0
+    self_crop.max_x: -1.0
+    self_crop.max_y: -1.0
+    self_crop.max_z: -1.0
+    mirror_crop.min_x: 1.0
+    mirror_crop.min_y: 1.0
+    mirror_crop.min_z: 1.0
+    mirror_crop.max_x: -1.0
+    mirror_crop.max_y: -1.0
+    mirror_crop.max_z: -1.0
diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-organized-pointcloud-adapter.md b/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-organized-pointcloud-adapter.md
new file mode 100644
index 0000000000000..634cdfb3e14f0
--- /dev/null
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-organized-pointcloud-adapter.md
@@ -0,0 +1,41 @@
+# cuda_organized_pointcloud_adapter
+
+## Purpose
+
+The node `cuda_pointcloud_preprocessor` expects a 2D pointcloud where every row represents a single channel/ring and each row's points are in non decreasing azimuth order.
+
+To utilize the previously mentioned node, this node provides an adapter to convert standard flat pointclouds (`height` equals to 1) to the required 2D tensor.
+
+In addition, this node uses the `cuda_blackboard`, a cuda transport layer that enables a zero-copy mechanism between GPU and GPU memory for both input and output.
+
+## Inner-workings / Algorithms
+
+To create the required 2D tensor, this node iterates the input pointcloud sequentially, filling the output 2D tensor depending on the input point's channel.
+
+The output tensor's size is also estimated in this node, based on the largest `channel` value and the maximum number of points per channel observed so far.
+
+## Inputs / Outputs
+
+### Input
+
+| Name                 | Type                            | Description               |
+| -------------------- | ------------------------------- | ------------------------- |
+| `~/input/pointcloud` | `sensor_msgs::msg::PointCloud2` | Input pointcloud's topic. |
+
+### Output
+
+| Name                       | Type                                             | Description                              |
+| -------------------------- | ------------------------------------------------ | ---------------------------------------- |
+| `~/output/pointcloud`      | `sensor_msgs::msg::PointCloud2`                  | Processed pointcloud's topic             |
+| `~/output/pointcloud/cuda` | `negotiated_interfaces/msg/NegotiatedTopicsInfo` | Processed pointcloud's negotiation topic |
+
+## Parameters
+
+### Core Parameters
+
+{{ json_to_markdown("sensing/autoware_cuda_pointcloud_preprocessor/schema/cuda_pointcloud_preprocessor.schema.schema.json") }}
+
+## Assumptions / Known limits
+
+- This algorithm assumes that the input points will be in non-decreasing azimuth order (per ring).
+- This node expects that the input pointcloud is flat (`height` equals to 1) and follows the `autoware::point_types::PointXYZIRC` layout defined in the `autoware_point_types` package.
diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-pointcloud-preprocessor.md b/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-pointcloud-preprocessor.md
new file mode 100644
index 0000000000000..459f15569d7ca
--- /dev/null
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-pointcloud-preprocessor.md
@@ -0,0 +1,46 @@
+# cuda_pointcloud_preprocessor
+
+## Purpose
+
+This node implements all standard pointcloud preprocessing algorithms applied to a single LiDAR's pointcloud in CUDA.
+In particular, this node implements:
+
+- crop boxing (ego-vehicle and ego-vehicle's mirrors)
+- distortion correction
+- ring-based outlier filtering
+
+## Inner-workings / Algorithms
+
+As this node reimplements the functionalities of the CPU-version algorithms, please have a look at the documentations of [crop-box](/sensing/autoware_pointcloud_preprocessor/docs/crop-box-filter.md), [distortion correction](/sensing/autoware_pointcloud_preprocessor/docs/distortion-corrector.md), and [ring-based outlier filter](/sensing/autoware_pointcloud_preprocessor/docs/ring-outlier-filter.md) for more information about these algorithms.
+
+In addition to the individual algorithms previously mentioned, this node uses the `cuda_blackboard`, a cuda transport layer that enables a zero-copy mechanism between GPU and GPU memory for both input and output.
+
+## Inputs / Outputs
+
+### Input
+
+| Name                      | Type                                             | Description                               |
+| ------------------------- | ------------------------------------------------ | ----------------------------------------- |
+| `~/input/pointcloud`      | `sensor_msgs::msg::PointCloud2`                  | Input pointcloud's topic.                 |
+| `~/input/pointcloud/cuda` | `negotiated_interfaces/msg/NegotiatedTopicsInfo` | Input pointcloud's type negotiation topic |
+| `~/input/twist`           | `geometry_msgs::msg::TwistWithCovarianceStamped` | Topic of the twist information.           |
+| `~/input/imu`             | `sensor_msgs::msg::Imu`                          | Topic of the IMU data.                    |
+
+### Output
+
+| Name                       | Type                                             | Description                              |
+| -------------------------- | ------------------------------------------------ | ---------------------------------------- |
+| `~/output/pointcloud`      | `sensor_msgs::msg::PointCloud2`                  | Processed pointcloud's topic             |
+| `~/output/pointcloud/cuda` | `negotiated_interfaces/msg/NegotiatedTopicsInfo` | Processed pointcloud's negotiation topic |
+
+## Parameters
+
+### Core Parameters
+
+{{ json_to_markdown("sensing/autoware_cuda_pointcloud_preprocessor/schema/cuda_pointcloud_preprocessor.schema.schema.json") }}
+
+## Assumptions / Known limits
+
+- The CUDA implementations, while following the original CPU ones, will not offer the same numerical results, and small approximations were needed to maximize the GPU use.
+- This node expects that the input pointcloud follows the `autoware::point_types::PointXYZIRCAEDT` layout and the output pointcloud will use the `autoware::point_types::PointXYZIRC` layout defined in the `autoware_point_types` package.
+- The input pointcloud must be a 2D tensor where each row represents a different channel/ring with its points in non-decreasing azimuth order. Invalid points should contain 0-values. The `cuda_organized_pointcloud_adapter` provides such a pointcloud.
diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.hpp
new file mode 100644
index 0000000000000..8930a1f87e63b
--- /dev/null
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.hpp
@@ -0,0 +1,77 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef AUTOWARE__CUDA_ORGANIZED_POINTCLOUD_ADAPTER__CUDA_ORGANIZED_POINTCLOUD_ADAPTER_NODE_HPP_
+#define AUTOWARE__CUDA_ORGANIZED_POINTCLOUD_ADAPTER__CUDA_ORGANIZED_POINTCLOUD_ADAPTER_NODE_HPP_
+
+#include <autoware/point_types/types.hpp>
+#include <autoware/universe_utils/ros/debug_publisher.hpp>
+#include <autoware/universe_utils/system/stop_watch.hpp>
+#include <cuda_blackboard/cuda_adaptation.hpp>
+#include <cuda_blackboard/cuda_blackboard_publisher.hpp>
+#include <cuda_blackboard/cuda_pointcloud2.hpp>
+#include <rclcpp/rclcpp.hpp>
+
+#include <sensor_msgs/msg/point_cloud2.hpp>
+
+#include <tf2/transform_datatypes.h>
+#include <tf2_ros/buffer.h>
+#include <tf2_ros/transform_listener.h>
+
+#include <chrono>
+#include <deque>
+#include <memory>
+#include <vector>
+
+namespace autoware::cuda_organized_pointcloud_adapter
+{
+
+class CudaOrganizedPointcloudAdapterNode : public rclcpp::Node
+{
+public:
+  explicit CudaOrganizedPointcloudAdapterNode(const rclcpp::NodeOptions & node_options);
+  ~CudaOrganizedPointcloudAdapterNode() = default;
+
+private:
+  void estimatePointcloudRingInfo(
+    const sensor_msgs::msg::PointCloud2::ConstSharedPtr input_pointcloud_msg_ptr);
+
+  bool orderPointcloud(
+    const sensor_msgs::msg::PointCloud2::ConstSharedPtr input_pointcloud_msg_ptr);
+
+  // Callback
+  void pointcloudCallback(
+    const sensor_msgs::msg::PointCloud2::ConstSharedPtr input_pointcloud_msg_ptr);
+
+  // Subscriber
+  rclcpp::Subscription<sensor_msgs::msg::PointCloud2>::SharedPtr pointcloud_sub_{};
+
+  // CUDA pub
+  std::unique_ptr<cuda_blackboard::CudaBlackboardPublisher<cuda_blackboard::CudaPointCloud2>> pub_;
+
+  std::size_t num_rings_{0};
+  std::size_t max_points_per_ring_{0};
+
+  std::vector<std::size_t> next_ring_index_;
+  std::vector<autoware::point_types::PointXYZIRCAEDT> buffer_;
+  // autoware::point_types::PointXYZIRCAEDT * device_buffer_;
+  cuda_blackboard::CudaUniquePtr<std::uint8_t[]> device_buffer_;
+
+  std::unique_ptr<autoware::universe_utils::StopWatch<std::chrono::milliseconds>> stop_watch_ptr_;
+  std::unique_ptr<autoware::universe_utils::DebugPublisher> debug_publisher_;
+};
+
+}  // namespace autoware::cuda_organized_pointcloud_adapter
+
+#endif  // AUTOWARE__CUDA_ORGANIZED_POINTCLOUD_ADAPTER__CUDA_ORGANIZED_POINTCLOUD_ADAPTER_NODE_HPP_
diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp
new file mode 100644
index 0000000000000..b4ae4eecc304e
--- /dev/null
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp
@@ -0,0 +1,154 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef AUTOWARE__CUDA_POINTCLOUD_PREPROCESSOR__CUDA_POINTCLOUD_PREPROCESSOR_HPP_
+#define AUTOWARE__CUDA_POINTCLOUD_PREPROCESSOR__CUDA_POINTCLOUD_PREPROCESSOR_HPP_
+
+#include "autoware/cuda_pointcloud_preprocessor/point_types.hpp"
+
+#include <Eigen/Core>
+#include <Eigen/Geometry>
+#include <cuda_blackboard/cuda_pointcloud2.hpp>
+
+#include <geometry_msgs/msg/transform_stamped.hpp>
+#include <geometry_msgs/msg/twist_with_covariance_stamped.hpp>
+#include <geometry_msgs/msg/vector3_stamped.hpp>
+#include <sensor_msgs/msg/point_cloud2.hpp>
+
+#include <thrust/device_vector.h>
+
+#include <cmath>
+#include <deque>
+
+namespace autoware::cuda_pointcloud_preprocessor
+{
+
+struct TwistStruct2D
+{
+  float cum_x;
+  float cum_y;
+  float cum_theta;
+  float cum_cos_theta;
+  float cum_sin_theta;
+  uint32_t last_stamp_nsec;  // relative to the start of the pointcloud
+  uint32_t stamp_nsec;       // relative to the start of the pointcloud
+  float vx;
+  float vtheta;
+};
+
+struct TwistStruct3D
+{
+  // Eigen::Affine3f cum_transform;
+  float cum_transform_buffer[16];
+  /* float cum_x;
+  float cum_y;
+  float cum_theta; */
+  /* float cum_cos_theta;
+  float cum_sin_theta; */
+  uint32_t last_stamp_nsec;  // relative to the start of the pointcloud
+  uint32_t stamp_nsec;       // relative to the start of the pointcloud
+  /* float vx;
+  float vtheta; */
+  float v[3];
+  float w[3];
+};
+
+struct TransformStruct
+{
+  float x;
+  float y;
+  float z;
+  float m11;
+  float m12;
+  float m13;
+  float m21;
+  float m22;
+  float m23;
+  float m31;
+  float m32;
+  float m33;
+};
+
+struct CropBoxParameters
+{
+  float min_x{std::nanf("")};
+  float max_x{std::nanf("")};
+  float min_y{std::nanf("")};
+  float max_y{std::nanf("")};
+  float min_z{std::nanf("")};
+  float max_z{std::nanf("")};
+};
+
+struct RingOutlierFilterParameters
+{
+  float distance_ratio{std::nanf("")};
+  float object_length_threshold{std::nanf("")};
+  std::size_t num_points_threshold{0};
+};
+
+class CudaPointcloudPreprocessor
+{
+public:
+  CudaPointcloudPreprocessor();
+  ~CudaPointcloudPreprocessor() = default;
+
+  void setCropBoxParameters(
+    const CropBoxParameters & self_crop_box_parameters,
+    const CropBoxParameters & mirror_crop_box_parameters);
+  void setRingOutlierFilterParameters(const RingOutlierFilterParameters & ring_outlier_parameters);
+  void set3DUndistortion(bool value);
+
+  void preallocateOutput();
+
+  std::unique_ptr<cuda_blackboard::CudaPointCloud2> process(
+    const cuda_blackboard::CudaPointCloud2 & input_pointcloud_msg,
+    const geometry_msgs::msg::TransformStamped & transform_msg,
+    const std::deque<geometry_msgs::msg::TwistWithCovarianceStamped> & twist_queue,
+    const std::deque<geometry_msgs::msg::Vector3Stamped> & angular_velocity_queue);
+
+private:
+  void setupTwist2DStructs(
+    const cuda_blackboard::CudaPointCloud2 & input_pointcloud_msg,
+    const std::deque<geometry_msgs::msg::TwistWithCovarianceStamped> & twist_queue,
+    const std::deque<geometry_msgs::msg::Vector3Stamped> & angular_velocity_queue);
+
+  void setupTwist3DStructs(
+    const cuda_blackboard::CudaPointCloud2 & input_pointcloud_msg,
+    const std::deque<geometry_msgs::msg::TwistWithCovarianceStamped> & twist_queue,
+    const std::deque<geometry_msgs::msg::Vector3Stamped> & angular_velocity_queue);
+
+  CropBoxParameters self_crop_box_parameters_{};
+  CropBoxParameters mirror_crop_box_parameters_{};
+  RingOutlierFilterParameters ring_outlier_parameters_{};
+  bool use_3d_undistortion_{false};
+
+  int max_rings_{};
+  int max_points_per_ring_{};
+
+  std::vector<sensor_msgs::msg::PointField> point_fields_{};
+  std::unique_ptr<cuda_blackboard::CudaPointCloud2> output_pointcloud_ptr_{};
+
+  thrust::device_vector<InputPointType> device_transformed_points_{};
+  thrust::device_vector<OutputPointType> device_output_points_{};
+  thrust::device_vector<uint32_t> device_self_crop_mask_{};
+  thrust::device_vector<uint32_t> device_mirror_crop_mask_{};
+  thrust::device_vector<uint32_t> device_ring_outlier_mask_{};
+  thrust::device_vector<uint32_t> device_indices_{};
+  thrust::device_vector<TwistStruct2D> device_twist_2d_structs_{};
+  thrust::device_vector<TwistStruct3D> device_twist_3d_structs_{};
+};
+
+}  // namespace autoware::cuda_pointcloud_preprocessor
+
+#endif  // AUTOWARE__CUDA_POINTCLOUD_PREPROCESSOR__CUDA_POINTCLOUD_PREPROCESSOR_HPP_
diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp
new file mode 100644
index 0000000000000..5fdf4258eebcb
--- /dev/null
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp
@@ -0,0 +1,114 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef AUTOWARE__CUDA_POINTCLOUD_PREPROCESSOR__CUDA_POINTCLOUD_PREPROCESSOR_NODE_HPP_
+#define AUTOWARE__CUDA_POINTCLOUD_PREPROCESSOR__CUDA_POINTCLOUD_PREPROCESSOR_NODE_HPP_
+
+#include "autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp"
+#include "autoware/cuda_pointcloud_preprocessor/point_types.hpp"
+
+#include <autoware/point_types/types.hpp>
+#include <autoware/universe_utils/ros/debug_publisher.hpp>
+#include <autoware/universe_utils/system/stop_watch.hpp>
+#include <cuda_blackboard/cuda_adaptation.hpp>
+#include <cuda_blackboard/cuda_blackboard_publisher.hpp>
+#include <cuda_blackboard/cuda_blackboard_subscriber.hpp>
+#include <cuda_blackboard/cuda_pointcloud2.hpp>
+#include <rclcpp/rclcpp.hpp>
+
+#include <geometry_msgs/msg/twist_with_covariance_stamped.hpp>
+#include <geometry_msgs/msg/vector3_stamped.hpp>
+#include <sensor_msgs/msg/imu.hpp>
+#include <sensor_msgs/msg/point_cloud2.hpp>
+
+#include <tf2/transform_datatypes.h>
+#include <tf2_ros/buffer.h>
+#include <tf2_ros/transform_listener.h>
+
+#include <chrono>
+#include <deque>
+#include <memory>
+#include <vector>
+
+#define CHECK_OFFSET(structure1, structure2, field)             \
+  static_assert(                                                \
+    offsetof(structure1, field) == offsetof(structure2, field), \
+    "Offset of " #field " in " #structure1 " does not match expected offset.")
+
+namespace autoware::cuda_pointcloud_preprocessor
+{
+
+static_assert(sizeof(InputPointType) == sizeof(autoware::point_types::PointXYZIRCAEDT));
+static_assert(sizeof(OutputPointType) == sizeof(autoware::point_types::PointXYZIRC));
+
+CHECK_OFFSET(InputPointType, autoware::point_types::PointXYZIRCAEDT, x);
+CHECK_OFFSET(InputPointType, autoware::point_types::PointXYZIRCAEDT, y);
+CHECK_OFFSET(InputPointType, autoware::point_types::PointXYZIRCAEDT, z);
+CHECK_OFFSET(InputPointType, autoware::point_types::PointXYZIRCAEDT, intensity);
+CHECK_OFFSET(InputPointType, autoware::point_types::PointXYZIRCAEDT, return_type);
+CHECK_OFFSET(InputPointType, autoware::point_types::PointXYZIRCAEDT, channel);
+CHECK_OFFSET(InputPointType, autoware::point_types::PointXYZIRCAEDT, azimuth);
+CHECK_OFFSET(InputPointType, autoware::point_types::PointXYZIRCAEDT, elevation);
+CHECK_OFFSET(InputPointType, autoware::point_types::PointXYZIRCAEDT, distance);
+CHECK_OFFSET(InputPointType, autoware::point_types::PointXYZIRCAEDT, time_stamp);
+
+CHECK_OFFSET(OutputPointType, autoware::point_types::PointXYZIRCAEDT, x);
+CHECK_OFFSET(OutputPointType, autoware::point_types::PointXYZIRCAEDT, y);
+CHECK_OFFSET(OutputPointType, autoware::point_types::PointXYZIRCAEDT, z);
+CHECK_OFFSET(OutputPointType, autoware::point_types::PointXYZIRCAEDT, intensity);
+CHECK_OFFSET(OutputPointType, autoware::point_types::PointXYZIRCAEDT, return_type);
+CHECK_OFFSET(OutputPointType, autoware::point_types::PointXYZIRCAEDT, channel);
+
+class CudaPointcloudPreprocessorNode : public rclcpp::Node
+{
+public:
+  explicit CudaPointcloudPreprocessorNode(const rclcpp::NodeOptions & node_options);
+  ~CudaPointcloudPreprocessorNode() = default;
+
+private:
+  bool getTransform(
+    const std::string & target_frame, const std::string & source_frame,
+    tf2::Transform * tf2_transform_ptr);
+
+  // Callback
+  void pointcloudCallback(const std::shared_ptr<const cuda_blackboard::CudaPointCloud2> cuda_msg);
+  void twistCallback(
+    const geometry_msgs::msg::TwistWithCovarianceStamped::ConstSharedPtr pointcloud_msg);
+  void imuCallback(const sensor_msgs::msg::Imu::ConstSharedPtr imu_msg);
+
+  tf2_ros::Buffer tf2_buffer_;
+  tf2_ros::TransformListener tf2_listener_;
+
+  std::string base_frame_;
+  bool imu_tranform_valid_{false};
+  std::deque<geometry_msgs::msg::TwistWithCovarianceStamped> twist_queue_;
+  std::deque<geometry_msgs::msg::Vector3Stamped> angular_velocity_queue_;
+
+  // Subscriber
+  rclcpp::Subscription<geometry_msgs::msg::TwistWithCovarianceStamped>::SharedPtr twist_sub_{};
+  rclcpp::Subscription<sensor_msgs::msg::Imu>::SharedPtr imu_sub_{};
+
+  // CUDA pub & sub
+  std::unique_ptr<cuda_blackboard::CudaBlackboardPublisher<cuda_blackboard::CudaPointCloud2>> pub_;
+  std::unique_ptr<cuda_blackboard::CudaBlackboardSubscriber<cuda_blackboard::CudaPointCloud2>> sub_;
+
+  std::unique_ptr<CudaPointcloudPreprocessor> cuda_pointcloud_preprocessor_;
+
+  std::unique_ptr<autoware::universe_utils::StopWatch<std::chrono::milliseconds>> stop_watch_ptr_;
+  std::unique_ptr<autoware::universe_utils::DebugPublisher> debug_publisher_;
+};
+
+}  // namespace autoware::cuda_pointcloud_preprocessor
+
+#endif  // AUTOWARE__CUDA_POINTCLOUD_PREPROCESSOR__CUDA_POINTCLOUD_PREPROCESSOR_NODE_HPP_
diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/point_types.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/point_types.hpp
new file mode 100644
index 0000000000000..50010ac0f2b67
--- /dev/null
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/point_types.hpp
@@ -0,0 +1,50 @@
+
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef AUTOWARE__CUDA_POINTCLOUD_PREPROCESSOR__POINT_TYPES_HPP_
+#define AUTOWARE__CUDA_POINTCLOUD_PREPROCESSOR__POINT_TYPES_HPP_
+
+#include <cstdint>
+
+namespace autoware::cuda_pointcloud_preprocessor
+{
+
+// Note: We can not use PCL nor uniform initialization here because of thrust
+struct OutputPointType
+{
+  float x;
+  float y;
+  float z;
+  std::uint8_t intensity;
+  std::uint8_t return_type;
+  std::uint16_t channel;
+};
+
+struct InputPointType
+{
+  float x;
+  float y;
+  float z;
+  std::uint8_t intensity;
+  std::uint8_t return_type;
+  std::uint16_t channel;
+  float azimuth;
+  float elevation;
+  float distance;
+  std::uint32_t time_stamp;
+};
+
+}  // namespace autoware::cuda_pointcloud_preprocessor
+
+#endif  // AUTOWARE__CUDA_POINTCLOUD_PREPROCESSOR__POINT_TYPES_HPP_
diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/launch/cuda_pointcloud_preprocessor.launch.xml b/sensing/autoware_cuda_pointcloud_preprocessor/launch/cuda_pointcloud_preprocessor.launch.xml
new file mode 100644
index 0000000000000..e8843bd804bf0
--- /dev/null
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/launch/cuda_pointcloud_preprocessor.launch.xml
@@ -0,0 +1,16 @@
+<launch>
+  <arg name="input/pointcloud" default="/sensing/lidar/top/pointcloud_raw_ex"/>
+  <arg name="input/imu" default="/sensing/imu/imu_data"/>
+  <arg name="input/twist" default="/sensing/vehicle_velocity_converter/twist_with_covariance"/>
+  <arg name="output/pointcloud" default="/sensing/lidar/top/test"/>
+
+  <arg name="cuda_pointcloud_preprocessor_param_file" default="$(find-pkg-share cuda_pointcloud_preprocessor)/config/cuda_pointcloud_preprocessor.param.yaml"/>
+
+  <node pkg="cuda_pointcloud_preprocessor" exec="cuda_pointcloud_preprocessor_node" name="cuda_pointcloud_preprocessor" output="screen">
+    <remap from="~/input/pointcloud" to="$(var input/pointcloud)"/>
+    <remap from="~/input/imu" to="$(var input/imu)"/>
+    <remap from="~/input/twist" to="$(var input/twist)"/>
+    <remap from="~/output/pointcloud" to="$(var output/pointcloud)"/>
+    <param from="$(var cuda_pointcloud_preprocessor_param_file)"/>
+  </node>
+</launch>
diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/package.xml b/sensing/autoware_cuda_pointcloud_preprocessor/package.xml
new file mode 100644
index 0000000000000..edbd7ff61c874
--- /dev/null
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/package.xml
@@ -0,0 +1,32 @@
+<?xml version="1.0"?>
+<package format="3">
+  <name>autoware_cuda_pointcloud_preprocessor</name>
+  <version>0.1.0</version>
+  <description>autoware_cuda_pointcloud_preprocessor</description>
+  <maintainer email="kenzo.lobos@tier4.jp">Kenzo Lobos-Tsunekawa</maintainer>
+  <author email="kenzo.lobos@tier4.jp">Kenzo Lobos-Tsunekawa</author>
+  <license>Apache License 2.0</license>
+
+  <buildtool_depend>ament_cmake_auto</buildtool_depend>
+  <buildtool_depend>autoware_cmake</buildtool_depend>
+
+  <depend>autoware_point_types</depend>
+  <depend>autoware_universe_utils</depend>
+  <depend>cuda_blackboard</depend>
+  <depend>pcl_conversions</depend>
+  <depend>pcl_ros</depend>
+  <depend>rclcpp</depend>
+  <depend>rclcpp_components</depend>
+  <depend>sensor_msgs</depend>
+  <depend>sophus</depend>
+  <depend>tf2</depend>
+  <depend>tier4_debug_msgs</depend>
+
+  <test_depend>ament_clang_format</test_depend>
+  <test_depend>ament_lint_auto</test_depend>
+  <test_depend>ament_lint_common</test_depend>
+
+  <export>
+    <build_type>ament_cmake</build_type>
+  </export>
+</package>
diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/schema/cuda_pointcloud_preprocessor.schema.json b/sensing/autoware_cuda_pointcloud_preprocessor/schema/cuda_pointcloud_preprocessor.schema.json
new file mode 100644
index 0000000000000..196f455ffe75b
--- /dev/null
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/schema/cuda_pointcloud_preprocessor.schema.json
@@ -0,0 +1,140 @@
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "title": "Parameters for the cuda_pointcloud_preprocessor",
+  "type": "object",
+  "definitions": {
+    "crop_box_filter": {
+      "type": "object",
+      "properties": {
+        "base_frame": {
+          "type": "string",
+          "description": "The undistortion algorithm is based on a base frame, which must be the same as the twist frame.",
+          "default": "base_link"
+        },
+        "use_imu": {
+          "type": "boolean",
+          "description": "Use IMU angular velocity, otherwise, use twist angular velocity.",
+          "default": "true"
+        },
+        "use_3d_distortion_correction": {
+          "type": "boolean",
+          "description": "Use 3d distortion correction algorithm, otherwise, use 2d distortion correction algorithm.",
+          "default": "false"
+        },
+        "distance_ratio": {
+          "type": "number",
+          "description": "distance_ratio",
+          "default": "1.03",
+          "minimum": 0.0
+        },
+        "object_length_threshold": {
+          "type": "number",
+          "description": "object_length_threshold",
+          "default": "0.1",
+          "minimum": 0.0
+        },
+        "num_points_threshold": {
+          "type": "integer",
+          "description": "num_points_threshold",
+          "default": "4",
+          "minimum": 0
+        },
+        "self_crop.min_x": {
+          "type": "number",
+          "description": "minimum x-coordinate value for self cropping in meters",
+          "default": "-1.0"
+        },
+        "self_crop.min_y": {
+          "type": "number",
+          "description": "minimum y-coordinate value for self cropping in meters",
+          "default": "-1.0"
+        },
+        "self_crop.min_z": {
+          "type": "number",
+          "description": "minimum z-coordinate value for self cropping in meters",
+          "default": "-1.0"
+        },
+        "self_crop.max_x": {
+          "type": "number",
+          "description": "maximum x-coordinate value for self cropping in meters",
+          "default": "1.0"
+        },
+        "self_crop.max_y": {
+          "type": "number",
+          "description": "maximum y-coordinate value for self cropping in meters",
+          "default": "1.0"
+        },
+        "self_crop.max_z": {
+          "type": "number",
+          "description": "maximum z-coordinate value for self cropping in meters",
+          "default": "1.0"
+        },
+        "mirror_crop.min_x": {
+          "type": "number",
+          "description": "minimum x-coordinate value for mirror cropping in meters",
+          "default": "-1.0"
+        },
+        "mirror_crop.min_y": {
+          "type": "number",
+          "description": "minimum y-coordinate value for mirror cropping in meters",
+          "default": "-1.0"
+        },
+        "mirror_crop.min_z": {
+          "type": "number",
+          "description": "minimum z-coordinate value for mirror cropping in meters",
+          "default": "-1.0"
+        },
+        "mirror_crop.max_x": {
+          "type": "number",
+          "description": "maximum x-coordinate value for mirror cropping in meters",
+          "default": "1.0"
+        },
+        "mirror_crop.max_y": {
+          "type": "number",
+          "description": "maximum y-coordinate value for mirror cropping in meters",
+          "default": "1.0"
+        },
+        "mirror_crop.max_z": {
+          "type": "number",
+          "description": "maximum z-coordinate value for mirror cropping in meters",
+          "default": "1.0"
+        }
+      },
+      "required": [
+        "base_frame",
+        "use_imu",
+        "use_3d_distortion_correction",
+        "distance_ratio",
+        "object_length_threshold",
+        "num_points_threshold",
+        "self_crop.min_x",
+        "self_crop.min_y",
+        "self_crop.min_z",
+        "self_crop.max_x",
+        "self_crop.max_y",
+        "self_crop.max_z",
+        "mirror_crop.min_x",
+        "mirror_crop.min_y",
+        "mirror_crop.min_z",
+        "mirror_crop.max_x",
+        "mirror_crop.max_y",
+        "mirror_crop.max_z"
+      ],
+      "additionalProperties": false
+    }
+  },
+  "properties": {
+    "/**": {
+      "type": "object",
+      "properties": {
+        "ros__parameters": {
+          "$ref": "#/definitions/cuda_pointcloud_preprocessor"
+        }
+      },
+      "required": ["ros__parameters"],
+      "additionalProperties": false
+    }
+  },
+  "required": ["/**"],
+  "additionalProperties": false
+}
diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.cpp b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.cpp
new file mode 100644
index 0000000000000..bd5717fbbcb5e
--- /dev/null
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.cpp
@@ -0,0 +1,203 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "autoware/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.hpp"
+
+#include <cuda_runtime.h>
+
+#include <vector>
+
+namespace autoware::cuda_organized_pointcloud_adapter
+{
+using sensor_msgs::msg::PointCloud2;
+
+CudaOrganizedPointcloudAdapterNode::CudaOrganizedPointcloudAdapterNode(
+  const rclcpp::NodeOptions & node_options)
+: Node("cuda_organized_pointcloud_adapter", node_options)
+{
+  pub_ =
+    std::make_unique<cuda_blackboard::CudaBlackboardPublisher<cuda_blackboard::CudaPointCloud2>>(
+      *this, "~/output/pointcloud");
+
+  rclcpp::SubscriptionOptions sub_options;
+  sub_options.use_intra_process_comm = rclcpp::IntraProcessSetting::Enable;
+
+  pointcloud_sub_ = this->create_subscription<sensor_msgs::msg::PointCloud2>(
+    "~/input/pointcloud", rclcpp::SensorDataQoS{}.keep_last(1),
+    std::bind(&CudaOrganizedPointcloudAdapterNode::pointcloudCallback, this, std::placeholders::_1),
+    sub_options);
+
+  // initialize debug tool
+  {
+    using autoware::universe_utils::DebugPublisher;
+    using autoware::universe_utils::StopWatch;
+
+    stop_watch_ptr_ = std::make_unique<StopWatch<std::chrono::milliseconds>>();
+    debug_publisher_ = std::make_unique<DebugPublisher>(this, "cuda_organized_pointcloud_adapter");
+    stop_watch_ptr_->tic("processing_time");
+  }
+}
+
+void CudaOrganizedPointcloudAdapterNode::estimatePointcloudRingInfo(
+  const sensor_msgs::msg::PointCloud2::ConstSharedPtr input_pointcloud_msg_ptr)
+{
+  const autoware::point_types::PointXYZIRCAEDT * input_buffer =
+    reinterpret_cast<const autoware::point_types::PointXYZIRCAEDT *>(
+      input_pointcloud_msg_ptr->data.data());
+
+  std::size_t max_ring = 0;
+
+  for (std::size_t i = 0; i < input_pointcloud_msg_ptr->width * input_pointcloud_msg_ptr->height;
+       i++) {
+    const autoware::point_types::PointXYZIRCAEDT & point = input_buffer[i];
+    const std::size_t ring = static_cast<std::size_t>(point.channel);
+    max_ring = std::max(max_ring, ring);
+  }
+
+  // Set max rings to the next power of two
+  num_rings_ = std::pow(2, std::ceil(std::log2(max_ring + 1)));
+  num_rings_ = std::max(num_rings_, static_cast<std::size_t>(16));
+  std::vector<std::size_t> ring_points(num_rings_, 0);
+
+  for (std::size_t i = 0; i < input_pointcloud_msg_ptr->width * input_pointcloud_msg_ptr->height;
+       i++) {
+    const autoware::point_types::PointXYZIRCAEDT & point = input_buffer[i];
+    const std::size_t ring = point.channel;
+    ring_points[ring]++;
+  }
+
+  // Set max points per ring to the next multiple of 512
+  max_points_per_ring_ = *std::max_element(ring_points.begin(), ring_points.end());
+  max_points_per_ring_ = std::max(max_points_per_ring_, static_cast<std::size_t>(512));
+  max_points_per_ring_ = (max_points_per_ring_ + 511) / 512 * 512;
+
+  next_ring_index_.resize(num_rings_);
+  std::fill(next_ring_index_.begin(), next_ring_index_.end(), 0);
+  buffer_.resize(num_rings_ * max_points_per_ring_);
+
+  /* if (device_buffer_ != nullptr) {
+    cudaFree(device_buffer_);
+  } */
+
+  device_buffer_ = cuda_blackboard::make_unique<std::uint8_t[]>(
+    num_rings_ * max_points_per_ring_ * sizeof(autoware::point_types::PointXYZIRCAEDT));
+
+  /* cudaMalloc(
+    device_buffer_,
+    num_rings_ * max_points_per_ring_ * sizeof(autoware::point_types::PointXYZIRCAEDT)); */
+
+  RCLCPP_INFO_STREAM(
+    get_logger(), "Estimated rings: " << num_rings_
+                                      << ", max_points_per_ring: " << max_points_per_ring_
+                                      << ". This should only be done during the first iterations. "
+                                         "Otherwise, performance will be affected.");
+}
+
+bool CudaOrganizedPointcloudAdapterNode::orderPointcloud(
+  const sensor_msgs::msg::PointCloud2::ConstSharedPtr input_pointcloud_msg_ptr)
+{
+  const autoware::point_types::PointXYZIRCAEDT * input_buffer =
+    reinterpret_cast<const autoware::point_types::PointXYZIRCAEDT *>(
+      input_pointcloud_msg_ptr->data.data());
+
+  bool ring_overflow = false;
+  bool point_overflow = false;
+
+  for (std::size_t i = 0; i < input_pointcloud_msg_ptr->width * input_pointcloud_msg_ptr->height;
+       i++) {
+    const autoware::point_types::PointXYZIRCAEDT & point = input_buffer[i];
+    const std::size_t raw_ring = point.channel;
+    const std::size_t ring = raw_ring % num_rings_;
+
+    const std::size_t raw_index = next_ring_index_[ring];
+    const std::size_t index = raw_index % max_points_per_ring_;
+
+    ring_overflow |= raw_ring >= num_rings_;
+    point_overflow |= raw_index >= max_points_per_ring_;
+
+    buffer_[ring * max_points_per_ring_ + index] = point;
+    next_ring_index_[ring] = (index + 1) % max_points_per_ring_;
+  }
+
+  return !ring_overflow && !point_overflow;
+}
+
+void CudaOrganizedPointcloudAdapterNode::pointcloudCallback(
+  const sensor_msgs::msg::PointCloud2::ConstSharedPtr input_pointcloud_msg_ptr)
+{
+  stop_watch_ptr_->toc("processing_time", true);
+
+  // TODO(knzo25): check the pointcloud layout at least once
+
+  assert(input_pointcloud_msg_ptr->point_step == sizeof(autoware::point_types::PointXYZIRCAEDT));
+
+  if (num_rings_ == 0 || max_points_per_ring_ == 0) {
+    estimatePointcloudRingInfo(input_pointcloud_msg_ptr);
+  }
+
+  if (!orderPointcloud(input_pointcloud_msg_ptr)) {
+    estimatePointcloudRingInfo(input_pointcloud_msg_ptr);
+    orderPointcloud(input_pointcloud_msg_ptr);
+  }
+
+  // Copy to cuda memory
+  cudaMemcpy(
+    device_buffer_.get(), buffer_.data(),
+    num_rings_ * max_points_per_ring_ * sizeof(autoware::point_types::PointXYZIRCAEDT),
+    cudaMemcpyHostToDevice);
+
+  auto cuda_pointcloud_msg_ptr = std::make_unique<cuda_blackboard::CudaPointCloud2>();
+  cuda_pointcloud_msg_ptr->width = max_points_per_ring_;
+  cuda_pointcloud_msg_ptr->height = num_rings_;
+  cuda_pointcloud_msg_ptr->point_step = sizeof(autoware::point_types::PointXYZIRCAEDT);
+  cuda_pointcloud_msg_ptr->row_step =
+    max_points_per_ring_ * sizeof(autoware::point_types::PointXYZIRCAEDT);
+  cuda_pointcloud_msg_ptr->data =
+    std::move(device_buffer_); /*reinterpret_cast<uint8_t *>(device_buffer_)*/
+  ;
+  cuda_pointcloud_msg_ptr->is_dense = input_pointcloud_msg_ptr->is_dense;
+  cuda_pointcloud_msg_ptr->header = input_pointcloud_msg_ptr->header;
+
+  pub_->publish(std::move(cuda_pointcloud_msg_ptr));
+
+  if (debug_publisher_) {
+    const double processing_time_ms = stop_watch_ptr_->toc("processing_time", true);
+    debug_publisher_->publish<tier4_debug_msgs::msg::Float64Stamped>(
+      "debug/processing_time_ms", processing_time_ms);
+
+    double now_stamp_seconds = rclcpp::Time(this->get_clock()->now()).seconds();
+    double cloud_stamp_seconds = rclcpp::Time(input_pointcloud_msg_ptr->header.stamp).seconds();
+
+    debug_publisher_->publish<tier4_debug_msgs::msg::Float64Stamped>(
+      "debug/latency_ms", 1000.f * (now_stamp_seconds - cloud_stamp_seconds));
+  }
+
+  // Allocate cuda memory
+  device_buffer_ = cuda_blackboard::make_unique<std::uint8_t[]>(
+    num_rings_ * max_points_per_ring_ * sizeof(autoware::point_types::PointXYZIRCAEDT));
+  /* cudaMalloc(
+    &device_buffer_,
+    num_rings_ * max_points_per_ring_ * sizeof(autoware::point_types::PointXYZIRCAEDT)); */
+  // Clear indexes
+  std::fill(next_ring_index_.begin(), next_ring_index_.end(), 0);
+
+  // Clear pointcloud buffer
+  std::fill(buffer_.begin(), buffer_.end(), autoware::point_types::PointXYZIRCAEDT{});
+}
+
+}  // namespace autoware::cuda_organized_pointcloud_adapter
+
+#include "rclcpp_components/register_node_macro.hpp"
+RCLCPP_COMPONENTS_REGISTER_NODE(
+  autoware::cuda_organized_pointcloud_adapter::CudaOrganizedPointcloudAdapterNode)
diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu
new file mode 100644
index 0000000000000..ad8910aef5b76
--- /dev/null
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu
@@ -0,0 +1,692 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp"
+#include "autoware/cuda_pointcloud_preprocessor/point_types.hpp"
+
+#include <Eigen/Core>
+#include <Eigen/Dense>
+#include <Eigen/Geometry>
+
+#include <cuda_runtime.h>
+#include <tf2/utils.h>
+#include <thrust/execution_policy.h>
+#include <thrust/host_vector.h>
+#include <thrust/reduce.h>
+#include <thrust/scan.h>
+
+namespace autoware::cuda_pointcloud_preprocessor
+{
+
+__host__ __device__ Eigen::Matrix3f skewSymmetric(const Eigen::Vector3f & v)
+{
+  Eigen::Matrix3f m;
+  m << 0, -v.z(), v.y(), v.z(), 0, -v.x(), -v.y(), v.x(), 0;
+  return m;
+}
+
+__host__ __device__ Eigen::Matrix3f leftJacobianSO3(const Eigen::Vector3f & omega)
+{
+  double theta = omega.norm();
+  if (std::abs(theta) < 1e-6) {
+    return Eigen::Matrix3f::Identity();
+  }
+
+  Eigen::Matrix3f Omega = skewSymmetric(omega);
+
+  Eigen::Matrix3f Omega2 = Omega * Omega;
+  double theta2 = theta * theta;
+  double theta3 = theta2 * theta;
+
+  // Rodrigues' formula for Jacobian
+  return Eigen::Matrix3f::Identity() + ((1 - cos(theta)) / theta2) * Omega +
+         ((theta - sin(theta)) / theta3) * Omega2;
+}
+
+__host__ __device__ Eigen::Matrix4f transformationMatrixFromVelocity(
+  const Eigen::Vector3f & linear_velocity, const Eigen::Vector3f & angular_velocity, double dt)
+{
+  Eigen::Matrix3f R = Eigen::AngleAxisf(angular_velocity.norm() * dt, angular_velocity.normalized())
+                        .toRotationMatrix();
+  Eigen::Matrix3f J = leftJacobianSO3(angular_velocity * dt);
+
+  Eigen::Vector3f translation = J * (linear_velocity * dt);
+
+  Eigen::Matrix4f transformation = Eigen::Matrix4f::Identity();
+  transformation.block<3, 3>(0, 0) = R;
+  transformation.block<3, 1>(0, 3) = translation;
+
+  return transformation;
+}
+
+__global__ void transformPointsKernel(
+  const InputPointType * input_points, InputPointType * output_points, int num_points,
+  TransformStruct transform)
+{
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < num_points) {
+    output_points[idx] = input_points[idx];
+
+    const float x = input_points[idx].x;
+    const float y = input_points[idx].y;
+    const float z = input_points[idx].z;
+
+    output_points[idx].x = transform.m11 * x + transform.m12 * y + transform.m13 * z + transform.x;
+    output_points[idx].y = transform.m21 * x + transform.m22 * y + transform.m23 * z + transform.y;
+    output_points[idx].z = transform.m31 * x + transform.m32 * y + transform.m33 * z + transform.z;
+  }
+}
+
+__global__ void cropBoxKernel(
+  InputPointType * d_points, uint32_t * output_mask, int num_points, float min_x, float min_y,
+  float min_z, float max_x, float max_y, float max_z)
+{
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < num_points) {
+    const float x = d_points[idx].x;
+    const float y = d_points[idx].y;
+    const float z = d_points[idx].z;
+
+    if (x > min_x && x < max_x && y > min_y && y < max_y && z > min_z && z < max_z) {
+      output_mask[idx] = 0;
+    } else {
+      output_mask[idx] = 1;
+    }
+  }
+}
+
+__global__ void combineMasksKernel(
+  uint32_t * mask1, uint32_t * mask2, uint32_t * mask3, int num_points, uint32_t * output_mask)
+{
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < num_points) {
+    output_mask[idx] = mask1[idx] & mask2[idx] & mask3[idx];
+  }
+}
+
+__global__ void extractInputPointIndicesKernel(
+  InputPointType * input_points, uint32_t * masks, uint32_t * indices, int num_points,
+  InputPointType * output_points)
+{
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < num_points && masks[idx] == 1) {
+    output_points[indices[idx] - 1] = input_points[idx];
+  }
+}
+
+__global__ void extractOutputPointIndicesKernel(
+  OutputPointType * input_points, uint32_t * masks, uint32_t * indices, int num_points,
+  OutputPointType * output_points)
+{
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < num_points && masks[idx] == 1) {
+    output_points[indices[idx] - 1] = input_points[idx];
+  }
+}
+
+__global__ void extractInputPointsToOutputPoints_indicesKernel(
+  InputPointType * input_points, uint32_t * masks, uint32_t * indices, int num_points,
+  OutputPointType * output_points)
+{
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < num_points && masks[idx] == 1) {
+    InputPointType & input_point = input_points[idx];
+    OutputPointType & output_point = output_points[indices[idx] - 1];
+    output_point.x = input_point.x;
+    output_point.y = input_point.y;
+    output_point.z = input_point.z;
+    output_point.intensity = input_point.intensity;
+    output_point.return_type = input_point.return_type;
+    output_point.channel = input_point.channel;
+  }
+}
+
+__global__ void undistort2dKernel(
+  InputPointType * input_points, int num_points, TwistStruct2D * twist_structs, int num_twists)
+{
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < num_points) {
+    InputPointType & point = input_points[idx];
+
+    // The twist must always be newer than the point ! (or it was the last twist)
+    int twist_index = 0;
+    while (twist_index < num_twists && twist_structs[twist_index].stamp_nsec < point.time_stamp) {
+      twist_index++;
+    }
+
+    twist_index = min(twist_index, num_twists - 1);
+
+    TwistStruct2D twist = twist_structs[twist_index];
+    float x = twist.cum_x;
+    float y = twist.cum_y;
+    float theta = twist.cum_theta;
+
+    double dt_nsec =
+      point.time_stamp > twist.last_stamp_nsec ? point.time_stamp - twist.last_stamp_nsec : 0;
+    double dt = 1e-9 * (dt_nsec);
+
+    theta += twist.vtheta * dt;
+    float d = twist.vx * dt;
+    x += d * cos(theta);
+    y += d * sin(theta);
+
+    float distorted_x = point.x;
+    float distorted_y = point.y;
+
+    point.x = distorted_x * cos(theta) - distorted_y * sin(theta) + x;
+    point.y = distorted_x * sin(theta) + distorted_y * cos(theta) + y;
+  }
+}
+
+__global__ void undistort3dKernel(
+  InputPointType * input_points, int num_points, TwistStruct3D * twist_structs, int num_twists)
+{
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < num_points) {
+    InputPointType & point = input_points[idx];
+
+    // The twist must always be newer than the point ! (or it was the last twist)
+    int twist_index = 0;
+    while (twist_index < num_twists && twist_structs[twist_index].stamp_nsec < point.time_stamp) {
+      twist_index++;
+    }
+
+    twist_index = min(twist_index, num_twists - 1);
+
+    TwistStruct3D twist = twist_structs[twist_index];
+    Eigen::Map<Eigen::Matrix4f> cum_transform_buffer_map(twist.cum_transform_buffer);
+    Eigen::Map<Eigen::Vector3f> v_map(twist.v);
+    Eigen::Map<Eigen::Vector3f> w_map(twist.w);
+
+    double dt_nsec =
+      point.time_stamp > twist.last_stamp_nsec ? point.time_stamp - twist.last_stamp_nsec : 0;
+    double dt = 1e-9 * (dt_nsec);
+
+    Eigen::Matrix4f transform =
+      cum_transform_buffer_map * transformationMatrixFromVelocity(v_map, w_map, dt);
+    Eigen::Vector3f p(point.x, point.y, point.z);
+    p = transform.block<3, 3>(0, 0) * p + transform.block<3, 1>(0, 3);
+
+    point.x = p.x();
+    point.y = p.y();
+    point.z = p.z();
+  }
+}
+
+__global__ void ringOutlierFilterKernel(
+  const InputPointType * d_points, uint32_t * output_mask, int num_rings, int max_points_per_ring,
+  float distance_ratio, float object_length_threshold_squared, int num_points_threshold)
+{
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  int j = idx / max_points_per_ring;
+  int i = idx % max_points_per_ring;
+
+  if (j >= num_rings || i >= max_points_per_ring) {
+    return;
+  }
+
+  int min_i = max(i - num_points_threshold, 0);
+  int max_i = min(i + num_points_threshold, max_points_per_ring);
+
+  int walk_size = 1;
+  int left_idx = min_i;
+  int right_idx = min_i + 1;
+
+  for (int k = min_i; k < max_i - 1; k++) {
+    const InputPointType & left_point = d_points[j * max_points_per_ring + k];
+    const InputPointType & right_point = d_points[j * max_points_per_ring + k + 1];
+
+    // Find biggest walk that passes through i
+    float azimuth_diff = right_point.azimuth - left_point.azimuth;
+    azimuth_diff = azimuth_diff < 0.f ? azimuth_diff + 2 * M_PI : azimuth_diff;
+
+    if (
+      max(left_point.distance, right_point.distance) <
+        min(left_point.distance, right_point.distance) * distance_ratio &&
+      azimuth_diff < 1.f * M_PI / 180.f) {
+      // Determined to be included in the same walk
+      walk_size++;
+      right_idx++;
+    } else if (k >= i) {
+      break;
+    } else {
+      walk_size = 1;
+      left_idx = k + 1;
+      right_idx = k + 2;  // this is safe since we break if k >= i
+    }
+  }
+
+  const InputPointType & left_point = d_points[j * max_points_per_ring + left_idx];
+  const InputPointType & right_point = d_points[j * max_points_per_ring + right_idx - 1];
+  const float x = left_point.x - right_point.x;
+  const float y = left_point.y - right_point.y;
+  const float z = left_point.z - right_point.z;
+
+  output_mask[j * max_points_per_ring + i] =
+    ((walk_size > num_points_threshold) ||
+     (x * x + y * y + z * z >= object_length_threshold_squared))
+      ? 1
+      : 0;
+}
+
+__global__ void transformPointTypeKernel(
+  const InputPointType * device_input_points, int num_points, OutputPointType * device_ouput_points)
+{
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < num_points) {
+    const InputPointType & input_point = device_input_points[idx];
+    OutputPointType & output_point = device_ouput_points[idx];
+
+    output_point.x = input_point.x;
+    output_point.y = input_point.y;
+    output_point.z = input_point.z;
+    output_point.intensity = (float)input_point.intensity;
+  }
+}
+
+CudaPointcloudPreprocessor::CudaPointcloudPreprocessor()
+{
+  sensor_msgs::msg::PointField x_field, y_field, z_field, intensity_field, return_type_field,
+    channel_field, azimuth_field, elevation_field, distance_field, time_stamp_field;
+  x_field.name = "x";
+  x_field.offset = 0;
+  x_field.datatype = sensor_msgs::msg::PointField::FLOAT32;
+  x_field.count = 1;
+
+  y_field.name = "y";
+  y_field.offset = 4;
+  y_field.datatype = sensor_msgs::msg::PointField::FLOAT32;
+  y_field.count = 1;
+
+  z_field.name = "z";
+  z_field.offset = 8;
+  z_field.datatype = sensor_msgs::msg::PointField::FLOAT32;
+  z_field.count = 1;
+
+  intensity_field.name = "intensity";
+  intensity_field.offset = 12;
+  intensity_field.datatype = sensor_msgs::msg::PointField::UINT8;
+  intensity_field.count = 1;
+
+  return_type_field.name = "return_type";
+  return_type_field.offset = 13;
+  return_type_field.datatype = sensor_msgs::msg::PointField::UINT8;
+  return_type_field.count = 1;
+
+  channel_field.name = "channel";
+  channel_field.offset = 14;
+  channel_field.datatype = sensor_msgs::msg::PointField::UINT16;
+  channel_field.count = 1;
+
+  static_assert(sizeof(OutputPointType) == 16, "OutputPointType size is not 16 bytes");
+  static_assert(offsetof(OutputPointType, x) == 0);
+  static_assert(offsetof(OutputPointType, y) == 4);
+  static_assert(offsetof(OutputPointType, z) == 8);
+  static_assert(offsetof(OutputPointType, intensity) == 12);
+  static_assert(offsetof(OutputPointType, return_type) == 13);
+  static_assert(offsetof(OutputPointType, channel) == 14);
+
+  point_fields_.push_back(x_field);
+  point_fields_.push_back(y_field);
+  point_fields_.push_back(z_field);
+  point_fields_.push_back(intensity_field);
+  point_fields_.push_back(return_type_field);
+  point_fields_.push_back(channel_field);
+}
+
+void CudaPointcloudPreprocessor::setCropBoxParameters(
+  const CropBoxParameters & self_crop_box_parameters,
+  const CropBoxParameters & mirror_crop_box_parameters)
+{
+  self_crop_box_parameters_ = self_crop_box_parameters;
+  mirror_crop_box_parameters_ = mirror_crop_box_parameters;
+}
+
+void CudaPointcloudPreprocessor::setRingOutlierFilterParameters(
+  const RingOutlierFilterParameters & ring_outlier_parameters)
+{
+  ring_outlier_parameters_ = ring_outlier_parameters;
+}
+
+void CudaPointcloudPreprocessor::set3DUndistortion(bool use_3d_undistortion)
+{
+  use_3d_undistortion_ = use_3d_undistortion;
+}
+
+void CudaPointcloudPreprocessor::preallocateOutput()
+{
+  output_pointcloud_ptr_ = std::make_unique<cuda_blackboard::CudaPointCloud2>();
+  output_pointcloud_ptr_->data = cuda_blackboard::make_unique<std::uint8_t[]>(
+    max_rings_ * max_points_per_ring_ * sizeof(OutputPointType));
+}
+
+void CudaPointcloudPreprocessor::setupTwist2DStructs(
+  const cuda_blackboard::CudaPointCloud2 & input_pointcloud_msg,
+  const std::deque<geometry_msgs::msg::TwistWithCovarianceStamped> & twist_queue,
+  const std::deque<geometry_msgs::msg::Vector3Stamped> & angular_velocity_queue)
+{
+  const InputPointType * device_input_points =
+    reinterpret_cast<const InputPointType *>(input_pointcloud_msg.data.get());
+  InputPointType first_point;
+  cudaMemcpy(&first_point, &device_input_points[0], sizeof(InputPointType), cudaMemcpyDeviceToHost);
+
+  // Twist preprocessing
+
+  uint64_t pointcloud_stamp_nsec = 1'000'000'000 * input_pointcloud_msg.header.stamp.sec +
+                                   input_pointcloud_msg.header.stamp.nanosec;
+
+  thrust::host_vector<TwistStruct2D> host_twist_structs;
+
+  float cum_x = 0;
+  float cum_y = 0;
+  float cum_theta = 0;
+  // All time stamps from now on are in nsec from the "beginning of the pointcloud"
+  uint32_t last_stamp_nsec = first_point.time_stamp;
+
+  std::size_t twist_index = 0;
+  std::size_t angular_velocity_index = 0;
+
+  for (; twist_index < twist_queue.size() ||
+         angular_velocity_index < angular_velocity_queue.size();) {
+    uint64_t twist_stamp, input_twist_global_stamp_nsec, angular_velocity_global_stamp_nsec;
+    float vx, vtheta;
+
+    if (twist_index < twist_queue.size()) {
+      input_twist_global_stamp_nsec =
+        1'000'000'000 * static_cast<uint64_t>(twist_queue[twist_index].header.stamp.sec) +
+        static_cast<uint64_t>(twist_queue[twist_index].header.stamp.nanosec);
+      vx = twist_queue[twist_index].twist.twist.linear.x;
+    } else {
+      input_twist_global_stamp_nsec = std::numeric_limits<uint64_t>::max();
+      vx = 0.0;
+    }
+
+    if (angular_velocity_index < angular_velocity_queue.size()) {
+      angular_velocity_global_stamp_nsec =
+        1'000'000'000 *
+          static_cast<uint64_t>(angular_velocity_queue[angular_velocity_index].header.stamp.sec) +
+        static_cast<uint64_t>(angular_velocity_queue[angular_velocity_index].header.stamp.nanosec);
+      vtheta = angular_velocity_queue[angular_velocity_index].vector.z;
+    } else {
+      angular_velocity_global_stamp_nsec = std::numeric_limits<uint64_t>::max();
+      vtheta = 0.0;
+    }
+
+    if (input_twist_global_stamp_nsec < angular_velocity_global_stamp_nsec) {
+      twist_stamp = input_twist_global_stamp_nsec;
+      twist_index++;
+    } else if (input_twist_global_stamp_nsec > angular_velocity_global_stamp_nsec) {
+      twist_stamp = angular_velocity_global_stamp_nsec;
+      angular_velocity_index++;
+    } else {
+      twist_index++;
+      angular_velocity_index++;
+    }
+
+    TwistStruct2D twist;
+    twist.cum_x = cum_x;
+    twist.cum_y = cum_y;
+    twist.cum_theta = cum_theta;
+
+    uint64_t twist_global_stamp_nsec = twist_stamp;
+    assert(twist_global_stamp_nsec > pointcloud_stamp_nsec);  // by construction
+    uint32_t twist_from_pointcloud_start_nsec = twist_global_stamp_nsec - pointcloud_stamp_nsec;
+
+    twist.stamp_nsec = twist_from_pointcloud_start_nsec;
+    twist.vx = vx;
+    twist.vtheta = vtheta;
+    twist.last_stamp_nsec = last_stamp_nsec;
+    host_twist_structs.push_back(twist);
+
+    double dt_seconds = 1e-9 * (twist.stamp_nsec - last_stamp_nsec);
+    last_stamp_nsec = twist.stamp_nsec;
+    cum_theta += vtheta * dt_seconds;
+    float d = twist.vx * dt_seconds;
+    cum_x += d * cos(cum_theta);
+    cum_y += d * sin(cum_theta);
+  }
+
+  // Copy to device
+  device_twist_2d_structs_ = host_twist_structs;
+}
+
+void CudaPointcloudPreprocessor::setupTwist3DStructs(
+  const cuda_blackboard::CudaPointCloud2 & input_pointcloud_msg,
+  const std::deque<geometry_msgs::msg::TwistWithCovarianceStamped> & twist_queue,
+  const std::deque<geometry_msgs::msg::Vector3Stamped> & angular_velocity_queue)
+{
+  const InputPointType * device_input_points =
+    reinterpret_cast<const InputPointType *>(input_pointcloud_msg.data.get());
+  InputPointType first_point;
+  cudaMemcpy(&first_point, &device_input_points[0], sizeof(InputPointType), cudaMemcpyDeviceToHost);
+
+  // Twist preprocessing
+
+  uint64_t pointcloud_stamp_nsec = 1'000'000'000 * input_pointcloud_msg.header.stamp.sec +
+                                   input_pointcloud_msg.header.stamp.nanosec;
+
+  thrust::host_vector<TwistStruct3D> host_twist_structs;
+
+  Eigen::Matrix4f cum_transform = Eigen::Matrix4f::Identity();
+  Eigen::Vector3f v = Eigen::Vector3f::Zero();
+  Eigen::Vector3f w = Eigen::Vector3f::Zero();
+
+  // All time stamps from now on are in nsec from the "beginning of the pointcloud"
+  uint32_t last_stamp_nsec = first_point.time_stamp;
+
+  std::size_t twist_index = 0;
+  std::size_t angular_velocity_index = 0;
+
+  for (; twist_index < twist_queue.size() ||
+         angular_velocity_index < angular_velocity_queue.size();) {
+    uint64_t twist_stamp, input_twist_global_stamp_nsec, angular_velocity_global_stamp_nsec;
+
+    if (twist_index < twist_queue.size()) {
+      input_twist_global_stamp_nsec =
+        1'000'000'000 * static_cast<uint64_t>(twist_queue[twist_index].header.stamp.sec) +
+        static_cast<uint64_t>(twist_queue[twist_index].header.stamp.nanosec);
+      v.x() = twist_queue[twist_index].twist.twist.linear.x;
+      v.y() = twist_queue[twist_index].twist.twist.linear.y;
+      v.z() = twist_queue[twist_index].twist.twist.linear.z;
+    } else {
+      input_twist_global_stamp_nsec = std::numeric_limits<uint64_t>::max();
+      v = Eigen::Vector3f::Zero();
+    }
+
+    if (angular_velocity_index < angular_velocity_queue.size()) {
+      angular_velocity_global_stamp_nsec =
+        1'000'000'000 *
+          static_cast<uint64_t>(angular_velocity_queue[angular_velocity_index].header.stamp.sec) +
+        static_cast<uint64_t>(angular_velocity_queue[angular_velocity_index].header.stamp.nanosec);
+      w.x() = angular_velocity_queue[angular_velocity_index].vector.x;
+      w.y() = angular_velocity_queue[angular_velocity_index].vector.y;
+      w.z() = angular_velocity_queue[angular_velocity_index].vector.z;
+    } else {
+      angular_velocity_global_stamp_nsec = std::numeric_limits<uint64_t>::max();
+      w = Eigen::Vector3f::Zero();
+    }
+
+    if (input_twist_global_stamp_nsec < angular_velocity_global_stamp_nsec) {
+      twist_stamp = input_twist_global_stamp_nsec;
+      twist_index++;
+    } else if (input_twist_global_stamp_nsec > angular_velocity_global_stamp_nsec) {
+      twist_stamp = angular_velocity_global_stamp_nsec;
+      angular_velocity_index++;
+    } else {
+      twist_index++;
+      angular_velocity_index++;
+    }
+
+    TwistStruct3D twist;
+
+    Eigen::Map<Eigen::Matrix4f> cum_transform_buffer_map(twist.cum_transform_buffer);
+    Eigen::Map<Eigen::Vector3f> v_map(twist.v);
+    Eigen::Map<Eigen::Vector3f> w_map(twist.w);
+    cum_transform_buffer_map = cum_transform;
+
+    uint64_t twist_global_stamp_nsec = twist_stamp;
+    assert(twist_global_stamp_nsec > pointcloud_stamp_nsec);  // by construction
+    uint32_t twist_from_pointcloud_start_nsec = twist_global_stamp_nsec - pointcloud_stamp_nsec;
+
+    twist.stamp_nsec = twist_from_pointcloud_start_nsec;
+    v_map = v;
+    w_map = w;
+    twist.last_stamp_nsec = last_stamp_nsec;
+    host_twist_structs.push_back(twist);
+
+    double dt_seconds = 1e-9 * (twist.stamp_nsec - last_stamp_nsec);
+    last_stamp_nsec = twist.stamp_nsec;
+
+    auto delta_transform = transformationMatrixFromVelocity(v, w, dt_seconds);
+    cum_transform = cum_transform * delta_transform;
+  }
+
+  // Copy to device
+  device_twist_3d_structs_ = host_twist_structs;
+}
+
+std::unique_ptr<cuda_blackboard::CudaPointCloud2> CudaPointcloudPreprocessor::process(
+  const cuda_blackboard::CudaPointCloud2 & input_pointcloud_msg,
+  const geometry_msgs::msg::TransformStamped & transform_msg,
+  const std::deque<geometry_msgs::msg::TwistWithCovarianceStamped> & twist_queue,
+  const std::deque<geometry_msgs::msg::Vector3Stamped> & angular_velocity_queue)
+{
+  auto frame_id = input_pointcloud_msg.header.frame_id;
+
+  auto num_input_points = input_pointcloud_msg.width * input_pointcloud_msg.height;
+  if (
+    input_pointcloud_msg.width * input_pointcloud_msg.height > max_rings_ * max_points_per_ring_) {
+    max_rings_ = input_pointcloud_msg.height;
+    max_points_per_ring_ = input_pointcloud_msg.width;
+
+    device_transformed_points_.resize(num_input_points);
+    device_self_crop_mask_.resize(num_input_points);
+    device_mirror_crop_mask_.resize(num_input_points);
+    device_ring_outlier_mask_.resize(num_input_points);
+    device_indices_.resize(num_input_points);
+
+    preallocateOutput();
+  }
+
+  tf2::Quaternion rotation_quaternion(
+    transform_msg.transform.rotation.x, transform_msg.transform.rotation.y,
+    transform_msg.transform.rotation.z, transform_msg.transform.rotation.w);
+  tf2::Matrix3x3 rotation_matrix;
+  rotation_matrix.setRotation(rotation_quaternion);
+
+  TransformStruct transform_struct;
+  transform_struct.x = static_cast<float>(transform_msg.transform.translation.x);
+  transform_struct.y = static_cast<float>(transform_msg.transform.translation.y);
+  transform_struct.z = static_cast<float>(transform_msg.transform.translation.z);
+  transform_struct.m11 = static_cast<float>(rotation_matrix[0][0]);
+  transform_struct.m12 = static_cast<float>(rotation_matrix[0][1]);
+  transform_struct.m13 = static_cast<float>(rotation_matrix[0][2]);
+  transform_struct.m21 = static_cast<float>(rotation_matrix[1][0]);
+  transform_struct.m22 = static_cast<float>(rotation_matrix[1][1]);
+  transform_struct.m23 = static_cast<float>(rotation_matrix[1][2]);
+  transform_struct.m31 = static_cast<float>(rotation_matrix[2][0]);
+  transform_struct.m32 = static_cast<float>(rotation_matrix[2][1]);
+  transform_struct.m33 = static_cast<float>(rotation_matrix[2][2]);
+
+  // Twist preprocessing
+  const InputPointType * device_input_points =
+    reinterpret_cast<const InputPointType *>(input_pointcloud_msg.data.get());
+
+  if (use_3d_undistortion_) {
+    setupTwist3DStructs(input_pointcloud_msg, twist_queue, angular_velocity_queue);
+  } else {
+    setupTwist2DStructs(input_pointcloud_msg, twist_queue, angular_velocity_queue);
+  }
+
+  // Obtain raw pointers for the kernels
+  TwistStruct2D * device_twist_2d_structs =
+    thrust::raw_pointer_cast(device_twist_2d_structs_.data());
+  TwistStruct3D * device_twist_3d_structs =
+    thrust::raw_pointer_cast(device_twist_3d_structs_.data());
+  InputPointType * device_transformed_points =
+    thrust::raw_pointer_cast(device_transformed_points_.data());
+  uint32_t * device_self_crop_mask = thrust::raw_pointer_cast(device_self_crop_mask_.data());
+  uint32_t * device_mirror_crop_mask = thrust::raw_pointer_cast(device_mirror_crop_mask_.data());
+  uint32_t * device_ring_outlier_mask = thrust::raw_pointer_cast(device_ring_outlier_mask_.data());
+  uint32_t * device_indices = thrust::raw_pointer_cast(device_indices_.data());
+
+  const int threadsPerBlock = 256;
+  const int blocksPerGrid = (num_input_points + threadsPerBlock - 1) / threadsPerBlock;
+
+  transformPointsKernel<<<blocksPerGrid, threadsPerBlock>>>(
+    device_input_points, device_transformed_points, num_input_points, transform_struct);
+
+  cropBoxKernel<<<blocksPerGrid, threadsPerBlock>>>(
+    device_transformed_points, device_self_crop_mask, num_input_points,
+    self_crop_box_parameters_.min_x, self_crop_box_parameters_.min_y,
+    self_crop_box_parameters_.min_z, self_crop_box_parameters_.max_x,
+    self_crop_box_parameters_.max_y, self_crop_box_parameters_.max_z);
+
+  cropBoxKernel<<<blocksPerGrid, threadsPerBlock>>>(
+    device_transformed_points, device_mirror_crop_mask, num_input_points,
+    mirror_crop_box_parameters_.min_x, mirror_crop_box_parameters_.min_y,
+    mirror_crop_box_parameters_.min_z, mirror_crop_box_parameters_.max_x,
+    mirror_crop_box_parameters_.max_y, mirror_crop_box_parameters_.max_z);
+
+  if (use_3d_undistortion_ && device_twist_3d_structs_.size() > 0) {
+    undistort3dKernel<<<blocksPerGrid, threadsPerBlock>>>(
+      device_transformed_points, num_input_points, device_twist_3d_structs,
+      device_twist_3d_structs_.size());
+  } else if (!use_3d_undistortion_ && device_twist_2d_structs_.size() > 0) {
+    undistort2dKernel<<<blocksPerGrid, threadsPerBlock>>>(
+      device_transformed_points, num_input_points, device_twist_2d_structs,
+      device_twist_2d_structs_.size());
+  }
+
+  ringOutlierFilterKernel<<<blocksPerGrid, threadsPerBlock>>>(
+    device_transformed_points, device_ring_outlier_mask, max_rings_, max_points_per_ring_,
+    ring_outlier_parameters_.distance_ratio,
+    ring_outlier_parameters_.object_length_threshold *
+      ring_outlier_parameters_.object_length_threshold,
+    ring_outlier_parameters_.num_points_threshold);
+
+  combineMasksKernel<<<blocksPerGrid, threadsPerBlock>>>(
+    device_self_crop_mask, device_mirror_crop_mask, device_ring_outlier_mask, num_input_points,
+    device_ring_outlier_mask);
+
+  thrust::inclusive_scan(
+    thrust::device, device_ring_outlier_mask, device_ring_outlier_mask + num_input_points,
+    device_indices);
+
+  uint32_t num_output_points;
+  cudaMemcpy(
+    &num_output_points, device_indices + num_input_points - 1, sizeof(uint32_t),
+    cudaMemcpyDeviceToHost);
+
+  if (num_output_points > 0) {
+    extractInputPointsToOutputPoints_indicesKernel<<<blocksPerGrid, threadsPerBlock>>>(
+      device_transformed_points, device_ring_outlier_mask, device_indices, num_input_points,
+      reinterpret_cast<OutputPointType *>(output_pointcloud_ptr_->data.get()));
+  }
+
+  // Copy the transformed points back
+  output_pointcloud_ptr_->row_step = num_output_points * sizeof(OutputPointType);
+  output_pointcloud_ptr_->width = num_output_points;
+  output_pointcloud_ptr_->height = 1;
+
+  output_pointcloud_ptr_->fields = point_fields_;
+  output_pointcloud_ptr_->is_dense = true;
+  output_pointcloud_ptr_->is_bigendian = input_pointcloud_msg.is_bigendian;
+  output_pointcloud_ptr_->point_step = sizeof(OutputPointType);
+  output_pointcloud_ptr_->header.stamp = input_pointcloud_msg.header.stamp;
+
+  return std::move(output_pointcloud_ptr_);
+}
+
+}  // namespace autoware::cuda_pointcloud_preprocessor
diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp
new file mode 100644
index 0000000000000..fdfae020f1e19
--- /dev/null
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp
@@ -0,0 +1,253 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp"
+
+#include "autoware/cuda_pointcloud_preprocessor/point_types.hpp"
+
+#include <autoware/point_types/types.hpp>
+
+#include <geometry_msgs/msg/transform_stamped.hpp>
+#include <tf2_geometry_msgs/tf2_geometry_msgs.hpp>
+
+#include <cuda_runtime.h>
+#include <pcl/pcl_base.h>
+#include <pcl/point_cloud.h>
+#include <pcl/point_types.h>
+
+#include <chrono>
+#include <string>
+#include <vector>
+
+namespace autoware::cuda_pointcloud_preprocessor
+{
+using sensor_msgs::msg::PointCloud2;
+
+CudaPointcloudPreprocessorNode::CudaPointcloudPreprocessorNode(
+  const rclcpp::NodeOptions & node_options)
+: Node("cuda_pointcloud_preprocessor", node_options),
+  tf2_buffer_(this->get_clock()),
+  tf2_listener_(tf2_buffer_)
+{
+  using std::placeholders::_1;
+
+  // Parameters
+  base_frame_ = declare_parameter<std::string>("base_frame");
+
+  RingOutlierFilterParameters ring_outlier_filter_parameters;
+  ring_outlier_filter_parameters.distance_ratio = declare_parameter<float>("distance_ratio");
+  ring_outlier_filter_parameters.object_length_threshold =
+    declare_parameter<float>("object_length_threshold");
+  ring_outlier_filter_parameters.num_points_threshold =
+    declare_parameter<int>("num_points_threshold");
+
+  CropBoxParameters self_crop_box_parameters, mirror_crop_box_parameters;
+  self_crop_box_parameters.min_x = declare_parameter<float>("self_crop.min_x");
+  self_crop_box_parameters.min_y = declare_parameter<float>("self_crop.min_y");
+  self_crop_box_parameters.min_z = declare_parameter<float>("self_crop.min_z");
+  self_crop_box_parameters.max_x = declare_parameter<float>("self_crop.max_x");
+  self_crop_box_parameters.max_y = declare_parameter<float>("self_crop.max_y");
+  self_crop_box_parameters.max_z = declare_parameter<float>("self_crop.max_z");
+
+  mirror_crop_box_parameters.min_x = declare_parameter<float>("mirror_crop.min_x");
+  mirror_crop_box_parameters.min_y = declare_parameter<float>("mirror_crop.min_y");
+  mirror_crop_box_parameters.min_z = declare_parameter<float>("mirror_crop.min_z");
+  mirror_crop_box_parameters.max_x = declare_parameter<float>("mirror_crop.max_x");
+  mirror_crop_box_parameters.max_y = declare_parameter<float>("mirror_crop.max_y");
+  mirror_crop_box_parameters.max_z = declare_parameter<float>("mirror_crop.max_z");
+
+  bool use_3d_undistortion = declare_parameter<bool>("use_3d_distortion_correction");
+  bool use_imu = declare_parameter<bool>("use_imu");
+
+  // Subscriber
+  sub_ =
+    std::make_unique<cuda_blackboard::CudaBlackboardSubscriber<cuda_blackboard::CudaPointCloud2>>(
+      *this, "~/input/pointcloud", false,
+      std::bind(&CudaPointcloudPreprocessorNode::pointcloudCallback, this, _1));
+
+  // Publisher
+  pub_ =
+    std::make_unique<cuda_blackboard::CudaBlackboardPublisher<cuda_blackboard::CudaPointCloud2>>(
+      *this, "~/output/pointcloud");
+  twist_sub_ = this->create_subscription<geometry_msgs::msg::TwistWithCovarianceStamped>(
+    "~/input/twist", 10,
+    std::bind(&CudaPointcloudPreprocessorNode::twistCallback, this, std::placeholders::_1));
+
+  if (use_imu) {
+    imu_sub_ = this->create_subscription<sensor_msgs::msg::Imu>(
+      "~/input/imu", 10,
+      std::bind(&CudaPointcloudPreprocessorNode::imuCallback, this, std::placeholders::_1));
+  }
+
+  cuda_pointcloud_preprocessor_ = std::make_unique<CudaPointcloudPreprocessor>();
+  cuda_pointcloud_preprocessor_->setRingOutlierFilterParameters(ring_outlier_filter_parameters);
+  cuda_pointcloud_preprocessor_->setCropBoxParameters(
+    self_crop_box_parameters, mirror_crop_box_parameters);
+  cuda_pointcloud_preprocessor_->set3DUndistortion(use_3d_undistortion);
+
+  // initialize debug tool
+  {
+    using autoware::universe_utils::DebugPublisher;
+    using autoware::universe_utils::StopWatch;
+
+    stop_watch_ptr_ = std::make_unique<StopWatch<std::chrono::milliseconds>>();
+    debug_publisher_ = std::make_unique<DebugPublisher>(this, "cuda_pointcloud_preprocessor");
+    stop_watch_ptr_->tic("processing_time");
+  }
+}
+
+bool CudaPointcloudPreprocessorNode::getTransform(
+  const std::string & target_frame, const std::string & source_frame,
+  tf2::Transform * tf2_transform_ptr)
+{
+  if (target_frame == source_frame) {
+    tf2_transform_ptr->setOrigin(tf2::Vector3(0.0, 0.0, 0.0));
+    tf2_transform_ptr->setRotation(tf2::Quaternion(0.0, 0.0, 0.0, 1.0));
+    return true;
+  }
+
+  try {
+    const auto transform_msg =
+      tf2_buffer_.lookupTransform(target_frame, source_frame, tf2::TimePointZero);
+    tf2::convert(transform_msg.transform, *tf2_transform_ptr);
+  } catch (const tf2::TransformException & ex) {
+    RCLCPP_WARN(get_logger(), "%s", ex.what());
+    RCLCPP_ERROR(
+      get_logger(), "Please publish TF %s to %s", target_frame.c_str(), source_frame.c_str());
+
+    tf2_transform_ptr->setOrigin(tf2::Vector3(0.0, 0.0, 0.0));
+    tf2_transform_ptr->setRotation(tf2::Quaternion(0.0, 0.0, 0.0, 1.0));
+    return false;
+  }
+  return true;
+}
+
+void CudaPointcloudPreprocessorNode::twistCallback(
+  const geometry_msgs::msg::TwistWithCovarianceStamped::ConstSharedPtr twist_msg_ptr)
+{
+  twist_queue_.push_back(*twist_msg_ptr);
+
+  while (!twist_queue_.empty()) {
+    // for replay rosbag
+    if (
+      rclcpp::Time(twist_queue_.front().header.stamp) > rclcpp::Time(twist_msg_ptr->header.stamp)) {
+      twist_queue_.pop_front();
+    } else if (  // NOLINT
+      rclcpp::Time(twist_queue_.front().header.stamp) <
+      rclcpp::Time(twist_msg_ptr->header.stamp) - rclcpp::Duration::from_seconds(1.0)) {
+      twist_queue_.pop_front();
+    }
+    break;
+  }
+}
+
+void CudaPointcloudPreprocessorNode::imuCallback(
+  const sensor_msgs::msg::Imu::ConstSharedPtr imu_msg)
+{
+  tf2::Transform tf2_imu_link_to_base_link{};
+  getTransform(base_frame_, imu_msg->header.frame_id, &tf2_imu_link_to_base_link);
+  geometry_msgs::msg::TransformStamped::SharedPtr tf_base2imu_ptr =
+    std::make_shared<geometry_msgs::msg::TransformStamped>();
+  tf_base2imu_ptr->transform.rotation = tf2::toMsg(tf2_imu_link_to_base_link.getRotation());
+
+  geometry_msgs::msg::Vector3Stamped angular_velocity;
+  angular_velocity.vector = imu_msg->angular_velocity;
+
+  geometry_msgs::msg::Vector3Stamped transformed_angular_velocity;
+  tf2::doTransform(angular_velocity, transformed_angular_velocity, *tf_base2imu_ptr);
+  transformed_angular_velocity.header = imu_msg->header;
+  angular_velocity_queue_.push_back(transformed_angular_velocity);
+
+  while (!angular_velocity_queue_.empty()) {
+    // for rosbag replay
+    if (
+      rclcpp::Time(angular_velocity_queue_.front().header.stamp) >
+      rclcpp::Time(imu_msg->header.stamp)) {
+      angular_velocity_queue_.pop_front();
+    } else if (  // NOLINT
+      rclcpp::Time(angular_velocity_queue_.front().header.stamp) <
+      rclcpp::Time(imu_msg->header.stamp) - rclcpp::Duration::from_seconds(1.0)) {
+      angular_velocity_queue_.pop_front();
+    }
+    break;
+  }
+}
+
+void CudaPointcloudPreprocessorNode::pointcloudCallback(
+  const std::shared_ptr<const cuda_blackboard::CudaPointCloud2> input_pointcloud_msg_ptr)
+{
+  static_assert(
+    sizeof(InputPointType) == sizeof(autoware::point_types::PointXYZIRCAEDT),
+    "PointStruct and PointXYZIRCAEDT must have the same size");
+
+  stop_watch_ptr_->toc("processing_time", true);
+
+  // Make sure that the first twist is newer than the first point
+  InputPointType first_point;
+  cudaMemcpy(
+    &first_point, input_pointcloud_msg_ptr->data.get(), sizeof(InputPointType),
+    cudaMemcpyDeviceToHost);
+  double first_point_stamp = input_pointcloud_msg_ptr->header.stamp.sec +
+                             input_pointcloud_msg_ptr->header.stamp.nanosec * 1e-9 +
+                             first_point.time_stamp * 1e-9;
+
+  while (twist_queue_.size() > 1 &&
+         rclcpp::Time(twist_queue_.front().header.stamp).seconds() < first_point_stamp) {
+    twist_queue_.pop_front();
+  }
+
+  while (angular_velocity_queue_.size() > 1 &&
+         rclcpp::Time(angular_velocity_queue_.front().header.stamp).seconds() < first_point_stamp) {
+    angular_velocity_queue_.pop_front();
+  }
+
+  // Obtain the base link to input pointcloud transform
+  geometry_msgs::msg::TransformStamped transform_msg;
+
+  try {
+    transform_msg = tf2_buffer_.lookupTransform(
+      base_frame_, input_pointcloud_msg_ptr->header.frame_id, tf2::TimePointZero);
+  } catch (const tf2::TransformException & ex) {
+    RCLCPP_WARN(get_logger(), "%s", ex.what());
+    return;
+  }
+
+  auto output_pointcloud_ptr = cuda_pointcloud_preprocessor_->process(
+    *input_pointcloud_msg_ptr, transform_msg, twist_queue_, angular_velocity_queue_);
+  output_pointcloud_ptr->header.frame_id = base_frame_;
+
+  // Publish
+  pub_->publish(std::move(output_pointcloud_ptr));
+
+  if (debug_publisher_) {
+    const double processing_time_ms = stop_watch_ptr_->toc("processing_time", true);
+    debug_publisher_->publish<tier4_debug_msgs::msg::Float64Stamped>(
+      "debug/processing_time_ms", processing_time_ms);
+
+    double now_stamp_seconds = rclcpp::Time(this->get_clock()->now()).seconds();
+    double cloud_stamp_seconds = rclcpp::Time(input_pointcloud_msg_ptr->header.stamp).seconds();
+
+    debug_publisher_->publish<tier4_debug_msgs::msg::Float64Stamped>(
+      "debug/latency_ms", 1000.f * (now_stamp_seconds - cloud_stamp_seconds));
+  }
+
+  // Preallocate for next iteration
+  cuda_pointcloud_preprocessor_->preallocateOutput();
+}
+
+}  // namespace autoware::cuda_pointcloud_preprocessor
+
+#include "rclcpp_components/register_node_macro.hpp"
+RCLCPP_COMPONENTS_REGISTER_NODE(
+  autoware::cuda_pointcloud_preprocessor::CudaPointcloudPreprocessorNode)

From 774e0990b417ceca330fc0893738c7f43f72874d Mon Sep 17 00:00:00 2001
From: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>
Date: Mon, 25 Nov 2024 18:05:51 +0900
Subject: [PATCH 02/38] chore: fixed incorrect links

Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>
---
 .../docs/cuda-pointcloud-preprocessor.md                        | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-pointcloud-preprocessor.md b/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-pointcloud-preprocessor.md
index 459f15569d7ca..a82d49ba3d6ca 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-pointcloud-preprocessor.md
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-pointcloud-preprocessor.md
@@ -11,7 +11,7 @@ In particular, this node implements:
 
 ## Inner-workings / Algorithms
 
-As this node reimplements the functionalities of the CPU-version algorithms, please have a look at the documentations of [crop-box](/sensing/autoware_pointcloud_preprocessor/docs/crop-box-filter.md), [distortion correction](/sensing/autoware_pointcloud_preprocessor/docs/distortion-corrector.md), and [ring-based outlier filter](/sensing/autoware_pointcloud_preprocessor/docs/ring-outlier-filter.md) for more information about these algorithms.
+As this node reimplements the functionalities of the CPU-version algorithms, please have a look at the documentations of [crop-box](../../autoware_pointcloud_preprocessor/docs/crop-box-filter.md), [distortion correction](../../sensing/autoware_pointcloud_preprocessor/docs/distortion-corrector.md), and [ring-based outlier filter](../../sensing/autoware_pointcloud_preprocessor/docs/ring-outlier-filter.md) for more information about these algorithms.
 
 In addition to the individual algorithms previously mentioned, this node uses the `cuda_blackboard`, a cuda transport layer that enables a zero-copy mechanism between GPU and GPU memory for both input and output.
 

From be04f760fe40da54c03ca7b5546bf37f2012ac5d Mon Sep 17 00:00:00 2001
From: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>
Date: Mon, 25 Nov 2024 18:09:23 +0900
Subject: [PATCH 03/38] chore: fixed dead links pt2

Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>
---
 .../docs/cuda-pointcloud-preprocessor.md                        | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-pointcloud-preprocessor.md b/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-pointcloud-preprocessor.md
index a82d49ba3d6ca..69d800794701b 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-pointcloud-preprocessor.md
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-pointcloud-preprocessor.md
@@ -11,7 +11,7 @@ In particular, this node implements:
 
 ## Inner-workings / Algorithms
 
-As this node reimplements the functionalities of the CPU-version algorithms, please have a look at the documentations of [crop-box](../../autoware_pointcloud_preprocessor/docs/crop-box-filter.md), [distortion correction](../../sensing/autoware_pointcloud_preprocessor/docs/distortion-corrector.md), and [ring-based outlier filter](../../sensing/autoware_pointcloud_preprocessor/docs/ring-outlier-filter.md) for more information about these algorithms.
+As this node reimplements the functionalities of the CPU-version algorithms, please have a look at the documentations of [crop-box](../../autoware_pointcloud_preprocessor/docs/crop-box-filter.md), [distortion correction](../../autoware_pointcloud_preprocessor/docs/distortion-corrector.md), and [ring-based outlier filter](../../autoware_pointcloud_preprocessor/docs/ring-outlier-filter.md) for more information about these algorithms.
 
 In addition to the individual algorithms previously mentioned, this node uses the `cuda_blackboard`, a cuda transport layer that enables a zero-copy mechanism between GPU and GPU memory for both input and output.
 

From db02ec78f27b2da0f94b0e06b231cc79881bd25d Mon Sep 17 00:00:00 2001
From: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>
Date: Mon, 25 Nov 2024 18:21:44 +0900
Subject: [PATCH 04/38] chore: fixed spelling errors

Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>
---
 .../CMakeLists.txt                            |  2 +-
 .../README.md                                 |  2 +-
 .../cuda_pointcloud_preprocessor.hpp          | 12 ++--------
 .../cuda_pointcloud_preprocessor_node.hpp     |  1 -
 .../cuda_pointcloud_preprocessor.cu           | 22 +++++++++----------
 5 files changed, 15 insertions(+), 24 deletions(-)

diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/CMakeLists.txt b/sensing/autoware_cuda_pointcloud_preprocessor/CMakeLists.txt
index 06b1f5d67540f..d09f787a3b277 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/CMakeLists.txt
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/CMakeLists.txt
@@ -32,7 +32,7 @@ include_directories(
     ${CUDA_INCLUDE_DIRS}
 )
 
-list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr -diag-suppress 20012")
+list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr -diag-suppress 20012") # cSpell: ignore expt
 
 cuda_add_library(cuda_pointcloud_preprocessor_lib SHARED
   src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu
diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/README.md b/sensing/autoware_cuda_pointcloud_preprocessor/README.md
index d15082b69a661..25d29a678b1ce 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/README.md
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/README.md
@@ -4,7 +4,7 @@
 
 The pointcloud preprocessing implemented in `autoware_pointcloud_preprocessor` has been thoroughly tested in autoware. However, the latency it introduces does not scale well with modern LiDAR devices due to the high number of points they introduce.
 
-To alleviate this issue, this package reimplements most of the pipeline presented in `autoware_pointcloud_preprocessor` leveraging the use of GPGPUs. In particular, this package makes use of CUDA to provide accelerated versions of the already establishes implementations, while also maintaining compatibility with normal ROS nodes/topics.
+To alleviate this issue, this package reimplements most of the pipeline presented in `autoware_pointcloud_preprocessor` leveraging the use of GPGPUs. In particular, this package makes use of CUDA to provide accelerated versions of the already establishes implementations, while also maintaining compatibility with normal ROS nodes/topics. <!-- cSpell: ignore GPGPUs >
 
 ## Inner-workings / Algorithms
 
diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp
index b4ae4eecc304e..2a22c340a268c 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp
@@ -43,23 +43,15 @@ struct TwistStruct2D
   float cum_sin_theta;
   uint32_t last_stamp_nsec;  // relative to the start of the pointcloud
   uint32_t stamp_nsec;       // relative to the start of the pointcloud
-  float vx;
-  float vtheta;
+  float v_x;
+  float v_theta;
 };
 
 struct TwistStruct3D
 {
-  // Eigen::Affine3f cum_transform;
   float cum_transform_buffer[16];
-  /* float cum_x;
-  float cum_y;
-  float cum_theta; */
-  /* float cum_cos_theta;
-  float cum_sin_theta; */
   uint32_t last_stamp_nsec;  // relative to the start of the pointcloud
   uint32_t stamp_nsec;       // relative to the start of the pointcloud
-  /* float vx;
-  float vtheta; */
   float v[3];
   float w[3];
 };
diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp
index 5fdf4258eebcb..a5c5361c92515 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp
@@ -91,7 +91,6 @@ class CudaPointcloudPreprocessorNode : public rclcpp::Node
   tf2_ros::TransformListener tf2_listener_;
 
   std::string base_frame_;
-  bool imu_tranform_valid_{false};
   std::deque<geometry_msgs::msg::TwistWithCovarianceStamped> twist_queue_;
   std::deque<geometry_msgs::msg::Vector3Stamped> angular_velocity_queue_;
 
diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu
index ad8910aef5b76..ec2fe8fc03e3b 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu
@@ -176,8 +176,8 @@ __global__ void undistort2dKernel(
       point.time_stamp > twist.last_stamp_nsec ? point.time_stamp - twist.last_stamp_nsec : 0;
     double dt = 1e-9 * (dt_nsec);
 
-    theta += twist.vtheta * dt;
-    float d = twist.vx * dt;
+    theta += twist.v_theta * dt;
+    float d = twist.v_x * dt;
     x += d * cos(theta);
     y += d * sin(theta);
 
@@ -400,16 +400,16 @@ void CudaPointcloudPreprocessor::setupTwist2DStructs(
   for (; twist_index < twist_queue.size() ||
          angular_velocity_index < angular_velocity_queue.size();) {
     uint64_t twist_stamp, input_twist_global_stamp_nsec, angular_velocity_global_stamp_nsec;
-    float vx, vtheta;
+    float v_x, v_theta;
 
     if (twist_index < twist_queue.size()) {
       input_twist_global_stamp_nsec =
         1'000'000'000 * static_cast<uint64_t>(twist_queue[twist_index].header.stamp.sec) +
         static_cast<uint64_t>(twist_queue[twist_index].header.stamp.nanosec);
-      vx = twist_queue[twist_index].twist.twist.linear.x;
+      v_x = twist_queue[twist_index].twist.twist.linear.x;
     } else {
       input_twist_global_stamp_nsec = std::numeric_limits<uint64_t>::max();
-      vx = 0.0;
+      v_x = 0.0;
     }
 
     if (angular_velocity_index < angular_velocity_queue.size()) {
@@ -417,10 +417,10 @@ void CudaPointcloudPreprocessor::setupTwist2DStructs(
         1'000'000'000 *
           static_cast<uint64_t>(angular_velocity_queue[angular_velocity_index].header.stamp.sec) +
         static_cast<uint64_t>(angular_velocity_queue[angular_velocity_index].header.stamp.nanosec);
-      vtheta = angular_velocity_queue[angular_velocity_index].vector.z;
+      v_theta = angular_velocity_queue[angular_velocity_index].vector.z;
     } else {
       angular_velocity_global_stamp_nsec = std::numeric_limits<uint64_t>::max();
-      vtheta = 0.0;
+      v_theta = 0.0;
     }
 
     if (input_twist_global_stamp_nsec < angular_velocity_global_stamp_nsec) {
@@ -444,15 +444,15 @@ void CudaPointcloudPreprocessor::setupTwist2DStructs(
     uint32_t twist_from_pointcloud_start_nsec = twist_global_stamp_nsec - pointcloud_stamp_nsec;
 
     twist.stamp_nsec = twist_from_pointcloud_start_nsec;
-    twist.vx = vx;
-    twist.vtheta = vtheta;
+    twist.v_x = v_x;
+    twist.v_theta = v_theta;
     twist.last_stamp_nsec = last_stamp_nsec;
     host_twist_structs.push_back(twist);
 
     double dt_seconds = 1e-9 * (twist.stamp_nsec - last_stamp_nsec);
     last_stamp_nsec = twist.stamp_nsec;
-    cum_theta += vtheta * dt_seconds;
-    float d = twist.vx * dt_seconds;
+    cum_theta += v_theta * dt_seconds;
+    float d = twist.v_x * dt_seconds;
     cum_x += d * cos(cum_theta);
     cum_y += d * sin(cum_theta);
   }

From 5218a4aad764a816f4b4a14bd60e656910b444b4 Mon Sep 17 00:00:00 2001
From: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>
Date: Mon, 25 Nov 2024 18:33:54 +0900
Subject: [PATCH 05/38] chore: json schema fixes

Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>
---
 .../schema/cuda_pointcloud_preprocessor.schema.json       | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/schema/cuda_pointcloud_preprocessor.schema.json b/sensing/autoware_cuda_pointcloud_preprocessor/schema/cuda_pointcloud_preprocessor.schema.json
index 196f455ffe75b..cc94da3523512 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/schema/cuda_pointcloud_preprocessor.schema.json
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/schema/cuda_pointcloud_preprocessor.schema.json
@@ -3,7 +3,7 @@
   "title": "Parameters for the cuda_pointcloud_preprocessor",
   "type": "object",
   "definitions": {
-    "crop_box_filter": {
+    "cuda_pointcloud_preprocessor": {
       "type": "object",
       "properties": {
         "base_frame": {
@@ -23,19 +23,19 @@
         },
         "distance_ratio": {
           "type": "number",
-          "description": "distance_ratio",
+          "description": "distance ratio between consecutive points to determine is they belong to the same walk",
           "default": "1.03",
           "minimum": 0.0
         },
         "object_length_threshold": {
           "type": "number",
-          "description": "object_length_threshold",
+          "description": "minimum object length to be considered a valid cluster in meters",
           "default": "0.1",
           "minimum": 0.0
         },
         "num_points_threshold": {
           "type": "integer",
-          "description": "num_points_threshold",
+          "description": "threshold on the number of points in a walk to determine if it corresponds to a cluster",
           "default": "4",
           "minimum": 0
         },

From 4a9daaf74613d28c91a3fd3e171649069be19258 Mon Sep 17 00:00:00 2001
From: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>
Date: Tue, 26 Nov 2024 13:39:27 +0900
Subject: [PATCH 06/38] chore: removed comments and filled the fields

Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>
---
 .../cuda_organized_pointcloud_adapter_node.cpp      | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.cpp b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.cpp
index bd5717fbbcb5e..07f83230f21fe 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.cpp
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.cpp
@@ -86,17 +86,9 @@ void CudaOrganizedPointcloudAdapterNode::estimatePointcloudRingInfo(
   std::fill(next_ring_index_.begin(), next_ring_index_.end(), 0);
   buffer_.resize(num_rings_ * max_points_per_ring_);
 
-  /* if (device_buffer_ != nullptr) {
-    cudaFree(device_buffer_);
-  } */
-
   device_buffer_ = cuda_blackboard::make_unique<std::uint8_t[]>(
     num_rings_ * max_points_per_ring_ * sizeof(autoware::point_types::PointXYZIRCAEDT));
 
-  /* cudaMalloc(
-    device_buffer_,
-    num_rings_ * max_points_per_ring_ * sizeof(autoware::point_types::PointXYZIRCAEDT)); */
-
   RCLCPP_INFO_STREAM(
     get_logger(), "Estimated rings: " << num_rings_
                                       << ", max_points_per_ring: " << max_points_per_ring_
@@ -158,6 +150,7 @@ void CudaOrganizedPointcloudAdapterNode::pointcloudCallback(
     cudaMemcpyHostToDevice);
 
   auto cuda_pointcloud_msg_ptr = std::make_unique<cuda_blackboard::CudaPointCloud2>();
+  cuda_pointcloud_msg_ptr->fields = input_pointcloud_msg_ptr->fields;
   cuda_pointcloud_msg_ptr->width = max_points_per_ring_;
   cuda_pointcloud_msg_ptr->height = num_rings_;
   cuda_pointcloud_msg_ptr->point_step = sizeof(autoware::point_types::PointXYZIRCAEDT);
@@ -186,9 +179,7 @@ void CudaOrganizedPointcloudAdapterNode::pointcloudCallback(
   // Allocate cuda memory
   device_buffer_ = cuda_blackboard::make_unique<std::uint8_t[]>(
     num_rings_ * max_points_per_ring_ * sizeof(autoware::point_types::PointXYZIRCAEDT));
-  /* cudaMalloc(
-    &device_buffer_,
-    num_rings_ * max_points_per_ring_ * sizeof(autoware::point_types::PointXYZIRCAEDT)); */
+
   // Clear indexes
   std::fill(next_ring_index_.begin(), next_ring_index_.end(), 0);
 

From 84a4b9f11c9d562297c1e5445e1b246e5ebdd03d Mon Sep 17 00:00:00 2001
From: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>
Date: Tue, 26 Nov 2024 14:24:58 +0900
Subject: [PATCH 07/38] fix: fixed the adapter for the case when the number of
 points in the pointcloud changes after the first iteration

Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>
---
 .../cuda_organized_pointcloud_adapter_node.cpp                  | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.cpp b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.cpp
index 07f83230f21fe..c5864129d05c9 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.cpp
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.cpp
@@ -119,7 +119,7 @@ bool CudaOrganizedPointcloudAdapterNode::orderPointcloud(
     point_overflow |= raw_index >= max_points_per_ring_;
 
     buffer_[ring * max_points_per_ring_ + index] = point;
-    next_ring_index_[ring] = (index + 1) % max_points_per_ring_;
+    next_ring_index_[ring] = raw_index + 1;
   }
 
   return !ring_overflow && !point_overflow;

From b3c1d7279e7a26fc7a134567b17c1dec5402d064 Mon Sep 17 00:00:00 2001
From: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>
Date: Mon, 23 Dec 2024 14:04:55 +0900
Subject: [PATCH 08/38] feat: used the cuda host allocators for aster host to
 device copies

Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>
---
 .../cuda_organized_pointcloud_adapter_node.hpp     |  3 +--
 .../cuda_organized_pointcloud_adapter_node.cpp     | 14 ++++++++++----
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.hpp
index 8930a1f87e63b..7a31442f26f1d 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.hpp
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.hpp
@@ -64,8 +64,7 @@ class CudaOrganizedPointcloudAdapterNode : public rclcpp::Node
   std::size_t max_points_per_ring_{0};
 
   std::vector<std::size_t> next_ring_index_;
-  std::vector<autoware::point_types::PointXYZIRCAEDT> buffer_;
-  // autoware::point_types::PointXYZIRCAEDT * device_buffer_;
+  cuda_blackboard::HostUniquePtr<autoware::point_types::PointXYZIRCAEDT[]> host_buffer_;
   cuda_blackboard::CudaUniquePtr<std::uint8_t[]> device_buffer_;
 
   std::unique_ptr<autoware::universe_utils::StopWatch<std::chrono::milliseconds>> stop_watch_ptr_;
diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.cpp b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.cpp
index c5864129d05c9..b7eeccafa2d5a 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.cpp
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.cpp
@@ -84,7 +84,8 @@ void CudaOrganizedPointcloudAdapterNode::estimatePointcloudRingInfo(
 
   next_ring_index_.resize(num_rings_);
   std::fill(next_ring_index_.begin(), next_ring_index_.end(), 0);
-  buffer_.resize(num_rings_ * max_points_per_ring_);
+  host_buffer_ = cuda_blackboard::make_host_unique<autoware::point_types::PointXYZIRCAEDT[]>(
+    num_rings_ * max_points_per_ring_);
 
   device_buffer_ = cuda_blackboard::make_unique<std::uint8_t[]>(
     num_rings_ * max_points_per_ring_ * sizeof(autoware::point_types::PointXYZIRCAEDT));
@@ -106,6 +107,8 @@ bool CudaOrganizedPointcloudAdapterNode::orderPointcloud(
   bool ring_overflow = false;
   bool point_overflow = false;
 
+  autoware::point_types::PointXYZIRCAEDT * buffer = host_buffer_.get();
+
   for (std::size_t i = 0; i < input_pointcloud_msg_ptr->width * input_pointcloud_msg_ptr->height;
        i++) {
     const autoware::point_types::PointXYZIRCAEDT & point = input_buffer[i];
@@ -118,7 +121,7 @@ bool CudaOrganizedPointcloudAdapterNode::orderPointcloud(
     ring_overflow |= raw_ring >= num_rings_;
     point_overflow |= raw_index >= max_points_per_ring_;
 
-    buffer_[ring * max_points_per_ring_ + index] = point;
+    buffer[ring * max_points_per_ring_ + index] = point;
     next_ring_index_[ring] = raw_index + 1;
   }
 
@@ -145,7 +148,7 @@ void CudaOrganizedPointcloudAdapterNode::pointcloudCallback(
 
   // Copy to cuda memory
   cudaMemcpy(
-    device_buffer_.get(), buffer_.data(),
+    device_buffer_.get(), host_buffer_.get(),
     num_rings_ * max_points_per_ring_ * sizeof(autoware::point_types::PointXYZIRCAEDT),
     cudaMemcpyHostToDevice);
 
@@ -184,7 +187,10 @@ void CudaOrganizedPointcloudAdapterNode::pointcloudCallback(
   std::fill(next_ring_index_.begin(), next_ring_index_.end(), 0);
 
   // Clear pointcloud buffer
-  std::fill(buffer_.begin(), buffer_.end(), autoware::point_types::PointXYZIRCAEDT{});
+  auto host_buffer = host_buffer_.get();
+  for (std::size_t i = 0; i < num_rings_ * max_points_per_ring_; i++) {
+    host_buffer[i] = autoware::point_types::PointXYZIRCAEDT{};
+  }
 }
 
 }  // namespace autoware::cuda_organized_pointcloud_adapter

From 887f162beba008759e3b56d30af99005c5448ad5 Mon Sep 17 00:00:00 2001
From: Kenzo Lobos Tsunekawa <kenzo.lobos@tier4.jp>
Date: Fri, 10 Jan 2025 18:29:18 +0900
Subject: [PATCH 09/38] Update
 sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-pointcloud-preprocessor.md

Co-authored-by: Max Schmeller <6088931+mojomex@users.noreply.github.com>
---
 .../docs/cuda-pointcloud-preprocessor.md                        | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-pointcloud-preprocessor.md b/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-pointcloud-preprocessor.md
index 69d800794701b..2d360321627d2 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-pointcloud-preprocessor.md
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-pointcloud-preprocessor.md
@@ -41,6 +41,6 @@ In addition to the individual algorithms previously mentioned, this node uses th
 
 ## Assumptions / Known limits
 
-- The CUDA implementations, while following the original CPU ones, will not offer the same numerical results, and small approximations were needed to maximize the GPU use.
+- The CUDA implementations, while following the original CPU ones, will not offer the same numerical results, and small approximations were needed to maximize GPU usage.
 - This node expects that the input pointcloud follows the `autoware::point_types::PointXYZIRCAEDT` layout and the output pointcloud will use the `autoware::point_types::PointXYZIRC` layout defined in the `autoware_point_types` package.
 - The input pointcloud must be a 2D tensor where each row represents a different channel/ring with its points in non-decreasing azimuth order. Invalid points should contain 0-values. The `cuda_organized_pointcloud_adapter` provides such a pointcloud.

From 0b46fb68140c843b5d03d2df21a76d8eaa6c6bfe Mon Sep 17 00:00:00 2001
From: Kenzo Lobos Tsunekawa <kenzo.lobos@tier4.jp>
Date: Fri, 10 Jan 2025 18:31:45 +0900
Subject: [PATCH 10/38] Update
 sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu

Co-authored-by: Manato Hirabayashi <3022416+manato@users.noreply.github.com>
---
 .../cuda_pointcloud_preprocessor.cu                             | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu
index ec2fe8fc03e3b..d7091f88d3c5a 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu
@@ -107,7 +107,7 @@ __global__ void cropBoxKernel(
 }
 
 __global__ void combineMasksKernel(
-  uint32_t * mask1, uint32_t * mask2, uint32_t * mask3, int num_points, uint32_t * output_mask)
+  const uint32_t * __restrict__ mask1, const uint32_t * __restrict__ mask2, const uint32_t * __restrict__ mask3, int num_points, uint32_t * output_mask)
 {
   int idx = blockIdx.x * blockDim.x + threadIdx.x;
   if (idx < num_points) {

From 970055e43ec88b8159868f32f169c877fd3311cf Mon Sep 17 00:00:00 2001
From: Kenzo Lobos Tsunekawa <kenzo.lobos@tier4.jp>
Date: Fri, 10 Jan 2025 18:32:35 +0900
Subject: [PATCH 11/38] Update
 sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu

Co-authored-by: Manato Hirabayashi <3022416+manato@users.noreply.github.com>
---
 .../cuda_pointcloud_preprocessor.cu                             | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu
index d7091f88d3c5a..b1793d56e9a8a 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu
@@ -71,7 +71,7 @@ __host__ __device__ Eigen::Matrix4f transformationMatrixFromVelocity(
 }
 
 __global__ void transformPointsKernel(
-  const InputPointType * input_points, InputPointType * output_points, int num_points,
+  const InputPointType * __restrict__ input_points, InputPointType * output_points, int num_points,
   TransformStruct transform)
 {
   int idx = blockIdx.x * blockDim.x + threadIdx.x;

From 71b8a6fbc3d8927240ed05fc75de7f16e6fb9dc0 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 10 Jan 2025 09:35:04 +0000
Subject: [PATCH 12/38] style(pre-commit): autofix

---
 .../cuda_pointcloud_preprocessor.cu                            | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu
index b1793d56e9a8a..4c769fd24471f 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu
@@ -107,7 +107,8 @@ __global__ void cropBoxKernel(
 }
 
 __global__ void combineMasksKernel(
-  const uint32_t * __restrict__ mask1, const uint32_t * __restrict__ mask2, const uint32_t * __restrict__ mask3, int num_points, uint32_t * output_mask)
+  const uint32_t * __restrict__ mask1, const uint32_t * __restrict__ mask2,
+  const uint32_t * __restrict__ mask3, int num_points, uint32_t * output_mask)
 {
   int idx = blockIdx.x * blockDim.x + threadIdx.x;
   if (idx < num_points) {

From d6ee47f7c7fc2c205eb12db7b7d47dfe3cc9b663 Mon Sep 17 00:00:00 2001
From: Kenzo Lobos Tsunekawa <kenzo.lobos@tier4.jp>
Date: Fri, 10 Jan 2025 18:39:19 +0900
Subject: [PATCH 13/38] Update
 sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-pointcloud-preprocessor.md

Co-authored-by: Max Schmeller <6088931+mojomex@users.noreply.github.com>
---
 .../docs/cuda-pointcloud-preprocessor.md                        | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-pointcloud-preprocessor.md b/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-pointcloud-preprocessor.md
index 2d360321627d2..4a7e2d070ccb6 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-pointcloud-preprocessor.md
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-pointcloud-preprocessor.md
@@ -5,7 +5,7 @@
 This node implements all standard pointcloud preprocessing algorithms applied to a single LiDAR's pointcloud in CUDA.
 In particular, this node implements:
 
-- crop boxing (ego-vehicle and ego-vehicle's mirrors)
+- box cropping (ego-vehicle and ego-vehicle's mirrors)
 - distortion correction
 - ring-based outlier filtering
 

From 3f6953ff2e3c7db46309aa30b3e05a3bad72a23c Mon Sep 17 00:00:00 2001
From: Kenzo Lobos Tsunekawa <kenzo.lobos@tier4.jp>
Date: Fri, 10 Jan 2025 18:46:14 +0900
Subject: [PATCH 14/38] Update
 sensing/autoware_cuda_pointcloud_preprocessor/README.md

Co-authored-by: Max Schmeller <6088931+mojomex@users.noreply.github.com>
---
 sensing/autoware_cuda_pointcloud_preprocessor/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/README.md b/sensing/autoware_cuda_pointcloud_preprocessor/README.md
index 25d29a678b1ce..984da0bcaa518 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/README.md
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/README.md
@@ -4,7 +4,7 @@
 
 The pointcloud preprocessing implemented in `autoware_pointcloud_preprocessor` has been thoroughly tested in autoware. However, the latency it introduces does not scale well with modern LiDAR devices due to the high number of points they introduce.
 
-To alleviate this issue, this package reimplements most of the pipeline presented in `autoware_pointcloud_preprocessor` leveraging the use of GPGPUs. In particular, this package makes use of CUDA to provide accelerated versions of the already establishes implementations, while also maintaining compatibility with normal ROS nodes/topics. <!-- cSpell: ignore GPGPUs >
+To alleviate this issue, this package reimplements most of the pipeline presented in `autoware_pointcloud_preprocessor` leveraging the use of GPGPUs. In particular, this package makes use of CUDA to provide accelerated versions of the already established implementations, while also maintaining compatibility with normal ROS nodes/topics. <!-- cSpell: ignore GPGPUs >
 
 ## Inner-workings / Algorithms
 

From db99ba64a6528a38a2be98b5ec8cbdfe40e82017 Mon Sep 17 00:00:00 2001
From: Kenzo Lobos Tsunekawa <kenzo.lobos@tier4.jp>
Date: Fri, 10 Jan 2025 18:47:54 +0900
Subject: [PATCH 15/38] Update
 sensing/autoware_cuda_pointcloud_preprocessor/README.md

Co-authored-by: Max Schmeller <6088931+mojomex@users.noreply.github.com>
---
 sensing/autoware_cuda_pointcloud_preprocessor/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/README.md b/sensing/autoware_cuda_pointcloud_preprocessor/README.md
index 984da0bcaa518..3c87883971e7e 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/README.md
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/README.md
@@ -13,7 +13,7 @@ A detailed description of each filter's algorithm is available in the following
 | Filter Name                       | Description                                                                                                                                  | Detail                                            |
 | --------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------- |
 | cuda_organized_pointcloud_adapter | Organizes a pointcloud per ring/channel, so that the memory layout allows parallel processing in cuda                                        | [link](docs/cuda-organized-pointcloud-adapter.md) |
-| cuda_pointcloud_preprocessor      | Implements the cropping, distortion correction, and outlier filtering (ring-based) of the `autoware_pointcloud_preprocessor`'s cpu versions. | [link](docs/cuda-pointcloud-preprocessor.md)      |
+| cuda_pointcloud_preprocessor      | Implements the cropping, distortion correction, and outlier filtering (ring-based) of the `autoware_pointcloud_preprocessor`'s CPU versions. | [link](docs/cuda-pointcloud-preprocessor.md)      |
 
 ## (Optional) Future extensions / Unimplemented parts
 

From 9ce2e06ca3dc0e2aa23296cdf10ce528d1eacc2b Mon Sep 17 00:00:00 2001
From: Kenzo Lobos Tsunekawa <kenzo.lobos@tier4.jp>
Date: Fri, 10 Jan 2025 19:09:27 +0900
Subject: [PATCH 16/38] Update
 sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu

Co-authored-by: Max Schmeller <6088931+mojomex@users.noreply.github.com>
---
 .../cuda_pointcloud_preprocessor.cu                         | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu
index 4c769fd24471f..b43d000f82614 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu
@@ -98,11 +98,7 @@ __global__ void cropBoxKernel(
     const float y = d_points[idx].y;
     const float z = d_points[idx].z;
 
-    if (x > min_x && x < max_x && y > min_y && y < max_y && z > min_z && z < max_z) {
-      output_mask[idx] = 0;
-    } else {
-      output_mask[idx] = 1;
-    }
+    output_mask[idx] = (x <= min_x || x >= max_x) || (y <= min_y || y >= max_y) || (z <= min_z || z >= max_z);
   }
 }
 

From f4749002d9da7753d088845dec1d5a3ac63c596e Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 10 Jan 2025 10:12:18 +0000
Subject: [PATCH 17/38] style(pre-commit): autofix

---
 .../cuda_pointcloud_preprocessor.cu                            | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu
index b43d000f82614..07450a3b77ecc 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu
@@ -98,7 +98,8 @@ __global__ void cropBoxKernel(
     const float y = d_points[idx].y;
     const float z = d_points[idx].z;
 
-    output_mask[idx] = (x <= min_x || x >= max_x) || (y <= min_y || y >= max_y) || (z <= min_z || z >= max_z);
+    output_mask[idx] =
+      (x <= min_x || x >= max_x) || (y <= min_y || y >= max_y) || (z <= min_z || z >= max_z);
   }
 }
 

From 594143567849e13a2fc91496b2fa74b95cb031e9 Mon Sep 17 00:00:00 2001
From: Kenzo Lobos Tsunekawa <kenzo.lobos@tier4.jp>
Date: Tue, 21 Jan 2025 09:16:00 +0900
Subject: [PATCH 18/38] Update
 sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu

Co-authored-by: Manato Hirabayashi <3022416+manato@users.noreply.github.com>
---
 .../cuda_pointcloud_preprocessor.cu                             | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu
index 07450a3b77ecc..a37d05ff91313 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu
@@ -89,7 +89,7 @@ __global__ void transformPointsKernel(
 }
 
 __global__ void cropBoxKernel(
-  InputPointType * d_points, uint32_t * output_mask, int num_points, float min_x, float min_y,
+  InputPointType * __restrict__ d_points, uint32_t * output_mask, int num_points, float min_x, float min_y,
   float min_z, float max_x, float max_y, float max_z)
 {
   int idx = blockIdx.x * blockDim.x + threadIdx.x;

From 90ad0b23c606764ffa9c3f4089bfb38f258ed84a Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 21 Jan 2025 00:18:40 +0000
Subject: [PATCH 19/38] style(pre-commit): autofix

---
 .../cuda_pointcloud_preprocessor.cu                           | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu
index a37d05ff91313..e78b65d53d4c4 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu
@@ -89,8 +89,8 @@ __global__ void transformPointsKernel(
 }
 
 __global__ void cropBoxKernel(
-  InputPointType * __restrict__ d_points, uint32_t * output_mask, int num_points, float min_x, float min_y,
-  float min_z, float max_x, float max_y, float max_z)
+  InputPointType * __restrict__ d_points, uint32_t * output_mask, int num_points, float min_x,
+  float min_y, float min_z, float max_x, float max_y, float max_z)
 {
   int idx = blockIdx.x * blockDim.x + threadIdx.x;
   if (idx < num_points) {

From 13aff8cdce6b1112e033a4a7444dfcfcb2e45eac Mon Sep 17 00:00:00 2001
From: Kenzo Lobos Tsunekawa <kenzo.lobos@tier4.jp>
Date: Tue, 21 Jan 2025 09:27:51 +0900
Subject: [PATCH 20/38] Update
 sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu

Co-authored-by: Manato Hirabayashi <3022416+manato@users.noreply.github.com>
---
 .../cuda_pointcloud_preprocessor.cu           | 36 +++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu
index e78b65d53d4c4..e7fee664266a1 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu
@@ -341,8 +341,44 @@ CudaPointcloudPreprocessor::CudaPointcloudPreprocessor()
   point_fields_.push_back(intensity_field);
   point_fields_.push_back(return_type_field);
   point_fields_.push_back(channel_field);
+  cudaMemPoolProps pool_props;
+  memset(&pool_props, 0, sizeof(cudaMemPoolProps));
+  pool_props.allocType = cudaMemAllocationTypePinned;
+  pool_props.handleTypes = cudaMemHandleTypePosixFileDescriptor;
+
+  pool_props.location.type = cudaMemLocationTypeDevice;
+  cudaGetDevice(&(pool_props.location.id));
+
+  // cudaMemPool_t device_memory_pool_ needs to be declared as a member of this class
+  cudaMemPoolCreate(&device_memory_pool_, &pool_props);
+  MemoryPoolAllocator<TwistStruct2D> allocator_2d(device_memory_pool_);
+  MemoryPoolAllocator<TwistStruct3D> allocator_3d(device_memory_pool_);
+  device_twist_2d_structs_
+      = thrust::device_vector<TwistStruct2D, MemoryPoolAllocator<TwistStruct2D>>(allocator_2d);
+  device_twist_3d_structs_
+      = thrust::device_vector<TwistStruct3D, MemoryPoolAllocator<TwistStruct3D>>(allocator_3d);
 }
 
+template <typename T>
+class MemoryPoolAllocator {
+ public:
+  using value_type = T;
+  MemoryPoolAllocator(cudaMemPool_t pool) : m_pool(pool) {}
+
+  T* allocate(std::size_t n) {
+    void* ptr = nullptr;
+    cudaMallocFromPoolAsync(&ptr, n * sizeof(T), m_pool, cudaStreamDefault);
+    return static_cast<T*>(ptr);
+  }
+
+  void deallocate(T* ptr, std::size_t) {
+    cudaFreeAsync(ptr, cudaStreamDefault);
+  }
+
+ protected:
+  cudaMemPool_t m_pool;
+};  // MemoryPoolAllocator
+
 void CudaPointcloudPreprocessor::setCropBoxParameters(
   const CropBoxParameters & self_crop_box_parameters,
   const CropBoxParameters & mirror_crop_box_parameters)

From f0e4c205ad2de03e282e9f8027b385c4ebee7889 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 21 Jan 2025 00:30:23 +0000
Subject: [PATCH 21/38] style(pre-commit): autofix

---
 .../cuda_pointcloud_preprocessor.cu           | 26 +++++++++----------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu
index e7fee664266a1..38d6457f10025 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu
@@ -353,29 +353,29 @@ CudaPointcloudPreprocessor::CudaPointcloudPreprocessor()
   cudaMemPoolCreate(&device_memory_pool_, &pool_props);
   MemoryPoolAllocator<TwistStruct2D> allocator_2d(device_memory_pool_);
   MemoryPoolAllocator<TwistStruct3D> allocator_3d(device_memory_pool_);
-  device_twist_2d_structs_
-      = thrust::device_vector<TwistStruct2D, MemoryPoolAllocator<TwistStruct2D>>(allocator_2d);
-  device_twist_3d_structs_
-      = thrust::device_vector<TwistStruct3D, MemoryPoolAllocator<TwistStruct3D>>(allocator_3d);
+  device_twist_2d_structs_ =
+    thrust::device_vector<TwistStruct2D, MemoryPoolAllocator<TwistStruct2D>>(allocator_2d);
+  device_twist_3d_structs_ =
+    thrust::device_vector<TwistStruct3D, MemoryPoolAllocator<TwistStruct3D>>(allocator_3d);
 }
 
 template <typename T>
-class MemoryPoolAllocator {
- public:
+class MemoryPoolAllocator
+{
+public:
   using value_type = T;
   MemoryPoolAllocator(cudaMemPool_t pool) : m_pool(pool) {}
 
-  T* allocate(std::size_t n) {
-    void* ptr = nullptr;
+  T * allocate(std::size_t n)
+  {
+    void * ptr = nullptr;
     cudaMallocFromPoolAsync(&ptr, n * sizeof(T), m_pool, cudaStreamDefault);
-    return static_cast<T*>(ptr);
+    return static_cast<T *>(ptr);
   }
 
-  void deallocate(T* ptr, std::size_t) {
-    cudaFreeAsync(ptr, cudaStreamDefault);
-  }
+  void deallocate(T * ptr, std::size_t) { cudaFreeAsync(ptr, cudaStreamDefault); }
 
- protected:
+protected:
   cudaMemPool_t m_pool;
 };  // MemoryPoolAllocator
 

From d05b988ba18630a3831cddf899c657e4680ec9fc Mon Sep 17 00:00:00 2001
From: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>
Date: Tue, 21 Jan 2025 09:52:02 +0900
Subject: [PATCH 22/38] chore: fixed code compilation to reflect
 Hirabayashi-san's  memory pool proposal

Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>
---
 .../cuda_pointcloud_preprocessor.hpp          | 21 +++++++++++++++++++
 .../cuda_pointcloud_preprocessor.cu           | 21 -------------------
 2 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp
index 2a22c340a268c..b53ff31074f24 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp
@@ -89,6 +89,26 @@ struct RingOutlierFilterParameters
   std::size_t num_points_threshold{0};
 };
 
+template <typename T>
+class MemoryPoolAllocator
+{
+public:
+  using value_type = T;
+  MemoryPoolAllocator(cudaMemPool_t pool) : m_pool(pool) {}
+
+  T * allocate(std::size_t n)
+  {
+    void * ptr = nullptr;
+    cudaMallocFromPoolAsync(&ptr, n * sizeof(T), m_pool, cudaStreamDefault);
+    return static_cast<T *>(ptr);
+  }
+
+  void deallocate(T * ptr, std::size_t) { cudaFreeAsync(ptr, cudaStreamDefault); }
+
+protected:
+  cudaMemPool_t m_pool;
+};
+
 class CudaPointcloudPreprocessor
 {
 public:
@@ -131,6 +151,7 @@ class CudaPointcloudPreprocessor
   std::vector<sensor_msgs::msg::PointField> point_fields_{};
   std::unique_ptr<cuda_blackboard::CudaPointCloud2> output_pointcloud_ptr_{};
 
+  cudaMemPool_t device_memory_pool_;
   thrust::device_vector<InputPointType> device_transformed_points_{};
   thrust::device_vector<OutputPointType> device_output_points_{};
   thrust::device_vector<uint32_t> device_self_crop_mask_{};
diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu
index 38d6457f10025..596b1ce01332b 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu
@@ -349,7 +349,6 @@ CudaPointcloudPreprocessor::CudaPointcloudPreprocessor()
   pool_props.location.type = cudaMemLocationTypeDevice;
   cudaGetDevice(&(pool_props.location.id));
 
-  // cudaMemPool_t device_memory_pool_ needs to be declared as a member of this class
   cudaMemPoolCreate(&device_memory_pool_, &pool_props);
   MemoryPoolAllocator<TwistStruct2D> allocator_2d(device_memory_pool_);
   MemoryPoolAllocator<TwistStruct3D> allocator_3d(device_memory_pool_);
@@ -359,26 +358,6 @@ CudaPointcloudPreprocessor::CudaPointcloudPreprocessor()
     thrust::device_vector<TwistStruct3D, MemoryPoolAllocator<TwistStruct3D>>(allocator_3d);
 }
 
-template <typename T>
-class MemoryPoolAllocator
-{
-public:
-  using value_type = T;
-  MemoryPoolAllocator(cudaMemPool_t pool) : m_pool(pool) {}
-
-  T * allocate(std::size_t n)
-  {
-    void * ptr = nullptr;
-    cudaMallocFromPoolAsync(&ptr, n * sizeof(T), m_pool, cudaStreamDefault);
-    return static_cast<T *>(ptr);
-  }
-
-  void deallocate(T * ptr, std::size_t) { cudaFreeAsync(ptr, cudaStreamDefault); }
-
-protected:
-  cudaMemPool_t m_pool;
-};  // MemoryPoolAllocator
-
 void CudaPointcloudPreprocessor::setCropBoxParameters(
   const CropBoxParameters & self_crop_box_parameters,
   const CropBoxParameters & mirror_crop_box_parameters)

From 5e0cc4758ea69a7c2fd710b3b318710d2f80b93e Mon Sep 17 00:00:00 2001
From: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>
Date: Thu, 23 Jan 2025 16:16:46 +0900
Subject: [PATCH 23/38] feat: generalized the number of crop boxes. For two at
 least, the new approach is actually faster

Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>
---
 .../cuda_pointcloud_preprocessor.hpp          | 29 ++++---
 .../cuda_pointcloud_preprocessor.cu           | 86 +++++++++++--------
 .../cuda_pointcloud_preprocessor_node.cpp     | 46 ++++++----
 3 files changed, 94 insertions(+), 67 deletions(-)

diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp
index b53ff31074f24..94c5915d08619 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp
@@ -74,12 +74,12 @@ struct TransformStruct
 
 struct CropBoxParameters
 {
-  float min_x{std::nanf("")};
-  float max_x{std::nanf("")};
-  float min_y{std::nanf("")};
-  float max_y{std::nanf("")};
-  float min_z{std::nanf("")};
-  float max_z{std::nanf("")};
+  float min_x;
+  float max_x;
+  float min_y;
+  float max_y;
+  float min_z;
+  float max_z;
 };
 
 struct RingOutlierFilterParameters
@@ -115,9 +115,7 @@ class CudaPointcloudPreprocessor
   CudaPointcloudPreprocessor();
   ~CudaPointcloudPreprocessor() = default;
 
-  void setCropBoxParameters(
-    const CropBoxParameters & self_crop_box_parameters,
-    const CropBoxParameters & mirror_crop_box_parameters);
+  void setCropBoxParameters(const std::vector<CropBoxParameters> & crop_box_parameters);
   void setRingOutlierFilterParameters(const RingOutlierFilterParameters & ring_outlier_parameters);
   void set3DUndistortion(bool value);
 
@@ -151,15 +149,20 @@ class CudaPointcloudPreprocessor
   std::vector<sensor_msgs::msg::PointField> point_fields_{};
   std::unique_ptr<cuda_blackboard::CudaPointCloud2> output_pointcloud_ptr_{};
 
+  cudaStream_t stream_;
+  int max_blocks_per_grid_{};
+  const int threads_per_block_{256};
   cudaMemPool_t device_memory_pool_;
   thrust::device_vector<InputPointType> device_transformed_points_{};
   thrust::device_vector<OutputPointType> device_output_points_{};
-  thrust::device_vector<uint32_t> device_self_crop_mask_{};
-  thrust::device_vector<uint32_t> device_mirror_crop_mask_{};
-  thrust::device_vector<uint32_t> device_ring_outlier_mask_{};
-  thrust::device_vector<uint32_t> device_indices_{};
+  thrust::device_vector<std::uint32_t> device_crop_mask_{};
+  thrust::device_vector<std::uint32_t> device_ring_outlier_mask_{};
+  thrust::device_vector<std::uint32_t> device_indices_{};
   thrust::device_vector<TwistStruct2D> device_twist_2d_structs_{};
   thrust::device_vector<TwistStruct3D> device_twist_3d_structs_{};
+
+  thrust::device_vector<CropBoxParameters> host_crop_box_structs_{};
+  thrust::device_vector<CropBoxParameters> device_crop_box_structs_{};
 };
 
 }  // namespace autoware::cuda_pointcloud_preprocessor
diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu
index 596b1ce01332b..0e2cd45eda0f8 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu
@@ -89,27 +89,40 @@ __global__ void transformPointsKernel(
 }
 
 __global__ void cropBoxKernel(
-  InputPointType * __restrict__ d_points, uint32_t * output_mask, int num_points, float min_x,
-  float min_y, float min_z, float max_x, float max_y, float max_z)
+  InputPointType * __restrict__ d_points, std::uint32_t * __restrict__ output_mask, int num_points,
+  const CropBoxParameters * __restrict__ crop_box_parameters_ptr, int num_crop_boxes)
 {
-  int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx < num_points) {
+  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num_points;
+       idx += blockDim.x * gridDim.x) {
     const float x = d_points[idx].x;
     const float y = d_points[idx].y;
     const float z = d_points[idx].z;
 
-    output_mask[idx] =
-      (x <= min_x || x >= max_x) || (y <= min_y || y >= max_y) || (z <= min_z || z >= max_z);
+    std::uint32_t mask = 1;
+
+    for (int i = 0; i < num_crop_boxes; i++) {
+      const CropBoxParameters & crop_box_parameters = crop_box_parameters_ptr[i];
+      const float & min_x = crop_box_parameters.min_x;
+      const float & min_y = crop_box_parameters.min_y;
+      const float & min_z = crop_box_parameters.min_z;
+      const float & max_x = crop_box_parameters.max_x;
+      const float & max_y = crop_box_parameters.max_y;
+      const float & max_z = crop_box_parameters.max_z;
+      mask &=
+        (x <= min_x || x >= max_x) || (y <= min_y || y >= max_y) || (z <= min_z || z >= max_z);
+    }
+
+    output_mask[idx] = mask;
   }
 }
 
 __global__ void combineMasksKernel(
-  const uint32_t * __restrict__ mask1, const uint32_t * __restrict__ mask2,
-  const uint32_t * __restrict__ mask3, int num_points, uint32_t * output_mask)
+  const uint32_t * __restrict__ mask1, const uint32_t * __restrict__ mask2, int num_points,
+  uint32_t * __restrict__ output_mask)
 {
   int idx = blockIdx.x * blockDim.x + threadIdx.x;
   if (idx < num_points) {
-    output_mask[idx] = mask1[idx] & mask2[idx] & mask3[idx];
+    output_mask[idx] = mask1[idx] & mask2[idx];
   }
 }
 
@@ -341,6 +354,13 @@ CudaPointcloudPreprocessor::CudaPointcloudPreprocessor()
   point_fields_.push_back(intensity_field);
   point_fields_.push_back(return_type_field);
   point_fields_.push_back(channel_field);
+
+  cudaStreamCreate(&stream_);
+
+  int num_sm;
+  cudaDeviceGetAttribute(&num_sm, cudaDevAttrMultiProcessorCount, 0);
+  max_blocks_per_grid_ = 4 * num_sm;  // used for strided loops
+
   cudaMemPoolProps pool_props;
   memset(&pool_props, 0, sizeof(cudaMemPoolProps));
   pool_props.allocType = cudaMemAllocationTypePinned;
@@ -359,11 +379,10 @@ CudaPointcloudPreprocessor::CudaPointcloudPreprocessor()
 }
 
 void CudaPointcloudPreprocessor::setCropBoxParameters(
-  const CropBoxParameters & self_crop_box_parameters,
-  const CropBoxParameters & mirror_crop_box_parameters)
+  const std::vector<CropBoxParameters> & crop_box_parameters)
 {
-  self_crop_box_parameters_ = self_crop_box_parameters;
-  mirror_crop_box_parameters_ = mirror_crop_box_parameters;
+  host_crop_box_structs_ = crop_box_parameters;
+  device_crop_box_structs_ = host_crop_box_structs_;
 }
 
 void CudaPointcloudPreprocessor::setRingOutlierFilterParameters(
@@ -584,8 +603,7 @@ std::unique_ptr<cuda_blackboard::CudaPointCloud2> CudaPointcloudPreprocessor::pr
     max_points_per_ring_ = input_pointcloud_msg.width;
 
     device_transformed_points_.resize(num_input_points);
-    device_self_crop_mask_.resize(num_input_points);
-    device_mirror_crop_mask_.resize(num_input_points);
+    device_crop_mask_.resize(num_input_points);
     device_ring_outlier_mask_.resize(num_input_points);
     device_indices_.resize(num_input_points);
 
@@ -629,49 +647,41 @@ std::unique_ptr<cuda_blackboard::CudaPointCloud2> CudaPointcloudPreprocessor::pr
     thrust::raw_pointer_cast(device_twist_3d_structs_.data());
   InputPointType * device_transformed_points =
     thrust::raw_pointer_cast(device_transformed_points_.data());
-  uint32_t * device_self_crop_mask = thrust::raw_pointer_cast(device_self_crop_mask_.data());
-  uint32_t * device_mirror_crop_mask = thrust::raw_pointer_cast(device_mirror_crop_mask_.data());
+  std::uint32_t * device_crop_mask = thrust::raw_pointer_cast(device_crop_mask_.data());
   uint32_t * device_ring_outlier_mask = thrust::raw_pointer_cast(device_ring_outlier_mask_.data());
   uint32_t * device_indices = thrust::raw_pointer_cast(device_indices_.data());
 
-  const int threadsPerBlock = 256;
-  const int blocksPerGrid = (num_input_points + threadsPerBlock - 1) / threadsPerBlock;
+  const int blocks_per_grid = (num_input_points + threads_per_block_ - 1) / threads_per_block_;
 
-  transformPointsKernel<<<blocksPerGrid, threadsPerBlock>>>(
+  transformPointsKernel<<<blocks_per_grid, threads_per_block_, 0, stream_>>>(
     device_input_points, device_transformed_points, num_input_points, transform_struct);
 
-  cropBoxKernel<<<blocksPerGrid, threadsPerBlock>>>(
-    device_transformed_points, device_self_crop_mask, num_input_points,
-    self_crop_box_parameters_.min_x, self_crop_box_parameters_.min_y,
-    self_crop_box_parameters_.min_z, self_crop_box_parameters_.max_x,
-    self_crop_box_parameters_.max_y, self_crop_box_parameters_.max_z);
-
-  cropBoxKernel<<<blocksPerGrid, threadsPerBlock>>>(
-    device_transformed_points, device_mirror_crop_mask, num_input_points,
-    mirror_crop_box_parameters_.min_x, mirror_crop_box_parameters_.min_y,
-    mirror_crop_box_parameters_.min_z, mirror_crop_box_parameters_.max_x,
-    mirror_crop_box_parameters_.max_y, mirror_crop_box_parameters_.max_z);
+  int crop_box_blocks_per_grid = std::min(blocks_per_grid, max_blocks_per_grid_);
+  cropBoxKernel<<<crop_box_blocks_per_grid, threads_per_block_, 0, stream_>>>(
+    device_transformed_points, device_crop_mask, num_input_points,
+    thrust::raw_pointer_cast(device_crop_box_structs_.data()), host_crop_box_structs_.size());
 
   if (use_3d_undistortion_ && device_twist_3d_structs_.size() > 0) {
-    undistort3dKernel<<<blocksPerGrid, threadsPerBlock>>>(
+    undistort3dKernel<<<blocks_per_grid, threads_per_block_, 0, stream_>>>(
       device_transformed_points, num_input_points, device_twist_3d_structs,
       device_twist_3d_structs_.size());
   } else if (!use_3d_undistortion_ && device_twist_2d_structs_.size() > 0) {
-    undistort2dKernel<<<blocksPerGrid, threadsPerBlock>>>(
+    undistort2dKernel<<<blocks_per_grid, threads_per_block_, 0, stream_>>>(
       device_transformed_points, num_input_points, device_twist_2d_structs,
       device_twist_2d_structs_.size());
   }
 
-  ringOutlierFilterKernel<<<blocksPerGrid, threadsPerBlock>>>(
+  ringOutlierFilterKernel<<<blocks_per_grid, threads_per_block_, 0, stream_>>>(
     device_transformed_points, device_ring_outlier_mask, max_rings_, max_points_per_ring_,
     ring_outlier_parameters_.distance_ratio,
     ring_outlier_parameters_.object_length_threshold *
       ring_outlier_parameters_.object_length_threshold,
     ring_outlier_parameters_.num_points_threshold);
 
-  combineMasksKernel<<<blocksPerGrid, threadsPerBlock>>>(
-    device_self_crop_mask, device_mirror_crop_mask, device_ring_outlier_mask, num_input_points,
-    device_ring_outlier_mask);
+  combineMasksKernel<<<blocks_per_grid, threads_per_block_, 0, stream_>>>(
+    device_crop_mask, device_ring_outlier_mask, num_input_points, device_ring_outlier_mask);
+
+  cudaStreamSynchronize(stream_);
 
   thrust::inclusive_scan(
     thrust::device, device_ring_outlier_mask, device_ring_outlier_mask + num_input_points,
@@ -683,7 +693,7 @@ std::unique_ptr<cuda_blackboard::CudaPointCloud2> CudaPointcloudPreprocessor::pr
     cudaMemcpyDeviceToHost);
 
   if (num_output_points > 0) {
-    extractInputPointsToOutputPoints_indicesKernel<<<blocksPerGrid, threadsPerBlock>>>(
+    extractInputPointsToOutputPoints_indicesKernel<<<blocks_per_grid, threads_per_block_>>>(
       device_transformed_points, device_ring_outlier_mask, device_indices, num_input_points,
       reinterpret_cast<OutputPointType *>(output_pointcloud_ptr_->data.get()));
   }
diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp
index fdfae020f1e19..2f5cb881e97fa 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp
@@ -52,20 +52,35 @@ CudaPointcloudPreprocessorNode::CudaPointcloudPreprocessorNode(
   ring_outlier_filter_parameters.num_points_threshold =
     declare_parameter<int>("num_points_threshold");
 
-  CropBoxParameters self_crop_box_parameters, mirror_crop_box_parameters;
-  self_crop_box_parameters.min_x = declare_parameter<float>("self_crop.min_x");
-  self_crop_box_parameters.min_y = declare_parameter<float>("self_crop.min_y");
-  self_crop_box_parameters.min_z = declare_parameter<float>("self_crop.min_z");
-  self_crop_box_parameters.max_x = declare_parameter<float>("self_crop.max_x");
-  self_crop_box_parameters.max_y = declare_parameter<float>("self_crop.max_y");
-  self_crop_box_parameters.max_z = declare_parameter<float>("self_crop.max_z");
-
-  mirror_crop_box_parameters.min_x = declare_parameter<float>("mirror_crop.min_x");
-  mirror_crop_box_parameters.min_y = declare_parameter<float>("mirror_crop.min_y");
-  mirror_crop_box_parameters.min_z = declare_parameter<float>("mirror_crop.min_z");
-  mirror_crop_box_parameters.max_x = declare_parameter<float>("mirror_crop.max_x");
-  mirror_crop_box_parameters.max_y = declare_parameter<float>("mirror_crop.max_y");
-  mirror_crop_box_parameters.max_z = declare_parameter<float>("mirror_crop.max_z");
+  const auto crop_box_min_x_vector = declare_parameter<std::vector<double>>("crop_box.min_x");
+  const auto crop_box_min_y_vector = declare_parameter<std::vector<double>>("crop_box.min_y");
+  const auto crop_box_min_z_vector = declare_parameter<std::vector<double>>("crop_box.min_z");
+
+  const auto crop_box_max_x_vector = declare_parameter<std::vector<double>>("crop_box.max_x");
+  const auto crop_box_max_y_vector = declare_parameter<std::vector<double>>("crop_box.max_y");
+  const auto crop_box_max_z_vector = declare_parameter<std::vector<double>>("crop_box.max_z");
+
+  if (
+    crop_box_min_x_vector.size() != crop_box_min_y_vector.size() ||
+    crop_box_min_x_vector.size() != crop_box_min_z_vector.size() ||
+    crop_box_min_x_vector.size() != crop_box_max_x_vector.size() ||
+    crop_box_min_x_vector.size() != crop_box_max_y_vector.size() ||
+    crop_box_min_x_vector.size() != crop_box_max_z_vector.size()) {
+    throw std::runtime_error("Crop box parameters must have the same size");
+  }
+
+  std::vector<CropBoxParameters> crop_box_parameters;
+
+  for (std::size_t i = 0; i < crop_box_min_x_vector.size(); i++) {
+    CropBoxParameters parameters;
+    parameters.min_x = crop_box_min_x_vector[i];
+    parameters.min_y = crop_box_min_y_vector[i];
+    parameters.min_z = crop_box_min_z_vector[i];
+    parameters.max_x = crop_box_max_x_vector[i];
+    parameters.max_y = crop_box_max_y_vector[i];
+    parameters.max_z = crop_box_max_z_vector[i];
+    crop_box_parameters.push_back(parameters);
+  }
 
   bool use_3d_undistortion = declare_parameter<bool>("use_3d_distortion_correction");
   bool use_imu = declare_parameter<bool>("use_imu");
@@ -92,8 +107,7 @@ CudaPointcloudPreprocessorNode::CudaPointcloudPreprocessorNode(
 
   cuda_pointcloud_preprocessor_ = std::make_unique<CudaPointcloudPreprocessor>();
   cuda_pointcloud_preprocessor_->setRingOutlierFilterParameters(ring_outlier_filter_parameters);
-  cuda_pointcloud_preprocessor_->setCropBoxParameters(
-    self_crop_box_parameters, mirror_crop_box_parameters);
+  cuda_pointcloud_preprocessor_->setCropBoxParameters(crop_box_parameters);
   cuda_pointcloud_preprocessor_->set3DUndistortion(use_3d_undistortion);
 
   // initialize debug tool

From 6baaea2a8da07873a43ab74b1e866576050f3cbb Mon Sep 17 00:00:00 2001
From: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>
Date: Thu, 23 Jan 2025 16:34:21 +0900
Subject: [PATCH 24/38] chore: updated config, schema, and handled the null
 case in a specialized way

Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>
---
 .../cuda_pointcloud_preprocessor.param.yaml   | 18 ++--
 .../cuda_pointcloud_preprocessor.schema.json  | 88 ++++++-------------
 .../cuda_pointcloud_preprocessor.cu           | 10 ++-
 3 files changed, 42 insertions(+), 74 deletions(-)

diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/config/cuda_pointcloud_preprocessor.param.yaml b/sensing/autoware_cuda_pointcloud_preprocessor/config/cuda_pointcloud_preprocessor.param.yaml
index 22358cfadf2a7..66ce309caa127 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/config/cuda_pointcloud_preprocessor.param.yaml
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/config/cuda_pointcloud_preprocessor.param.yaml
@@ -8,15 +8,9 @@
     object_length_threshold: 0.1
     num_points_threshold: 4
 
-    self_crop.min_x: 1.0
-    self_crop.min_y: 1.0
-    self_crop.min_z: 1.0
-    self_crop.max_x: -1.0
-    self_crop.max_y: -1.0
-    self_crop.max_z: -1.0
-    mirror_crop.min_x: 1.0
-    mirror_crop.min_y: 1.0
-    mirror_crop.min_z: 1.0
-    mirror_crop.max_x: -1.0
-    mirror_crop.max_y: -1.0
-    mirror_crop.max_z: -1.0
+    crop_box.min_x: [1.0]
+    crop_box.min_y: [1.0]
+    crop_box.min_z: [1.0]
+    crop_box.max_x: [-1.0]
+    crop_box.max_y: [-1.0]
+    crop_box.max_z: [-1.0]
diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/schema/cuda_pointcloud_preprocessor.schema.json b/sensing/autoware_cuda_pointcloud_preprocessor/schema/cuda_pointcloud_preprocessor.schema.json
index cc94da3523512..b8e597b9afdb2 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/schema/cuda_pointcloud_preprocessor.schema.json
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/schema/cuda_pointcloud_preprocessor.schema.json
@@ -39,65 +39,35 @@
           "default": "4",
           "minimum": 0
         },
-        "self_crop.min_x": {
-          "type": "number",
-          "description": "minimum x-coordinate value for self cropping in meters",
-          "default": "-1.0"
-        },
-        "self_crop.min_y": {
-          "type": "number",
-          "description": "minimum y-coordinate value for self cropping in meters",
-          "default": "-1.0"
-        },
-        "self_crop.min_z": {
-          "type": "number",
-          "description": "minimum z-coordinate value for self cropping in meters",
-          "default": "-1.0"
-        },
-        "self_crop.max_x": {
-          "type": "number",
-          "description": "maximum x-coordinate value for self cropping in meters",
-          "default": "1.0"
-        },
-        "self_crop.max_y": {
-          "type": "number",
-          "description": "maximum y-coordinate value for self cropping in meters",
-          "default": "1.0"
-        },
-        "self_crop.max_z": {
-          "type": "number",
-          "description": "maximum z-coordinate value for self cropping in meters",
-          "default": "1.0"
-        },
-        "mirror_crop.min_x": {
-          "type": "number",
-          "description": "minimum x-coordinate value for mirror cropping in meters",
-          "default": "-1.0"
-        },
-        "mirror_crop.min_y": {
-          "type": "number",
-          "description": "minimum y-coordinate value for mirror cropping in meters",
-          "default": "-1.0"
-        },
-        "mirror_crop.min_z": {
-          "type": "number",
-          "description": "minimum z-coordinate value for mirror cropping in meters",
-          "default": "-1.0"
-        },
-        "mirror_crop.max_x": {
-          "type": "number",
-          "description": "maximum x-coordinate value for mirror cropping in meters",
-          "default": "1.0"
-        },
-        "mirror_crop.max_y": {
-          "type": "number",
-          "description": "maximum y-coordinate value for mirror cropping in meters",
-          "default": "1.0"
-        },
-        "mirror_crop.max_z": {
-          "type": "number",
-          "description": "maximum z-coordinate value for mirror cropping in meters",
-          "default": "1.0"
+        "crop_box.min_x": {
+          "type": "array",
+          "description": "An array in which each elemt is the minimum x value in a crop box",
+          "default": [1.0]
+        },
+        "crop_box.min_y": {
+          "type": "array",
+          "description": "An array in which each elemt is the minimum y value in a crop box",
+          "default": [1.0]
+        },
+        "crop_box.min_z": {
+          "type": "array",
+          "description": "An array in which each elemt is the minimum z value in a crop box",
+          "default": [1.0]
+        },
+        "crop_box.max_x": {
+          "type": "array",
+          "description": "An array in which each elemt is the maximum x value in a crop box",
+          "default": [-1.0]
+        },
+        "crop_box.max_y": {
+          "type": "array",
+          "description": "An array in which each elemt is the maximum y value in a crop box",
+          "default": [-1.0]
+        },
+        "crop_box.max_z": {
+          "type": "array",
+          "description": "An array in which each elemt is the maximum z value in a crop box",
+          "default": [-1.0]
         }
       },
       "required": [
diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu
index 0e2cd45eda0f8..8d4526377ab44 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu
@@ -657,9 +657,13 @@ std::unique_ptr<cuda_blackboard::CudaPointCloud2> CudaPointcloudPreprocessor::pr
     device_input_points, device_transformed_points, num_input_points, transform_struct);
 
   int crop_box_blocks_per_grid = std::min(blocks_per_grid, max_blocks_per_grid_);
-  cropBoxKernel<<<crop_box_blocks_per_grid, threads_per_block_, 0, stream_>>>(
-    device_transformed_points, device_crop_mask, num_input_points,
-    thrust::raw_pointer_cast(device_crop_box_structs_.data()), host_crop_box_structs_.size());
+  if (host_crop_box_structs_.size() > 0) {
+    cropBoxKernel<<<crop_box_blocks_per_grid, threads_per_block_, 0, stream_>>>(
+      device_transformed_points, device_crop_mask, num_input_points,
+      thrust::raw_pointer_cast(device_crop_box_structs_.data()), host_crop_box_structs_.size());
+  } else {
+    thrust::fill(thrust::device, device_crop_mask, device_crop_mask + num_input_points, 1);
+  }
 
   if (use_3d_undistortion_ && device_twist_3d_structs_.size() > 0) {
     undistort3dKernel<<<blocks_per_grid, threads_per_block_, 0, stream_>>>(

From 59144d8ce1f9e4a6a33e5dfb493ab26fcefdc111 Mon Sep 17 00:00:00 2001
From: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>
Date: Tue, 4 Feb 2025 11:51:13 +0900
Subject: [PATCH 25/38] feat: moving the pointcloud organization into gpu

Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>
---
 .../cuda_pointcloud_preprocessor.hpp          |  21 +
 .../cuda_pointcloud_preprocessor_node.hpp     |  50 +-
 .../cuda_pointcloud_preprocessor.cu           | 104 +++++
 .../cuda_pointcloud_preprocessor_node.cpp     | 428 +++++++++++++++++-
 4 files changed, 593 insertions(+), 10 deletions(-)

diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp
index 94c5915d08619..3dbc968d9376a 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp
@@ -29,11 +29,32 @@
 #include <thrust/device_vector.h>
 
 #include <cmath>
+#include <cstdint>
 #include <deque>
 
 namespace autoware::cuda_pointcloud_preprocessor
 {
 
+void organizeLaunch(
+  const InputPointType * __restrict__ input_points, std::uint32_t * index_tensor,
+  std::int32_t * ring_indexes, std::int32_t initial_max_rings, std::int32_t * output_max_rings,
+  std::int32_t initial_max_points_per_ring, std::int32_t * output_max_points_per_ring,
+  int num_points, cudaStream_t & stream);
+
+std::size_t querySortWorkspace(
+  int num_items, int num_segments, int * d_offsets, std::uint32_t * d_keys_in,
+  std::uint32_t * d_keys_out);
+
+void sortLaunch(
+  int num_items, int num_segments, int * d_offsets, std::uint32_t * d_keys_in,
+  std::uint32_t * d_keys_out, std::uint8_t * d_temp_storage, std::size_t temp_storage_bytes,
+  cudaStream_t & stream);  // e.g., [-, -, -, -, -, -, -]
+
+void gatherLaunch(
+  const InputPointType * __restrict__ input_points, const std::uint32_t * __restrict__ index_tensor,
+  InputPointType * __restrict__ output_points, int num_rings, int max_points_per_ring,
+  cudaStream_t & stream);
+
 struct TwistStruct2D
 {
   float cum_x;
diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp
index a5c5361c92515..6a45a79b63c65 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp
@@ -81,8 +81,17 @@ class CudaPointcloudPreprocessorNode : public rclcpp::Node
     const std::string & target_frame, const std::string & source_frame,
     tf2::Transform * tf2_transform_ptr);
 
+  void estimatePointcloudRingInfo(
+    const sensor_msgs::msg::PointCloud2::ConstSharedPtr input_pointcloud_msg_ptr);
+
+  bool orderPointcloud(
+    const sensor_msgs::msg::PointCloud2::ConstSharedPtr input_pointcloud_msg_ptr);
+
   // Callback
-  void pointcloudCallback(const std::shared_ptr<const cuda_blackboard::CudaPointCloud2> cuda_msg);
+  void pointcloudCallback(
+    const sensor_msgs::msg::PointCloud2::ConstSharedPtr input_pointcloud_msg_ptr);
+  void cudaPointcloudCallback(
+    const std::shared_ptr<const cuda_blackboard::CudaPointCloud2> cuda_msg);
   void twistCallback(
     const geometry_msgs::msg::TwistWithCovarianceStamped::ConstSharedPtr pointcloud_msg);
   void imuCallback(const sensor_msgs::msg::Imu::ConstSharedPtr imu_msg);
@@ -94,18 +103,55 @@ class CudaPointcloudPreprocessorNode : public rclcpp::Node
   std::deque<geometry_msgs::msg::TwistWithCovarianceStamped> twist_queue_;
   std::deque<geometry_msgs::msg::Vector3Stamped> angular_velocity_queue_;
 
+  std::size_t num_rings_{0};
+  std::size_t max_points_per_ring_{0};
+  std::vector<std::size_t> next_ring_index_;
+
   // Subscriber
   rclcpp::Subscription<geometry_msgs::msg::TwistWithCovarianceStamped>::SharedPtr twist_sub_{};
   rclcpp::Subscription<sensor_msgs::msg::Imu>::SharedPtr imu_sub_{};
+  rclcpp::Subscription<sensor_msgs::msg::PointCloud2>::SharedPtr pointcloud_sub_{};
+
+  cuda_blackboard::CudaUniquePtr<InputPointType[]> points_device_;
+  cuda_blackboard::CudaUniquePtr<InputPointType[]> organized_points_device_;
+  cuda_blackboard::CudaUniquePtr<std::int32_t[]> ring_index_device_;
+  cuda_blackboard::CudaUniquePtr<std::uint32_t[]> indexes_tensor_device_;
+  cuda_blackboard::CudaUniquePtr<std::uint32_t[]> sorted_indexes_tensor_device_;
+  cuda_blackboard::CudaUniquePtr<std::int32_t[]> segment_offsets_device_;
+  cuda_blackboard::CudaUniquePtr<std::int32_t> max_rings_device_;
+  cuda_blackboard::CudaUniquePtr<std::int32_t> max_points_per_ring_device_;
+
+  cuda_blackboard::CudaUniquePtr<std::uint8_t[]> sort_workspace_device_;
+  std::size_t sort_workspace_bytes_{0};
+
+  std::int32_t test_num_rings_{0};
+  std::int32_t test_max_points_per_ring_{0};
+  cudaStream_t stream_;
+  std::int32_t capacity_{0};
+
+  cuda_blackboard::HostUniquePtr<autoware::point_types::PointXYZIRCAEDT[]> host_buffer_;
+  // cuda_blackboard::CudaUniquePtr<std::uint8_t[]> device_buffer_; TODO(knzo25) replaced by a
+  // CudaPointCloud
+  std::shared_ptr<cuda_blackboard::CudaPointCloud2> input_pointcloud_ptr_;
 
   // CUDA pub & sub
   std::unique_ptr<cuda_blackboard::CudaBlackboardPublisher<cuda_blackboard::CudaPointCloud2>> pub_;
-  std::unique_ptr<cuda_blackboard::CudaBlackboardSubscriber<cuda_blackboard::CudaPointCloud2>> sub_;
+  // std::unique_ptr<cuda_blackboard::CudaBlackboardSubscriber<cuda_blackboard::CudaPointCloud2>>
+  // sub_;
 
   std::unique_ptr<CudaPointcloudPreprocessor> cuda_pointcloud_preprocessor_;
 
   std::unique_ptr<autoware::universe_utils::StopWatch<std::chrono::milliseconds>> stop_watch_ptr_;
   std::unique_ptr<autoware::universe_utils::DebugPublisher> debug_publisher_;
+
+  double organize_time_cuda_ms_{0.0};
+  double copy_time_cuda_ms_{0.0};
+
+  double organize_time_ms_{0.0};
+  double copy_time_ms_{0.0};
+  double processing_time_ms_{0.0};
+  double kernel_time_ms_{0.0};
+  int iteration_{0};
 };
 
 }  // namespace autoware::cuda_pointcloud_preprocessor
diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu
index 8d4526377ab44..d99d384b0c9ab 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu
@@ -18,6 +18,7 @@
 #include <Eigen/Core>
 #include <Eigen/Dense>
 #include <Eigen/Geometry>
+#include <cub/cub.cuh>
 
 #include <cuda_runtime.h>
 #include <tf2/utils.h>
@@ -25,6 +26,7 @@
 #include <thrust/host_vector.h>
 #include <thrust/reduce.h>
 #include <thrust/scan.h>
+// #include <cub/device/device_segmented_radix_sort.cuh>
 
 namespace autoware::cuda_pointcloud_preprocessor
 {
@@ -70,6 +72,108 @@ __host__ __device__ Eigen::Matrix4f transformationMatrixFromVelocity(
   return transformation;
 }
 
+__global__ void organizeKernel(
+  const InputPointType * __restrict__ input_points, std::uint32_t * index_tensor,
+  std::int32_t * ring_indexes, std::int32_t initial_max_rings, std::int32_t * output_max_rings,
+  std::int32_t initial_max_points_per_ring, std::int32_t * output_max_points_per_ring,
+  int num_points)
+{
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx >= num_points) {
+    return;
+  }
+
+  auto ring = input_points[idx].channel;
+
+  if (ring >= initial_max_rings) {
+    atomicMax(output_max_rings, ring);
+    return;
+  }
+
+  int next_offset = atomicAdd(&ring_indexes[ring], 1);
+
+  if (next_offset >= initial_max_points_per_ring) {
+    atomicMax(output_max_points_per_ring, next_offset);
+    return;
+  }
+
+  index_tensor[ring * initial_max_points_per_ring + next_offset] = idx;
+}
+
+void organizeLaunch(
+  const InputPointType * __restrict__ input_points, std::uint32_t * index_tensor,
+  std::int32_t * ring_indexes, std::int32_t initial_max_rings, std::int32_t * output_max_rings,
+  std::int32_t initial_max_points_per_ring, std::int32_t * output_max_points_per_ring,
+  int num_points, cudaStream_t & stream)
+{
+  int threads_per_block = 256;
+  int blocks_per_grid = (num_points + threads_per_block - 1) / threads_per_block;
+  organizeKernel<<<blocks_per_grid, threads_per_block>>>(
+    input_points, index_tensor, ring_indexes, initial_max_rings, output_max_rings,
+    initial_max_points_per_ring, output_max_points_per_ring, num_points);
+}
+
+std::size_t querySortWorkspace(
+  int num_items, int num_segments, int * d_offsets, std::uint32_t * d_keys_in,
+  std::uint32_t * d_keys_out)
+{
+  // Determine temporary device storage requirements
+  void * d_temp_storage = nullptr;
+  size_t temp_storage_bytes = 0;
+  cub::DeviceSegmentedRadixSort::SortKeys(
+    d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items, num_segments, d_offsets,
+    d_offsets + 1);
+
+  return temp_storage_bytes;
+}
+
+void sortLaunch(
+  int num_items, int num_segments, int * d_offsets, std::uint32_t * d_keys_in,
+  std::uint32_t * d_keys_out, std::uint8_t * d_temp_storage, std::size_t temp_storage_bytes,
+  cudaStream_t & stream)
+{
+  // Allocate temporary storage
+  // cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+  // Run sorting operation
+  cub::DeviceSegmentedRadixSort::SortKeys(
+    reinterpret_cast<void *>(d_temp_storage), temp_storage_bytes, d_keys_in, d_keys_out, num_items,
+    num_segments, d_offsets, d_offsets + 1, 0, sizeof(std::uint32_t) * 8, stream);
+}
+
+__global__ void gatherKernel(
+  const InputPointType * __restrict__ input_points, const std::uint32_t * __restrict__ index_tensor,
+  InputPointType * __restrict__ output_points, int num_rings, int max_points_per_ring)
+{
+  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx >= num_rings * max_points_per_ring) {
+    return;
+  }
+
+  const int ring = idx / max_points_per_ring;
+  const int point = idx % max_points_per_ring;
+
+  const std::uint32_t input_idx = index_tensor[ring * max_points_per_ring + point];
+
+  if (input_idx < std::numeric_limits<std::uint32_t>::max()) {
+    output_points[ring * max_points_per_ring + point] = input_points[input_idx];
+  } else {
+    output_points[ring * max_points_per_ring + point].distance = 0.0f;
+  }
+}
+
+void gatherLaunch(
+  const InputPointType * __restrict__ input_points, const std::uint32_t * __restrict__ index_tensor,
+  InputPointType * __restrict__ output_points, int num_rings, int max_points_per_ring,
+  cudaStream_t & stream)
+{
+  int threads_per_block = 256;
+  int blocks_per_grid =
+    (num_rings * max_points_per_ring + threads_per_block - 1) / threads_per_block;
+  gatherKernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(
+    input_points, index_tensor, output_points, num_rings, max_points_per_ring);
+}
+
 __global__ void transformPointsKernel(
   const InputPointType * __restrict__ input_points, InputPointType * output_points, int num_points,
   TransformStruct transform)
diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp
index 2f5cb881e97fa..1aa634d8807dd 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp
@@ -27,6 +27,8 @@
 #include <pcl/point_types.h>
 
 #include <chrono>
+#include <chrono>  // TODO(knzo25): remove this
+#include <limits>  // TODO(knzo25): remove this
 #include <string>
 #include <vector>
 
@@ -85,24 +87,29 @@ CudaPointcloudPreprocessorNode::CudaPointcloudPreprocessorNode(
   bool use_3d_undistortion = declare_parameter<bool>("use_3d_distortion_correction");
   bool use_imu = declare_parameter<bool>("use_imu");
 
-  // Subscriber
-  sub_ =
-    std::make_unique<cuda_blackboard::CudaBlackboardSubscriber<cuda_blackboard::CudaPointCloud2>>(
-      *this, "~/input/pointcloud", false,
-      std::bind(&CudaPointcloudPreprocessorNode::pointcloudCallback, this, _1));
-
   // Publisher
   pub_ =
     std::make_unique<cuda_blackboard::CudaBlackboardPublisher<cuda_blackboard::CudaPointCloud2>>(
       *this, "~/output/pointcloud");
+
+  // Subscriber
+  rclcpp::SubscriptionOptions sub_options;
+  sub_options.use_intra_process_comm = rclcpp::IntraProcessSetting::Enable;
+
+  pointcloud_sub_ = this->create_subscription<sensor_msgs::msg::PointCloud2>(
+    "~/input/pointcloud", rclcpp::SensorDataQoS{}.keep_last(1),
+    std::bind(&CudaPointcloudPreprocessorNode::pointcloudCallback, this, std::placeholders::_1),
+    sub_options);
   twist_sub_ = this->create_subscription<geometry_msgs::msg::TwistWithCovarianceStamped>(
     "~/input/twist", 10,
-    std::bind(&CudaPointcloudPreprocessorNode::twistCallback, this, std::placeholders::_1));
+    std::bind(&CudaPointcloudPreprocessorNode::twistCallback, this, std::placeholders::_1),
+    sub_options);
 
   if (use_imu) {
     imu_sub_ = this->create_subscription<sensor_msgs::msg::Imu>(
       "~/input/imu", 10,
-      std::bind(&CudaPointcloudPreprocessorNode::imuCallback, this, std::placeholders::_1));
+      std::bind(&CudaPointcloudPreprocessorNode::imuCallback, this, std::placeholders::_1),
+      sub_options);
   }
 
   cuda_pointcloud_preprocessor_ = std::make_unique<CudaPointcloudPreprocessor>();
@@ -119,6 +126,32 @@ CudaPointcloudPreprocessorNode::CudaPointcloudPreprocessorNode(
     debug_publisher_ = std::make_unique<DebugPublisher>(this, "cuda_pointcloud_preprocessor");
     stop_watch_ptr_->tic("processing_time");
   }
+
+  // New organized pointcloud method
+  test_num_rings_ = 1;
+  test_max_points_per_ring_ = 1;
+  ring_index_device_ = cuda_blackboard::make_unique<std::int32_t[]>(test_num_rings_);
+  indexes_tensor_device_ =
+    cuda_blackboard::make_unique<std::uint32_t[]>(test_num_rings_ * test_max_points_per_ring_);
+  sorted_indexes_tensor_device_ =
+    cuda_blackboard::make_unique<std::uint32_t[]>(test_num_rings_ * test_max_points_per_ring_);
+  segment_offsets_device_ = cuda_blackboard::make_unique<std::int32_t[]>(test_num_rings_ + 1);
+  max_rings_device_ = cuda_blackboard::make_unique<std::int32_t>();
+  max_points_per_ring_device_ = cuda_blackboard::make_unique<std::int32_t>();
+  organized_points_device_ =
+    cuda_blackboard::make_unique<InputPointType[]>(test_num_rings_ * test_max_points_per_ring_);
+
+  cudaStreamCreate(&stream_);
+  cudaMemsetAsync(max_rings_device_.get(), 0, sizeof(std::int32_t), stream_);
+  cudaMemsetAsync(max_points_per_ring_device_.get(), 0, sizeof(std::int32_t), stream_);
+
+  sort_workspace_bytes_ = querySortWorkspace(
+    test_num_rings_ * test_max_points_per_ring_, test_num_rings_, segment_offsets_device_.get(),
+    indexes_tensor_device_.get(), sorted_indexes_tensor_device_.get());
+  sort_workspace_device_ = cuda_blackboard::make_unique<std::uint8_t[]>(sort_workspace_bytes_);
+
+  capacity_ = 8192;
+  points_device_ = cuda_blackboard::make_unique<InputPointType[]>(capacity_);
 }
 
 bool CudaPointcloudPreprocessorNode::getTransform(
@@ -198,9 +231,356 @@ void CudaPointcloudPreprocessorNode::imuCallback(
   }
 }
 
+void CudaPointcloudPreprocessorNode::estimatePointcloudRingInfo(
+  const sensor_msgs::msg::PointCloud2::ConstSharedPtr input_pointcloud_msg_ptr)
+{
+  const autoware::point_types::PointXYZIRCAEDT * input_buffer =
+    reinterpret_cast<const autoware::point_types::PointXYZIRCAEDT *>(
+      input_pointcloud_msg_ptr->data.data());
+
+  std::size_t max_ring = 0;
+
+  for (std::size_t i = 0; i < input_pointcloud_msg_ptr->width * input_pointcloud_msg_ptr->height;
+       i++) {
+    const autoware::point_types::PointXYZIRCAEDT & point = input_buffer[i];
+    const std::size_t ring = static_cast<std::size_t>(point.channel);
+    max_ring = std::max(max_ring, ring);
+  }
+
+  // Set max rings to the next power of two
+  num_rings_ = std::pow(2, std::ceil(std::log2(max_ring + 1)));
+  num_rings_ = std::max(num_rings_, static_cast<std::size_t>(16));
+  std::vector<std::size_t> ring_points(num_rings_, 0);
+
+  for (std::size_t i = 0; i < input_pointcloud_msg_ptr->width * input_pointcloud_msg_ptr->height;
+       i++) {
+    const autoware::point_types::PointXYZIRCAEDT & point = input_buffer[i];
+    const std::size_t ring = point.channel;
+    ring_points[ring]++;
+  }
+
+  // Set max points per ring to the next multiple of 512
+  max_points_per_ring_ = *std::max_element(ring_points.begin(), ring_points.end());
+  max_points_per_ring_ = std::max(max_points_per_ring_, static_cast<std::size_t>(512));
+  max_points_per_ring_ = (max_points_per_ring_ + 511) / 512 * 512;
+
+  next_ring_index_.resize(num_rings_);
+  std::fill(next_ring_index_.begin(), next_ring_index_.end(), 0);
+  host_buffer_ = cuda_blackboard::make_host_unique<autoware::point_types::PointXYZIRCAEDT[]>(
+    num_rings_ * max_points_per_ring_);
+
+  input_pointcloud_ptr_ = std::make_shared<cuda_blackboard::CudaPointCloud2>();
+  input_pointcloud_ptr_->data = cuda_blackboard::make_unique<std::uint8_t[]>(
+    num_rings_ * max_points_per_ring_ * sizeof(autoware::point_types::PointXYZIRCAEDT));
+  input_pointcloud_ptr_->width = max_points_per_ring_;
+  input_pointcloud_ptr_->height = num_rings_;
+  input_pointcloud_ptr_->point_step = sizeof(autoware::point_types::PointXYZIRCAEDT);
+  input_pointcloud_ptr_->row_step =
+    max_points_per_ring_ * sizeof(autoware::point_types::PointXYZIRCAEDT);
+  input_pointcloud_ptr_->is_dense = input_pointcloud_msg_ptr->is_dense;
+  input_pointcloud_ptr_->fields = input_pointcloud_msg_ptr->fields;
+  input_pointcloud_ptr_->header = input_pointcloud_msg_ptr->header;
+
+  RCLCPP_INFO_STREAM(
+    get_logger(), "Estimated rings: " << num_rings_
+                                      << ", max_points_per_ring: " << max_points_per_ring_
+                                      << ". This should only be done during the first iterations. "
+                                         "Otherwise, performance will be affected.");
+}
+
+bool CudaPointcloudPreprocessorNode::orderPointcloud(
+  const sensor_msgs::msg::PointCloud2::ConstSharedPtr input_pointcloud_msg_ptr)
+{
+  const autoware::point_types::PointXYZIRCAEDT * input_buffer =
+    reinterpret_cast<const autoware::point_types::PointXYZIRCAEDT *>(
+      input_pointcloud_msg_ptr->data.data());
+
+  bool ring_overflow = false;
+  bool point_overflow = false;
+
+  autoware::point_types::PointXYZIRCAEDT * buffer = host_buffer_.get();
+
+  for (std::size_t i = 0; i < input_pointcloud_msg_ptr->width * input_pointcloud_msg_ptr->height;
+       i++) {
+    const autoware::point_types::PointXYZIRCAEDT & point = input_buffer[i];
+    const std::size_t raw_ring = point.channel;
+    const std::size_t ring = raw_ring % num_rings_;
+
+    const std::size_t raw_index = next_ring_index_[ring];
+    const std::size_t index = raw_index % max_points_per_ring_;
+
+    ring_overflow |= raw_ring >= num_rings_;
+    point_overflow |= raw_index >= max_points_per_ring_;
+
+    buffer[ring * max_points_per_ring_ + index] = point;
+    next_ring_index_[ring] = raw_index + 1;
+  }
+
+  return !ring_overflow && !point_overflow;
+}
+
 void CudaPointcloudPreprocessorNode::pointcloudCallback(
+  const sensor_msgs::msg::PointCloud2::ConstSharedPtr input_pointcloud_msg_ptr)
+{
+  // NEW Cuda version
+  if (input_pointcloud_msg_ptr->width * input_pointcloud_msg_ptr->height > capacity_) {
+    capacity_ = input_pointcloud_msg_ptr->width * input_pointcloud_msg_ptr->height;
+    capacity_ = (capacity_ + 1024) / 1024 * 1024;
+    points_device_ = cuda_blackboard::make_unique<InputPointType[]>(capacity_);
+  }
+
+  double cuda_t0 = 1e-6 * std::chrono::duration_cast<std::chrono::nanoseconds>(
+                            std::chrono::high_resolution_clock::now().time_since_epoch())
+                            .count();
+
+  cudaMemcpyAsync(
+    points_device_.get(), input_pointcloud_msg_ptr->data.data(),
+    input_pointcloud_msg_ptr->width * input_pointcloud_msg_ptr->height * sizeof(InputPointType),
+    cudaMemcpyHostToDevice, stream_);
+  cudaStreamSynchronize(stream_);
+
+  double cuda_t1 = 1e-6 * std::chrono::duration_cast<std::chrono::nanoseconds>(
+                            std::chrono::high_resolution_clock::now().time_since_epoch())
+                            .count();
+
+  // Estimate the number of rings and max points per ring
+  cudaMemsetAsync(ring_index_device_.get(), 0, test_num_rings_ * sizeof(std::int32_t), stream_);
+  cudaMemsetAsync(
+    indexes_tensor_device_.get(), 0xFF,
+    test_num_rings_ * test_max_points_per_ring_ * sizeof(std::uint32_t), stream_);
+  organizeLaunch(
+    points_device_.get(), indexes_tensor_device_.get(), ring_index_device_.get(), test_num_rings_,
+    max_rings_device_.get(), test_max_points_per_ring_, max_points_per_ring_device_.get(),
+    input_pointcloud_msg_ptr->width * input_pointcloud_msg_ptr->height, stream_);
+  cudaStreamSynchronize(stream_);
+
+  std::int32_t max_ring_value;
+  std::int32_t max_points_per_ring;
+  cudaMemcpyAsync(
+    &max_ring_value, max_rings_device_.get(), sizeof(std::int32_t), cudaMemcpyDeviceToHost,
+    stream_);
+  cudaMemcpyAsync(
+    &max_points_per_ring, max_points_per_ring_device_.get(), sizeof(std::int32_t),
+    cudaMemcpyDeviceToHost, stream_);
+  cudaStreamSynchronize(stream_);
+
+  if (max_ring_value >= test_num_rings_ || max_points_per_ring > test_max_points_per_ring_) {
+    RCLCPP_WARN(
+      get_logger(), "Current ring num: %d, Current Max points per ring: %d", test_num_rings_,
+      test_max_points_per_ring_);
+    test_num_rings_ = max_ring_value + 1;
+    test_max_points_per_ring_ =
+      std::max((max_points_per_ring + 511) / 512 * 512, 512);  // Re think this equation
+    ring_index_device_ = cuda_blackboard::make_unique<std::int32_t[]>(test_num_rings_);
+    indexes_tensor_device_ =
+      cuda_blackboard::make_unique<std::uint32_t[]>(test_num_rings_ * test_max_points_per_ring_);
+    sorted_indexes_tensor_device_ =
+      cuda_blackboard::make_unique<std::uint32_t[]>(test_num_rings_ * test_max_points_per_ring_);
+    segment_offsets_device_ = cuda_blackboard::make_unique<std::int32_t[]>(test_num_rings_ + 1);
+    organized_points_device_ =
+      cuda_blackboard::make_unique<InputPointType[]>(test_num_rings_ * test_max_points_per_ring_);
+    RCLCPP_WARN(
+      get_logger(), "NEW Max ring value: %d, Max points per ring: %d", max_ring_value,
+      test_max_points_per_ring_);
+
+    std::vector<std::int32_t> segment_offsets_host(test_num_rings_ + 1);
+    for (std::size_t i = 0; i < test_num_rings_ + 1; i++) {
+      segment_offsets_host[i] = i * test_max_points_per_ring_;
+    }
+    cudaMemcpyAsync(
+      segment_offsets_device_.get(), segment_offsets_host.data(),
+      (test_num_rings_ + 1) * sizeof(std::int32_t), cudaMemcpyHostToDevice, stream_);
+
+    cudaMemsetAsync(ring_index_device_.get(), 0, test_num_rings_ * sizeof(std::int32_t), stream_);
+    cudaMemsetAsync(
+      indexes_tensor_device_.get(), 0xFF,
+      test_num_rings_ * test_max_points_per_ring_ * sizeof(std::int32_t), stream_);
+
+    sort_workspace_bytes_ = querySortWorkspace(
+      test_num_rings_ * test_max_points_per_ring_, test_num_rings_, segment_offsets_device_.get(),
+      indexes_tensor_device_.get(), sorted_indexes_tensor_device_.get());
+    sort_workspace_device_ = cuda_blackboard::make_unique<std::uint8_t[]>(sort_workspace_bytes_);
+
+    organizeLaunch(
+      points_device_.get(), indexes_tensor_device_.get(), ring_index_device_.get(), test_num_rings_,
+      max_rings_device_.get(), test_max_points_per_ring_, max_points_per_ring_device_.get(),
+      input_pointcloud_msg_ptr->width * input_pointcloud_msg_ptr->height, stream_);
+    // cudaStreamSynchronize(stream_);
+  }
+
+  /* std::vector<std::int32_t> ring_index_host(test_num_rings_);
+  cudaMemcpyAsync(
+    ring_index_host.data(), ring_index_device_.get(), test_num_rings_ * sizeof(std::int32_t),
+  cudaMemcpyDeviceToHost, stream_); cudaStreamSynchronize(stream_);
+
+  for (std::size_t i = 0; i < test_num_rings_; i++) {
+    if (ring_index_host[i] > 0) {
+      std::cout << "Ring id=" << i << " has " << ring_index_host[i] << " points" << std::endl;
+    }
+  } */
+
+  sortLaunch(
+    test_num_rings_ * test_max_points_per_ring_, test_num_rings_, segment_offsets_device_.get(),
+    indexes_tensor_device_.get(), sorted_indexes_tensor_device_.get(), sort_workspace_device_.get(),
+    sort_workspace_bytes_, stream_);
+  cudaStreamSynchronize(stream_);
+
+  /* std::vector<std::uint32_t> indexes_tensor_host(test_num_rings_ * test_max_points_per_ring_);
+  cudaMemcpyAsync(
+    indexes_tensor_host.data(), sorted_indexes_tensor_device_.get(),
+    test_num_rings_ * test_max_points_per_ring_ * sizeof(std::uint32_t), cudaMemcpyDeviceToHost,
+  stream_); cudaStreamSynchronize(stream_);
+
+  if (test_num_rings_ < 64) {
+    for (std::size_t i = 0; i < test_num_rings_; i++) {
+      std::cout << "Ring id=" << i << "\n\t";
+      for (std::size_t j = 0; j < test_max_points_per_ring_; j++) {
+        if (indexes_tensor_host[i * test_max_points_per_ring_ + j] <
+  std::numeric_limits<std::uint32_t>::max()) { std::cout << indexes_tensor_host[i *
+  test_max_points_per_ring_ + j] << " ";
+        }
+      }
+      std::cout << std::endl << std::flush;
+    }
+  } */
+
+  gatherLaunch(
+    points_device_.get(), sorted_indexes_tensor_device_.get(), organized_points_device_.get(),
+    test_num_rings_, test_max_points_per_ring_, stream_);
+  cudaStreamSynchronize(
+    stream_);  // this will not be needed in the final implementation. for now we benchmark
+
+  double cuda_t2 = 1e-6 * std::chrono::duration_cast<std::chrono::nanoseconds>(
+                            std::chrono::high_resolution_clock::now().time_since_epoch())
+                            .count();
+
+  if (iteration_ >= 0) {
+    organize_time_cuda_ms_ += (cuda_t1 - cuda_t0);
+    copy_time_cuda_ms_ += (cuda_t2 - cuda_t1);
+  }
+
+  stop_watch_ptr_->toc("processing_time", true);
+
+  // TODO(knzo25): check the pointcloud layout at least once
+
+  assert(input_pointcloud_msg_ptr->point_step == sizeof(autoware::point_types::PointXYZIRCAEDT));
+
+  if (num_rings_ == 0 || max_points_per_ring_ == 0) {
+    estimatePointcloudRingInfo(input_pointcloud_msg_ptr);
+  }
+
+  double t1 = 1e-6 * std::chrono::duration_cast<std::chrono::nanoseconds>(
+                       std::chrono::high_resolution_clock::now().time_since_epoch())
+                       .count();
+  if (!orderPointcloud(input_pointcloud_msg_ptr)) {
+    estimatePointcloudRingInfo(input_pointcloud_msg_ptr);
+    orderPointcloud(input_pointcloud_msg_ptr);
+  }
+  double t2 = 1e-6 * std::chrono::duration_cast<std::chrono::nanoseconds>(
+                       std::chrono::high_resolution_clock::now().time_since_epoch())
+                       .count();
+
+  // Copy to cuda memory
+  cudaMemcpy(
+    input_pointcloud_ptr_->data.get(), host_buffer_.get(),
+    num_rings_ * max_points_per_ring_ * sizeof(autoware::point_types::PointXYZIRCAEDT),
+    cudaMemcpyHostToDevice);
+
+  double t3 = 1e-6 * std::chrono::duration_cast<std::chrono::nanoseconds>(
+                       std::chrono::high_resolution_clock::now().time_since_epoch())
+                       .count();
+
+  if (iteration_ >= 0) {
+    organize_time_ms_ += (t2 - t1);
+    copy_time_ms_ += (t3 - t2);
+  }
+
+  // Test -> this would replace the cpu version
+  cudaMemcpyAsync(
+    input_pointcloud_ptr_->data.get(), organized_points_device_.get(),
+    test_num_rings_ * test_max_points_per_ring_ * sizeof(autoware::point_types::PointXYZIRCAEDT),
+    cudaMemcpyHostToDevice, stream_);
+  input_pointcloud_ptr_->height = test_num_rings_;
+  input_pointcloud_ptr_->width = test_max_points_per_ring_;
+  cudaStreamSynchronize(stream_);
+
+  // TEST CODE
+  {
+    std::vector<autoware::point_types::PointXYZIRCAEDT> host_buffer_test(
+      test_num_rings_ * test_max_points_per_ring_);
+    cudaMemcpyAsync(
+      host_buffer_test.data(), organized_points_device_.get(),
+      test_num_rings_ * test_max_points_per_ring_ * sizeof(autoware::point_types::PointXYZIRCAEDT),
+      cudaMemcpyDeviceToHost, stream_);
+    cudaStreamSynchronize(stream_);
+
+    if (
+      test_num_rings_ > 1 && test_max_points_per_ring_ == max_points_per_ring_ &&
+      test_num_rings_ == num_rings_) {
+      for (std::size_t i = 0; i < num_rings_; i++) {
+        for (std::size_t j = 0; j < max_points_per_ring_; j++) {
+          // Compare the implementations
+          if (
+            (host_buffer_.get()[i * max_points_per_ring_ + j].x !=
+               host_buffer_test[i * max_points_per_ring_ + j].x ||
+             host_buffer_.get()[i * max_points_per_ring_ + j].y !=
+               host_buffer_test[i * max_points_per_ring_ + j].y ||
+             host_buffer_.get()[i * max_points_per_ring_ + j].z !=
+               host_buffer_test[i * max_points_per_ring_ + j].z) &&
+            host_buffer_test[i * max_points_per_ring_ + j].distance > 0) {
+            RCLCPP_ERROR(get_logger(), "Mismatch at ring=%d, point=%d", i, j);
+            RCLCPP_ERROR(
+              get_logger(), "\thost_buffer_test: x=%.2f y=%.2f z=%.2f channel=%d distance=%.2f",
+              host_buffer_test[i * max_points_per_ring_ + j].x,
+              host_buffer_test[i * max_points_per_ring_ + j].y,
+              host_buffer_test[i * max_points_per_ring_ + j].z,
+              host_buffer_test[i * max_points_per_ring_ + j].channel,
+              host_buffer_test[i * max_points_per_ring_ + j].distance);
+            RCLCPP_ERROR(
+              get_logger(), "\thost_buffer: x=%.2f y=%.2f z=%.2f channel=%d distance=%.2f",
+              host_buffer_.get()[i * max_points_per_ring_ + j].x,
+              host_buffer_.get()[i * max_points_per_ring_ + j].y,
+              host_buffer_.get()[i * max_points_per_ring_ + j].z,
+              host_buffer_.get()[i * max_points_per_ring_ + j].channel,
+              host_buffer_.get()[i * max_points_per_ring_ + j].distance);
+          }
+        }
+      }
+    }
+  }
+
+  input_pointcloud_ptr_->header = input_pointcloud_msg_ptr->header;
+
+  cudaPointcloudCallback(input_pointcloud_ptr_);
+
+  if (debug_publisher_) {
+    const double processing_time_ms = stop_watch_ptr_->toc("processing_time", true);
+    debug_publisher_->publish<tier4_debug_msgs::msg::Float64Stamped>(
+      "debug/processing_time_ms", processing_time_ms);
+
+    double now_stamp_seconds = rclcpp::Time(this->get_clock()->now()).seconds();
+    double cloud_stamp_seconds = rclcpp::Time(input_pointcloud_msg_ptr->header.stamp).seconds();
+
+    debug_publisher_->publish<tier4_debug_msgs::msg::Float64Stamped>(
+      "debug/latency_ms", 1000.f * (now_stamp_seconds - cloud_stamp_seconds));
+  }
+
+  // Clear indexes
+  std::fill(next_ring_index_.begin(), next_ring_index_.end(), 0);
+
+  // Clear pointcloud buffer
+  auto host_buffer = host_buffer_.get();
+  for (std::size_t i = 0; i < num_rings_ * max_points_per_ring_; i++) {
+    host_buffer[i] = autoware::point_types::PointXYZIRCAEDT{};
+  }
+}
+
+void CudaPointcloudPreprocessorNode::cudaPointcloudCallback(
   const std::shared_ptr<const cuda_blackboard::CudaPointCloud2> input_pointcloud_msg_ptr)
 {
+  double t1 = 1e-6 * std::chrono::duration_cast<std::chrono::nanoseconds>(
+                       std::chrono::high_resolution_clock::now().time_since_epoch())
+                       .count();
   static_assert(
     sizeof(InputPointType) == sizeof(autoware::point_types::PointXYZIRCAEDT),
     "PointStruct and PointXYZIRCAEDT must have the same size");
@@ -237,10 +617,42 @@ void CudaPointcloudPreprocessorNode::pointcloudCallback(
     return;
   }
 
+  double t2 = 1e-6 * std::chrono::duration_cast<std::chrono::nanoseconds>(
+                       std::chrono::high_resolution_clock::now().time_since_epoch())
+                       .count();
+
   auto output_pointcloud_ptr = cuda_pointcloud_preprocessor_->process(
     *input_pointcloud_msg_ptr, transform_msg, twist_queue_, angular_velocity_queue_);
   output_pointcloud_ptr->header.frame_id = base_frame_;
 
+  double t3 = 1e-6 * std::chrono::duration_cast<std::chrono::nanoseconds>(
+                       std::chrono::high_resolution_clock::now().time_since_epoch())
+                       .count();
+
+  if (iteration_ >= 0) {
+    processing_time_ms_ += (t2 - t1);
+    kernel_time_ms_ += (t3 - t2);
+  }
+  iteration_++;
+
+  if (num_rings_ > 64) {
+    RCLCPP_INFO(
+      get_logger(),
+      "Organize time: %f ms, Copy time: %f ms, Process time: %f ms, Kernel time: %f ms Total time: "
+      "%f ms",
+      organize_time_ms_ / iteration_, copy_time_ms_ / iteration_, processing_time_ms_ / iteration_,
+      kernel_time_ms_ / iteration_,
+      (organize_time_ms_ + copy_time_ms_ + processing_time_ms_ + kernel_time_ms_) / iteration_);
+    RCLCPP_INFO(
+      get_logger(),
+      "\t Cuda Organize time: %f ms, Cuda Copy time: %f ms, Cuda Process time: %f ms, Cuda Kernel "
+      "time: %f ms Total time: %f ms",
+      organize_time_cuda_ms_ / iteration_, copy_time_cuda_ms_ / iteration_,
+      processing_time_ms_ / iteration_, kernel_time_ms_ / iteration_,
+      (organize_time_cuda_ms_ + copy_time_cuda_ms_ + processing_time_ms_ + kernel_time_ms_) /
+        iteration_);
+  }
+
   // Publish
   pub_->publish(std::move(output_pointcloud_ptr));
 

From 5482f9c4cd7d9764e80eb937ddf73e2ca702d76e Mon Sep 17 00:00:00 2001
From: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>
Date: Wed, 5 Feb 2025 15:47:33 +0900
Subject: [PATCH 26/38] feat: reimplemented the organized pointcloud adapter in
 cuda. the only bottleneck is the H->D copy

Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>
---
 .../CMakeLists.txt                            |  13 -
 ...cuda_organized_pointcloud_adapter_node.hpp |  76 ---
 .../cuda_pointcloud_preprocessor.hpp          |  68 +--
 .../cuda_pointcloud_preprocessor_node.hpp     |  48 +-
 ...cuda_organized_pointcloud_adapter_node.cpp | 200 --------
 .../cuda_pointcloud_preprocessor.cu           | 434 ++++++++++++------
 .../cuda_pointcloud_preprocessor_node.cpp     | 422 +----------------
 7 files changed, 331 insertions(+), 930 deletions(-)
 delete mode 100644 sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.hpp
 delete mode 100644 sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.cpp

diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/CMakeLists.txt b/sensing/autoware_cuda_pointcloud_preprocessor/CMakeLists.txt
index d09f787a3b277..8abff042d6a3a 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/CMakeLists.txt
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/CMakeLists.txt
@@ -54,14 +54,6 @@ target_include_directories(cuda_pointcloud_preprocessor_lib SYSTEM PRIVATE
 )
 
 # Targets
-ament_auto_add_library(cuda_organized_pointcloud_adapter SHARED
-  src/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.cpp
-)
-
-target_link_libraries(cuda_organized_pointcloud_adapter
-  ${CUDA_LIBRARIES}
-)
-
 ament_auto_add_library(cuda_pointcloud_preprocessor SHARED
   src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp
 )
@@ -71,11 +63,6 @@ target_link_libraries(cuda_pointcloud_preprocessor
   cuda_pointcloud_preprocessor_lib
 )
 
-rclcpp_components_register_node(cuda_organized_pointcloud_adapter
-  PLUGIN "autoware::cuda_organized_pointcloud_adapter::CudaOrganizedPointcloudAdapterNode"
-  EXECUTABLE cuda_organized_pointcloud_adapter_node
-)
-
 rclcpp_components_register_node(cuda_pointcloud_preprocessor
   PLUGIN "autoware::cuda_pointcloud_preprocessor::CudaPointcloudPreprocessorNode"
   EXECUTABLE cuda_pointcloud_preprocessor_node
diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.hpp
deleted file mode 100644
index 7a31442f26f1d..0000000000000
--- a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.hpp
+++ /dev/null
@@ -1,76 +0,0 @@
-// Copyright 2024 TIER IV, Inc.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef AUTOWARE__CUDA_ORGANIZED_POINTCLOUD_ADAPTER__CUDA_ORGANIZED_POINTCLOUD_ADAPTER_NODE_HPP_
-#define AUTOWARE__CUDA_ORGANIZED_POINTCLOUD_ADAPTER__CUDA_ORGANIZED_POINTCLOUD_ADAPTER_NODE_HPP_
-
-#include <autoware/point_types/types.hpp>
-#include <autoware/universe_utils/ros/debug_publisher.hpp>
-#include <autoware/universe_utils/system/stop_watch.hpp>
-#include <cuda_blackboard/cuda_adaptation.hpp>
-#include <cuda_blackboard/cuda_blackboard_publisher.hpp>
-#include <cuda_blackboard/cuda_pointcloud2.hpp>
-#include <rclcpp/rclcpp.hpp>
-
-#include <sensor_msgs/msg/point_cloud2.hpp>
-
-#include <tf2/transform_datatypes.h>
-#include <tf2_ros/buffer.h>
-#include <tf2_ros/transform_listener.h>
-
-#include <chrono>
-#include <deque>
-#include <memory>
-#include <vector>
-
-namespace autoware::cuda_organized_pointcloud_adapter
-{
-
-class CudaOrganizedPointcloudAdapterNode : public rclcpp::Node
-{
-public:
-  explicit CudaOrganizedPointcloudAdapterNode(const rclcpp::NodeOptions & node_options);
-  ~CudaOrganizedPointcloudAdapterNode() = default;
-
-private:
-  void estimatePointcloudRingInfo(
-    const sensor_msgs::msg::PointCloud2::ConstSharedPtr input_pointcloud_msg_ptr);
-
-  bool orderPointcloud(
-    const sensor_msgs::msg::PointCloud2::ConstSharedPtr input_pointcloud_msg_ptr);
-
-  // Callback
-  void pointcloudCallback(
-    const sensor_msgs::msg::PointCloud2::ConstSharedPtr input_pointcloud_msg_ptr);
-
-  // Subscriber
-  rclcpp::Subscription<sensor_msgs::msg::PointCloud2>::SharedPtr pointcloud_sub_{};
-
-  // CUDA pub
-  std::unique_ptr<cuda_blackboard::CudaBlackboardPublisher<cuda_blackboard::CudaPointCloud2>> pub_;
-
-  std::size_t num_rings_{0};
-  std::size_t max_points_per_ring_{0};
-
-  std::vector<std::size_t> next_ring_index_;
-  cuda_blackboard::HostUniquePtr<autoware::point_types::PointXYZIRCAEDT[]> host_buffer_;
-  cuda_blackboard::CudaUniquePtr<std::uint8_t[]> device_buffer_;
-
-  std::unique_ptr<autoware::universe_utils::StopWatch<std::chrono::milliseconds>> stop_watch_ptr_;
-  std::unique_ptr<autoware::universe_utils::DebugPublisher> debug_publisher_;
-};
-
-}  // namespace autoware::cuda_organized_pointcloud_adapter
-
-#endif  // AUTOWARE__CUDA_ORGANIZED_POINTCLOUD_ADAPTER__CUDA_ORGANIZED_POINTCLOUD_ADAPTER_NODE_HPP_
diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp
index 3dbc968d9376a..7c4c0113d3618 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp
@@ -35,26 +35,6 @@
 namespace autoware::cuda_pointcloud_preprocessor
 {
 
-void organizeLaunch(
-  const InputPointType * __restrict__ input_points, std::uint32_t * index_tensor,
-  std::int32_t * ring_indexes, std::int32_t initial_max_rings, std::int32_t * output_max_rings,
-  std::int32_t initial_max_points_per_ring, std::int32_t * output_max_points_per_ring,
-  int num_points, cudaStream_t & stream);
-
-std::size_t querySortWorkspace(
-  int num_items, int num_segments, int * d_offsets, std::uint32_t * d_keys_in,
-  std::uint32_t * d_keys_out);
-
-void sortLaunch(
-  int num_items, int num_segments, int * d_offsets, std::uint32_t * d_keys_in,
-  std::uint32_t * d_keys_out, std::uint8_t * d_temp_storage, std::size_t temp_storage_bytes,
-  cudaStream_t & stream);  // e.g., [-, -, -, -, -, -, -]
-
-void gatherLaunch(
-  const InputPointType * __restrict__ input_points, const std::uint32_t * __restrict__ index_tensor,
-  InputPointType * __restrict__ output_points, int num_rings, int max_points_per_ring,
-  cudaStream_t & stream);
-
 struct TwistStruct2D
 {
   float cum_x;
@@ -62,8 +42,8 @@ struct TwistStruct2D
   float cum_theta;
   float cum_cos_theta;
   float cum_sin_theta;
-  uint32_t last_stamp_nsec;  // relative to the start of the pointcloud
-  uint32_t stamp_nsec;       // relative to the start of the pointcloud
+  std::uint32_t last_stamp_nsec;  // relative to the start of the pointcloud
+  std::uint32_t stamp_nsec;       // relative to the start of the pointcloud
   float v_x;
   float v_theta;
 };
@@ -71,8 +51,8 @@ struct TwistStruct2D
 struct TwistStruct3D
 {
   float cum_transform_buffer[16];
-  uint32_t last_stamp_nsec;  // relative to the start of the pointcloud
-  uint32_t stamp_nsec;       // relative to the start of the pointcloud
+  std::uint32_t last_stamp_nsec;  // relative to the start of the pointcloud
+  std::uint32_t stamp_nsec;       // relative to the start of the pointcloud
   float v[3];
   float w[3];
 };
@@ -143,29 +123,38 @@ class CudaPointcloudPreprocessor
   void preallocateOutput();
 
   std::unique_ptr<cuda_blackboard::CudaPointCloud2> process(
-    const cuda_blackboard::CudaPointCloud2 & input_pointcloud_msg,
+    const sensor_msgs::msg::PointCloud2::ConstSharedPtr input_pointcloud_msg_ptr,
     const geometry_msgs::msg::TransformStamped & transform_msg,
     const std::deque<geometry_msgs::msg::TwistWithCovarianceStamped> & twist_queue,
-    const std::deque<geometry_msgs::msg::Vector3Stamped> & angular_velocity_queue);
+    const std::deque<geometry_msgs::msg::Vector3Stamped> & angular_velocity_queue,
+    const std::uint32_t first_point_rel_stamp_nsec);
 
 private:
   void setupTwist2DStructs(
-    const cuda_blackboard::CudaPointCloud2 & input_pointcloud_msg,
     const std::deque<geometry_msgs::msg::TwistWithCovarianceStamped> & twist_queue,
-    const std::deque<geometry_msgs::msg::Vector3Stamped> & angular_velocity_queue);
+    const std::deque<geometry_msgs::msg::Vector3Stamped> & angular_velocity_queue,
+    const std::uint64_t pointcloud_stamp_nsec, const std::uint32_t first_point_rel_stamp_nsec);
 
   void setupTwist3DStructs(
-    const cuda_blackboard::CudaPointCloud2 & input_pointcloud_msg,
     const std::deque<geometry_msgs::msg::TwistWithCovarianceStamped> & twist_queue,
-    const std::deque<geometry_msgs::msg::Vector3Stamped> & angular_velocity_queue);
+    const std::deque<geometry_msgs::msg::Vector3Stamped> & angular_velocity_queue,
+    const std::uint64_t pointcloud_stamp_nsec, const std::uint32_t first_point_rel_stamp_nsec);
+
+  std::size_t querySortWorkspace(
+    int num_items, int num_segments, int * offsets_device, std::uint32_t * keys_in_device,
+    std::uint32_t * keys_out_device);
+
+  void organizePointcloud();
 
   CropBoxParameters self_crop_box_parameters_{};
   CropBoxParameters mirror_crop_box_parameters_{};
   RingOutlierFilterParameters ring_outlier_parameters_{};
   bool use_3d_undistortion_{false};
 
-  int max_rings_{};
+  int num_rings_{};
   int max_points_per_ring_{};
+  std::size_t num_raw_points_{};
+  std::size_t num_organized_points_{};
 
   std::vector<sensor_msgs::msg::PointField> point_fields_{};
   std::unique_ptr<cuda_blackboard::CudaPointCloud2> output_pointcloud_ptr_{};
@@ -174,6 +163,21 @@ class CudaPointcloudPreprocessor
   int max_blocks_per_grid_{};
   const int threads_per_block_{256};
   cudaMemPool_t device_memory_pool_;
+
+  // Organizing buffers
+  thrust::device_vector<InputPointType> device_input_points_;
+  thrust::device_vector<InputPointType> device_organized_points_;
+  thrust::device_vector<std::int32_t> device_ring_index_;
+  thrust::device_vector<std::uint32_t> device_indexes_tensor_;
+  thrust::device_vector<std::uint32_t> device_sorted_indexes_tensor_;
+  thrust::device_vector<std::int32_t> device_segment_offsets_;
+  thrust::device_vector<std::int32_t> device_max_ring_;
+  thrust::device_vector<std::int32_t> device_max_points_per_ring_;
+
+  thrust::device_vector<std::uint8_t> device_sort_workspace_;
+  std::size_t sort_workspace_bytes_{0};
+
+  // Pointcloud preprocessing buffers
   thrust::device_vector<InputPointType> device_transformed_points_{};
   thrust::device_vector<OutputPointType> device_output_points_{};
   thrust::device_vector<std::uint32_t> device_crop_mask_{};
@@ -184,6 +188,8 @@ class CudaPointcloudPreprocessor
 
   thrust::device_vector<CropBoxParameters> host_crop_box_structs_{};
   thrust::device_vector<CropBoxParameters> device_crop_box_structs_{};
+  std::vector<TwistStruct2D> host_twist_2d_structs_;
+  std::vector<TwistStruct3D> host_twist_3d_structs_;
 };
 
 }  // namespace autoware::cuda_pointcloud_preprocessor
diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp
index 6a45a79b63c65..afc2f6cd74de0 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp
@@ -36,7 +36,6 @@
 #include <tf2_ros/buffer.h>
 #include <tf2_ros/transform_listener.h>
 
-#include <chrono>
 #include <deque>
 #include <memory>
 #include <vector>
@@ -81,12 +80,6 @@ class CudaPointcloudPreprocessorNode : public rclcpp::Node
     const std::string & target_frame, const std::string & source_frame,
     tf2::Transform * tf2_transform_ptr);
 
-  void estimatePointcloudRingInfo(
-    const sensor_msgs::msg::PointCloud2::ConstSharedPtr input_pointcloud_msg_ptr);
-
-  bool orderPointcloud(
-    const sensor_msgs::msg::PointCloud2::ConstSharedPtr input_pointcloud_msg_ptr);
-
   // Callback
   void pointcloudCallback(
     const sensor_msgs::msg::PointCloud2::ConstSharedPtr input_pointcloud_msg_ptr);
@@ -103,55 +96,18 @@ class CudaPointcloudPreprocessorNode : public rclcpp::Node
   std::deque<geometry_msgs::msg::TwistWithCovarianceStamped> twist_queue_;
   std::deque<geometry_msgs::msg::Vector3Stamped> angular_velocity_queue_;
 
-  std::size_t num_rings_{0};
-  std::size_t max_points_per_ring_{0};
-  std::vector<std::size_t> next_ring_index_;
-
-  // Subscriber
+  // Subscribers
   rclcpp::Subscription<geometry_msgs::msg::TwistWithCovarianceStamped>::SharedPtr twist_sub_{};
   rclcpp::Subscription<sensor_msgs::msg::Imu>::SharedPtr imu_sub_{};
   rclcpp::Subscription<sensor_msgs::msg::PointCloud2>::SharedPtr pointcloud_sub_{};
 
-  cuda_blackboard::CudaUniquePtr<InputPointType[]> points_device_;
-  cuda_blackboard::CudaUniquePtr<InputPointType[]> organized_points_device_;
-  cuda_blackboard::CudaUniquePtr<std::int32_t[]> ring_index_device_;
-  cuda_blackboard::CudaUniquePtr<std::uint32_t[]> indexes_tensor_device_;
-  cuda_blackboard::CudaUniquePtr<std::uint32_t[]> sorted_indexes_tensor_device_;
-  cuda_blackboard::CudaUniquePtr<std::int32_t[]> segment_offsets_device_;
-  cuda_blackboard::CudaUniquePtr<std::int32_t> max_rings_device_;
-  cuda_blackboard::CudaUniquePtr<std::int32_t> max_points_per_ring_device_;
-
-  cuda_blackboard::CudaUniquePtr<std::uint8_t[]> sort_workspace_device_;
-  std::size_t sort_workspace_bytes_{0};
-
-  std::int32_t test_num_rings_{0};
-  std::int32_t test_max_points_per_ring_{0};
-  cudaStream_t stream_;
-  std::int32_t capacity_{0};
-
-  cuda_blackboard::HostUniquePtr<autoware::point_types::PointXYZIRCAEDT[]> host_buffer_;
-  // cuda_blackboard::CudaUniquePtr<std::uint8_t[]> device_buffer_; TODO(knzo25) replaced by a
-  // CudaPointCloud
-  std::shared_ptr<cuda_blackboard::CudaPointCloud2> input_pointcloud_ptr_;
-
-  // CUDA pub & sub
+  // CUDA pub
   std::unique_ptr<cuda_blackboard::CudaBlackboardPublisher<cuda_blackboard::CudaPointCloud2>> pub_;
-  // std::unique_ptr<cuda_blackboard::CudaBlackboardSubscriber<cuda_blackboard::CudaPointCloud2>>
-  // sub_;
 
   std::unique_ptr<CudaPointcloudPreprocessor> cuda_pointcloud_preprocessor_;
 
   std::unique_ptr<autoware::universe_utils::StopWatch<std::chrono::milliseconds>> stop_watch_ptr_;
   std::unique_ptr<autoware::universe_utils::DebugPublisher> debug_publisher_;
-
-  double organize_time_cuda_ms_{0.0};
-  double copy_time_cuda_ms_{0.0};
-
-  double organize_time_ms_{0.0};
-  double copy_time_ms_{0.0};
-  double processing_time_ms_{0.0};
-  double kernel_time_ms_{0.0};
-  int iteration_{0};
 };
 
 }  // namespace autoware::cuda_pointcloud_preprocessor
diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.cpp b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.cpp
deleted file mode 100644
index b7eeccafa2d5a..0000000000000
--- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.cpp
+++ /dev/null
@@ -1,200 +0,0 @@
-// Copyright 2024 TIER IV, Inc.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "autoware/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.hpp"
-
-#include <cuda_runtime.h>
-
-#include <vector>
-
-namespace autoware::cuda_organized_pointcloud_adapter
-{
-using sensor_msgs::msg::PointCloud2;
-
-CudaOrganizedPointcloudAdapterNode::CudaOrganizedPointcloudAdapterNode(
-  const rclcpp::NodeOptions & node_options)
-: Node("cuda_organized_pointcloud_adapter", node_options)
-{
-  pub_ =
-    std::make_unique<cuda_blackboard::CudaBlackboardPublisher<cuda_blackboard::CudaPointCloud2>>(
-      *this, "~/output/pointcloud");
-
-  rclcpp::SubscriptionOptions sub_options;
-  sub_options.use_intra_process_comm = rclcpp::IntraProcessSetting::Enable;
-
-  pointcloud_sub_ = this->create_subscription<sensor_msgs::msg::PointCloud2>(
-    "~/input/pointcloud", rclcpp::SensorDataQoS{}.keep_last(1),
-    std::bind(&CudaOrganizedPointcloudAdapterNode::pointcloudCallback, this, std::placeholders::_1),
-    sub_options);
-
-  // initialize debug tool
-  {
-    using autoware::universe_utils::DebugPublisher;
-    using autoware::universe_utils::StopWatch;
-
-    stop_watch_ptr_ = std::make_unique<StopWatch<std::chrono::milliseconds>>();
-    debug_publisher_ = std::make_unique<DebugPublisher>(this, "cuda_organized_pointcloud_adapter");
-    stop_watch_ptr_->tic("processing_time");
-  }
-}
-
-void CudaOrganizedPointcloudAdapterNode::estimatePointcloudRingInfo(
-  const sensor_msgs::msg::PointCloud2::ConstSharedPtr input_pointcloud_msg_ptr)
-{
-  const autoware::point_types::PointXYZIRCAEDT * input_buffer =
-    reinterpret_cast<const autoware::point_types::PointXYZIRCAEDT *>(
-      input_pointcloud_msg_ptr->data.data());
-
-  std::size_t max_ring = 0;
-
-  for (std::size_t i = 0; i < input_pointcloud_msg_ptr->width * input_pointcloud_msg_ptr->height;
-       i++) {
-    const autoware::point_types::PointXYZIRCAEDT & point = input_buffer[i];
-    const std::size_t ring = static_cast<std::size_t>(point.channel);
-    max_ring = std::max(max_ring, ring);
-  }
-
-  // Set max rings to the next power of two
-  num_rings_ = std::pow(2, std::ceil(std::log2(max_ring + 1)));
-  num_rings_ = std::max(num_rings_, static_cast<std::size_t>(16));
-  std::vector<std::size_t> ring_points(num_rings_, 0);
-
-  for (std::size_t i = 0; i < input_pointcloud_msg_ptr->width * input_pointcloud_msg_ptr->height;
-       i++) {
-    const autoware::point_types::PointXYZIRCAEDT & point = input_buffer[i];
-    const std::size_t ring = point.channel;
-    ring_points[ring]++;
-  }
-
-  // Set max points per ring to the next multiple of 512
-  max_points_per_ring_ = *std::max_element(ring_points.begin(), ring_points.end());
-  max_points_per_ring_ = std::max(max_points_per_ring_, static_cast<std::size_t>(512));
-  max_points_per_ring_ = (max_points_per_ring_ + 511) / 512 * 512;
-
-  next_ring_index_.resize(num_rings_);
-  std::fill(next_ring_index_.begin(), next_ring_index_.end(), 0);
-  host_buffer_ = cuda_blackboard::make_host_unique<autoware::point_types::PointXYZIRCAEDT[]>(
-    num_rings_ * max_points_per_ring_);
-
-  device_buffer_ = cuda_blackboard::make_unique<std::uint8_t[]>(
-    num_rings_ * max_points_per_ring_ * sizeof(autoware::point_types::PointXYZIRCAEDT));
-
-  RCLCPP_INFO_STREAM(
-    get_logger(), "Estimated rings: " << num_rings_
-                                      << ", max_points_per_ring: " << max_points_per_ring_
-                                      << ". This should only be done during the first iterations. "
-                                         "Otherwise, performance will be affected.");
-}
-
-bool CudaOrganizedPointcloudAdapterNode::orderPointcloud(
-  const sensor_msgs::msg::PointCloud2::ConstSharedPtr input_pointcloud_msg_ptr)
-{
-  const autoware::point_types::PointXYZIRCAEDT * input_buffer =
-    reinterpret_cast<const autoware::point_types::PointXYZIRCAEDT *>(
-      input_pointcloud_msg_ptr->data.data());
-
-  bool ring_overflow = false;
-  bool point_overflow = false;
-
-  autoware::point_types::PointXYZIRCAEDT * buffer = host_buffer_.get();
-
-  for (std::size_t i = 0; i < input_pointcloud_msg_ptr->width * input_pointcloud_msg_ptr->height;
-       i++) {
-    const autoware::point_types::PointXYZIRCAEDT & point = input_buffer[i];
-    const std::size_t raw_ring = point.channel;
-    const std::size_t ring = raw_ring % num_rings_;
-
-    const std::size_t raw_index = next_ring_index_[ring];
-    const std::size_t index = raw_index % max_points_per_ring_;
-
-    ring_overflow |= raw_ring >= num_rings_;
-    point_overflow |= raw_index >= max_points_per_ring_;
-
-    buffer[ring * max_points_per_ring_ + index] = point;
-    next_ring_index_[ring] = raw_index + 1;
-  }
-
-  return !ring_overflow && !point_overflow;
-}
-
-void CudaOrganizedPointcloudAdapterNode::pointcloudCallback(
-  const sensor_msgs::msg::PointCloud2::ConstSharedPtr input_pointcloud_msg_ptr)
-{
-  stop_watch_ptr_->toc("processing_time", true);
-
-  // TODO(knzo25): check the pointcloud layout at least once
-
-  assert(input_pointcloud_msg_ptr->point_step == sizeof(autoware::point_types::PointXYZIRCAEDT));
-
-  if (num_rings_ == 0 || max_points_per_ring_ == 0) {
-    estimatePointcloudRingInfo(input_pointcloud_msg_ptr);
-  }
-
-  if (!orderPointcloud(input_pointcloud_msg_ptr)) {
-    estimatePointcloudRingInfo(input_pointcloud_msg_ptr);
-    orderPointcloud(input_pointcloud_msg_ptr);
-  }
-
-  // Copy to cuda memory
-  cudaMemcpy(
-    device_buffer_.get(), host_buffer_.get(),
-    num_rings_ * max_points_per_ring_ * sizeof(autoware::point_types::PointXYZIRCAEDT),
-    cudaMemcpyHostToDevice);
-
-  auto cuda_pointcloud_msg_ptr = std::make_unique<cuda_blackboard::CudaPointCloud2>();
-  cuda_pointcloud_msg_ptr->fields = input_pointcloud_msg_ptr->fields;
-  cuda_pointcloud_msg_ptr->width = max_points_per_ring_;
-  cuda_pointcloud_msg_ptr->height = num_rings_;
-  cuda_pointcloud_msg_ptr->point_step = sizeof(autoware::point_types::PointXYZIRCAEDT);
-  cuda_pointcloud_msg_ptr->row_step =
-    max_points_per_ring_ * sizeof(autoware::point_types::PointXYZIRCAEDT);
-  cuda_pointcloud_msg_ptr->data =
-    std::move(device_buffer_); /*reinterpret_cast<uint8_t *>(device_buffer_)*/
-  ;
-  cuda_pointcloud_msg_ptr->is_dense = input_pointcloud_msg_ptr->is_dense;
-  cuda_pointcloud_msg_ptr->header = input_pointcloud_msg_ptr->header;
-
-  pub_->publish(std::move(cuda_pointcloud_msg_ptr));
-
-  if (debug_publisher_) {
-    const double processing_time_ms = stop_watch_ptr_->toc("processing_time", true);
-    debug_publisher_->publish<tier4_debug_msgs::msg::Float64Stamped>(
-      "debug/processing_time_ms", processing_time_ms);
-
-    double now_stamp_seconds = rclcpp::Time(this->get_clock()->now()).seconds();
-    double cloud_stamp_seconds = rclcpp::Time(input_pointcloud_msg_ptr->header.stamp).seconds();
-
-    debug_publisher_->publish<tier4_debug_msgs::msg::Float64Stamped>(
-      "debug/latency_ms", 1000.f * (now_stamp_seconds - cloud_stamp_seconds));
-  }
-
-  // Allocate cuda memory
-  device_buffer_ = cuda_blackboard::make_unique<std::uint8_t[]>(
-    num_rings_ * max_points_per_ring_ * sizeof(autoware::point_types::PointXYZIRCAEDT));
-
-  // Clear indexes
-  std::fill(next_ring_index_.begin(), next_ring_index_.end(), 0);
-
-  // Clear pointcloud buffer
-  auto host_buffer = host_buffer_.get();
-  for (std::size_t i = 0; i < num_rings_ * max_points_per_ring_; i++) {
-    host_buffer[i] = autoware::point_types::PointXYZIRCAEDT{};
-  }
-}
-
-}  // namespace autoware::cuda_organized_pointcloud_adapter
-
-#include "rclcpp_components/register_node_macro.hpp"
-RCLCPP_COMPONENTS_REGISTER_NODE(
-  autoware::cuda_organized_pointcloud_adapter::CudaOrganizedPointcloudAdapterNode)
diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu
index d99d384b0c9ab..d49f6eeccce42 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu
@@ -23,10 +23,8 @@
 #include <cuda_runtime.h>
 #include <tf2/utils.h>
 #include <thrust/execution_policy.h>
-#include <thrust/host_vector.h>
 #include <thrust/reduce.h>
 #include <thrust/scan.h>
-// #include <cub/device/device_segmented_radix_sort.cuh>
 
 namespace autoware::cuda_pointcloud_preprocessor
 {
@@ -100,47 +98,6 @@ __global__ void organizeKernel(
   index_tensor[ring * initial_max_points_per_ring + next_offset] = idx;
 }
 
-void organizeLaunch(
-  const InputPointType * __restrict__ input_points, std::uint32_t * index_tensor,
-  std::int32_t * ring_indexes, std::int32_t initial_max_rings, std::int32_t * output_max_rings,
-  std::int32_t initial_max_points_per_ring, std::int32_t * output_max_points_per_ring,
-  int num_points, cudaStream_t & stream)
-{
-  int threads_per_block = 256;
-  int blocks_per_grid = (num_points + threads_per_block - 1) / threads_per_block;
-  organizeKernel<<<blocks_per_grid, threads_per_block>>>(
-    input_points, index_tensor, ring_indexes, initial_max_rings, output_max_rings,
-    initial_max_points_per_ring, output_max_points_per_ring, num_points);
-}
-
-std::size_t querySortWorkspace(
-  int num_items, int num_segments, int * d_offsets, std::uint32_t * d_keys_in,
-  std::uint32_t * d_keys_out)
-{
-  // Determine temporary device storage requirements
-  void * d_temp_storage = nullptr;
-  size_t temp_storage_bytes = 0;
-  cub::DeviceSegmentedRadixSort::SortKeys(
-    d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items, num_segments, d_offsets,
-    d_offsets + 1);
-
-  return temp_storage_bytes;
-}
-
-void sortLaunch(
-  int num_items, int num_segments, int * d_offsets, std::uint32_t * d_keys_in,
-  std::uint32_t * d_keys_out, std::uint8_t * d_temp_storage, std::size_t temp_storage_bytes,
-  cudaStream_t & stream)
-{
-  // Allocate temporary storage
-  // cudaMalloc(&d_temp_storage, temp_storage_bytes);
-
-  // Run sorting operation
-  cub::DeviceSegmentedRadixSort::SortKeys(
-    reinterpret_cast<void *>(d_temp_storage), temp_storage_bytes, d_keys_in, d_keys_out, num_items,
-    num_segments, d_offsets, d_offsets + 1, 0, sizeof(std::uint32_t) * 8, stream);
-}
-
 __global__ void gatherKernel(
   const InputPointType * __restrict__ input_points, const std::uint32_t * __restrict__ index_tensor,
   InputPointType * __restrict__ output_points, int num_rings, int max_points_per_ring)
@@ -162,18 +119,6 @@ __global__ void gatherKernel(
   }
 }
 
-void gatherLaunch(
-  const InputPointType * __restrict__ input_points, const std::uint32_t * __restrict__ index_tensor,
-  InputPointType * __restrict__ output_points, int num_rings, int max_points_per_ring,
-  cudaStream_t & stream)
-{
-  int threads_per_block = 256;
-  int blocks_per_grid =
-    (num_rings * max_points_per_ring + threads_per_block - 1) / threads_per_block;
-  gatherKernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(
-    input_points, index_tensor, output_points, num_rings, max_points_per_ring);
-}
-
 __global__ void transformPointsKernel(
   const InputPointType * __restrict__ input_points, InputPointType * output_points, int num_points,
   TransformStruct transform)
@@ -221,8 +166,8 @@ __global__ void cropBoxKernel(
 }
 
 __global__ void combineMasksKernel(
-  const uint32_t * __restrict__ mask1, const uint32_t * __restrict__ mask2, int num_points,
-  uint32_t * __restrict__ output_mask)
+  const std::uint32_t * __restrict__ mask1, const std::uint32_t * __restrict__ mask2,
+  int num_points, std::uint32_t * __restrict__ output_mask)
 {
   int idx = blockIdx.x * blockDim.x + threadIdx.x;
   if (idx < num_points) {
@@ -231,7 +176,7 @@ __global__ void combineMasksKernel(
 }
 
 __global__ void extractInputPointIndicesKernel(
-  InputPointType * input_points, uint32_t * masks, uint32_t * indices, int num_points,
+  InputPointType * input_points, std::uint32_t * masks, std::uint32_t * indices, int num_points,
   InputPointType * output_points)
 {
   int idx = blockIdx.x * blockDim.x + threadIdx.x;
@@ -241,7 +186,7 @@ __global__ void extractInputPointIndicesKernel(
 }
 
 __global__ void extractOutputPointIndicesKernel(
-  OutputPointType * input_points, uint32_t * masks, uint32_t * indices, int num_points,
+  OutputPointType * input_points, std::uint32_t * masks, std::uint32_t * indices, int num_points,
   OutputPointType * output_points)
 {
   int idx = blockIdx.x * blockDim.x + threadIdx.x;
@@ -251,7 +196,7 @@ __global__ void extractOutputPointIndicesKernel(
 }
 
 __global__ void extractInputPointsToOutputPoints_indicesKernel(
-  InputPointType * input_points, uint32_t * masks, uint32_t * indices, int num_points,
+  InputPointType * input_points, std::uint32_t * masks, std::uint32_t * indices, int num_points,
   OutputPointType * output_points)
 {
   int idx = blockIdx.x * blockDim.x + threadIdx.x;
@@ -340,8 +285,9 @@ __global__ void undistort3dKernel(
 }
 
 __global__ void ringOutlierFilterKernel(
-  const InputPointType * d_points, uint32_t * output_mask, int num_rings, int max_points_per_ring,
-  float distance_ratio, float object_length_threshold_squared, int num_points_threshold)
+  const InputPointType * d_points, std::uint32_t * output_mask, int num_rings,
+  int max_points_per_ring, float distance_ratio, float object_length_threshold_squared,
+  int num_points_threshold)
 {
   int idx = blockIdx.x * blockDim.x + threadIdx.x;
   int j = idx / max_points_per_ring;
@@ -476,10 +422,73 @@ CudaPointcloudPreprocessor::CudaPointcloudPreprocessor()
   cudaMemPoolCreate(&device_memory_pool_, &pool_props);
   MemoryPoolAllocator<TwistStruct2D> allocator_2d(device_memory_pool_);
   MemoryPoolAllocator<TwistStruct3D> allocator_3d(device_memory_pool_);
+  MemoryPoolAllocator<std::int32_t> allocator_int32(device_memory_pool_);
+  MemoryPoolAllocator<std::uint32_t> allocator_uint32(device_memory_pool_);
+  MemoryPoolAllocator<std::uint8_t> allocator_uint8(device_memory_pool_);
+  MemoryPoolAllocator<InputPointType> allocator_points(device_memory_pool_);
+
   device_twist_2d_structs_ =
     thrust::device_vector<TwistStruct2D, MemoryPoolAllocator<TwistStruct2D>>(allocator_2d);
   device_twist_3d_structs_ =
     thrust::device_vector<TwistStruct3D, MemoryPoolAllocator<TwistStruct3D>>(allocator_3d);
+
+  num_rings_ = 1;
+  max_points_per_ring_ = 1;
+  num_organized_points_ = num_rings_ * max_points_per_ring_;
+  device_ring_index_ =
+    thrust::device_vector<std::int32_t, MemoryPoolAllocator<std::int32_t>>(allocator_int32);
+  device_ring_index_.resize(num_rings_);
+
+  device_indexes_tensor_ =
+    thrust::device_vector<std::uint32_t, MemoryPoolAllocator<std::uint32_t>>(allocator_uint32);
+  device_sorted_indexes_tensor_ =
+    thrust::device_vector<std::uint32_t, MemoryPoolAllocator<std::uint32_t>>(allocator_uint32);
+
+  device_indexes_tensor_.resize(num_organized_points_);
+  device_sorted_indexes_tensor_.resize(num_organized_points_);
+
+  device_segment_offsets_ =
+    thrust::device_vector<std::int32_t, MemoryPoolAllocator<std::int32_t>>(allocator_int32);
+  device_segment_offsets_.resize(num_rings_ + 1);
+  device_segment_offsets_[0] = 0;
+  device_segment_offsets_[1] = 1;
+
+  device_max_ring_ =
+    thrust::device_vector<std::int32_t, MemoryPoolAllocator<std::int32_t>>(allocator_int32);
+  device_max_ring_.resize(1);
+
+  device_max_points_per_ring_ =
+    thrust::device_vector<std::int32_t, MemoryPoolAllocator<std::int32_t>>(allocator_int32);
+  device_max_points_per_ring_.resize(1);
+
+  device_input_points_ =
+    thrust::device_vector<InputPointType, MemoryPoolAllocator<InputPointType>>(allocator_points);
+  device_organized_points_ =
+    thrust::device_vector<InputPointType, MemoryPoolAllocator<InputPointType>>(allocator_points);
+  device_organized_points_.resize(num_organized_points_);
+
+  cudaMemsetAsync(
+    thrust::raw_pointer_cast(device_max_ring_.data()), 0, sizeof(std::int32_t), stream_);
+  cudaMemsetAsync(
+    thrust::raw_pointer_cast(device_max_points_per_ring_.data()), 0, sizeof(std::int32_t), stream_);
+  cudaMemsetAsync(
+    thrust::raw_pointer_cast(device_indexes_tensor_.data()), 0x255, sizeof(std::uint32_t), stream_);
+
+  sort_workspace_bytes_ = querySortWorkspace(
+    num_rings_ * max_points_per_ring_, num_rings_,
+    thrust::raw_pointer_cast(device_segment_offsets_.data()),
+    thrust::raw_pointer_cast(device_indexes_tensor_.data()),
+    thrust::raw_pointer_cast(device_sorted_indexes_tensor_.data()));
+
+  device_sort_workspace_ =
+    thrust::device_vector<std::uint8_t, MemoryPoolAllocator<std::uint8_t>>(allocator_uint8);
+
+  device_transformed_points_.resize(num_organized_points_);
+  device_crop_mask_.resize(num_organized_points_);
+  device_ring_outlier_mask_.resize(num_organized_points_);
+  device_indices_.resize(num_organized_points_);
+
+  preallocateOutput();
 }
 
 void CudaPointcloudPreprocessor::setCropBoxParameters(
@@ -504,58 +513,51 @@ void CudaPointcloudPreprocessor::preallocateOutput()
 {
   output_pointcloud_ptr_ = std::make_unique<cuda_blackboard::CudaPointCloud2>();
   output_pointcloud_ptr_->data = cuda_blackboard::make_unique<std::uint8_t[]>(
-    max_rings_ * max_points_per_ring_ * sizeof(OutputPointType));
+    num_rings_ * max_points_per_ring_ * sizeof(OutputPointType));
 }
 
 void CudaPointcloudPreprocessor::setupTwist2DStructs(
-  const cuda_blackboard::CudaPointCloud2 & input_pointcloud_msg,
   const std::deque<geometry_msgs::msg::TwistWithCovarianceStamped> & twist_queue,
-  const std::deque<geometry_msgs::msg::Vector3Stamped> & angular_velocity_queue)
+  const std::deque<geometry_msgs::msg::Vector3Stamped> & angular_velocity_queue,
+  const std::uint64_t pointcloud_stamp_nsec, const std::uint32_t first_point_rel_stamp_nsec)
 {
-  const InputPointType * device_input_points =
-    reinterpret_cast<const InputPointType *>(input_pointcloud_msg.data.get());
-  InputPointType first_point;
-  cudaMemcpy(&first_point, &device_input_points[0], sizeof(InputPointType), cudaMemcpyDeviceToHost);
+  host_twist_2d_structs_.clear();
+  host_twist_2d_structs_.reserve(twist_queue.size() + angular_velocity_queue.size());
 
   // Twist preprocessing
-
-  uint64_t pointcloud_stamp_nsec = 1'000'000'000 * input_pointcloud_msg.header.stamp.sec +
-                                   input_pointcloud_msg.header.stamp.nanosec;
-
-  thrust::host_vector<TwistStruct2D> host_twist_structs;
-
   float cum_x = 0;
   float cum_y = 0;
   float cum_theta = 0;
   // All time stamps from now on are in nsec from the "beginning of the pointcloud"
-  uint32_t last_stamp_nsec = first_point.time_stamp;
+  std::uint32_t last_stamp_nsec = first_point_rel_stamp_nsec;
 
   std::size_t twist_index = 0;
   std::size_t angular_velocity_index = 0;
 
   for (; twist_index < twist_queue.size() ||
          angular_velocity_index < angular_velocity_queue.size();) {
-    uint64_t twist_stamp, input_twist_global_stamp_nsec, angular_velocity_global_stamp_nsec;
+    std::uint64_t twist_stamp, input_twist_global_stamp_nsec, angular_velocity_global_stamp_nsec;
     float v_x, v_theta;
 
     if (twist_index < twist_queue.size()) {
       input_twist_global_stamp_nsec =
-        1'000'000'000 * static_cast<uint64_t>(twist_queue[twist_index].header.stamp.sec) +
-        static_cast<uint64_t>(twist_queue[twist_index].header.stamp.nanosec);
+        1'000'000'000 * static_cast<std::uint64_t>(twist_queue[twist_index].header.stamp.sec) +
+        static_cast<std::uint64_t>(twist_queue[twist_index].header.stamp.nanosec);
       v_x = twist_queue[twist_index].twist.twist.linear.x;
     } else {
-      input_twist_global_stamp_nsec = std::numeric_limits<uint64_t>::max();
+      input_twist_global_stamp_nsec = std::numeric_limits<std::uint64_t>::max();
       v_x = 0.0;
     }
 
     if (angular_velocity_index < angular_velocity_queue.size()) {
       angular_velocity_global_stamp_nsec =
-        1'000'000'000 *
-          static_cast<uint64_t>(angular_velocity_queue[angular_velocity_index].header.stamp.sec) +
-        static_cast<uint64_t>(angular_velocity_queue[angular_velocity_index].header.stamp.nanosec);
+        1'000'000'000 * static_cast<std::uint64_t>(
+                          angular_velocity_queue[angular_velocity_index].header.stamp.sec) +
+        static_cast<std::uint64_t>(
+          angular_velocity_queue[angular_velocity_index].header.stamp.nanosec);
       v_theta = angular_velocity_queue[angular_velocity_index].vector.z;
     } else {
-      angular_velocity_global_stamp_nsec = std::numeric_limits<uint64_t>::max();
+      angular_velocity_global_stamp_nsec = std::numeric_limits<std::uint64_t>::max();
       v_theta = 0.0;
     }
 
@@ -575,15 +577,16 @@ void CudaPointcloudPreprocessor::setupTwist2DStructs(
     twist.cum_y = cum_y;
     twist.cum_theta = cum_theta;
 
-    uint64_t twist_global_stamp_nsec = twist_stamp;
+    std::uint64_t twist_global_stamp_nsec = twist_stamp;
     assert(twist_global_stamp_nsec > pointcloud_stamp_nsec);  // by construction
-    uint32_t twist_from_pointcloud_start_nsec = twist_global_stamp_nsec - pointcloud_stamp_nsec;
+    std::uint32_t twist_from_pointcloud_start_nsec =
+      twist_global_stamp_nsec - pointcloud_stamp_nsec;
 
     twist.stamp_nsec = twist_from_pointcloud_start_nsec;
     twist.v_x = v_x;
     twist.v_theta = v_theta;
     twist.last_stamp_nsec = last_stamp_nsec;
-    host_twist_structs.push_back(twist);
+    host_twist_2d_structs_.push_back(twist);
 
     double dt_seconds = 1e-9 * (twist.stamp_nsec - last_stamp_nsec);
     last_stamp_nsec = twist.stamp_nsec;
@@ -594,44 +597,39 @@ void CudaPointcloudPreprocessor::setupTwist2DStructs(
   }
 
   // Copy to device
-  device_twist_2d_structs_ = host_twist_structs;
+  device_twist_2d_structs_.resize(host_twist_2d_structs_.size());
+  cudaMemcpyAsync(
+    thrust::raw_pointer_cast(device_twist_2d_structs_.data()), host_twist_2d_structs_.data(),
+    host_twist_2d_structs_.size() * sizeof(TwistStruct2D), cudaMemcpyHostToDevice, stream_);
 }
 
 void CudaPointcloudPreprocessor::setupTwist3DStructs(
-  const cuda_blackboard::CudaPointCloud2 & input_pointcloud_msg,
   const std::deque<geometry_msgs::msg::TwistWithCovarianceStamped> & twist_queue,
-  const std::deque<geometry_msgs::msg::Vector3Stamped> & angular_velocity_queue)
+  const std::deque<geometry_msgs::msg::Vector3Stamped> & angular_velocity_queue,
+  const std::uint64_t pointcloud_stamp_nsec, const std::uint32_t first_point_rel_stamp_nsec)
 {
-  const InputPointType * device_input_points =
-    reinterpret_cast<const InputPointType *>(input_pointcloud_msg.data.get());
-  InputPointType first_point;
-  cudaMemcpy(&first_point, &device_input_points[0], sizeof(InputPointType), cudaMemcpyDeviceToHost);
-
   // Twist preprocessing
-
-  uint64_t pointcloud_stamp_nsec = 1'000'000'000 * input_pointcloud_msg.header.stamp.sec +
-                                   input_pointcloud_msg.header.stamp.nanosec;
-
-  thrust::host_vector<TwistStruct3D> host_twist_structs;
+  host_twist_3d_structs_.clear();
+  host_twist_3d_structs_.reserve(twist_queue.size() + angular_velocity_queue.size());
 
   Eigen::Matrix4f cum_transform = Eigen::Matrix4f::Identity();
   Eigen::Vector3f v = Eigen::Vector3f::Zero();
   Eigen::Vector3f w = Eigen::Vector3f::Zero();
 
   // All time stamps from now on are in nsec from the "beginning of the pointcloud"
-  uint32_t last_stamp_nsec = first_point.time_stamp;
+  std::uint32_t last_stamp_nsec = first_point_rel_stamp_nsec;
 
   std::size_t twist_index = 0;
   std::size_t angular_velocity_index = 0;
 
   for (; twist_index < twist_queue.size() ||
          angular_velocity_index < angular_velocity_queue.size();) {
-    uint64_t twist_stamp, input_twist_global_stamp_nsec, angular_velocity_global_stamp_nsec;
+    std::uint64_t twist_stamp, input_twist_global_stamp_nsec, angular_velocity_global_stamp_nsec;
 
     if (twist_index < twist_queue.size()) {
       input_twist_global_stamp_nsec =
-        1'000'000'000 * static_cast<uint64_t>(twist_queue[twist_index].header.stamp.sec) +
-        static_cast<uint64_t>(twist_queue[twist_index].header.stamp.nanosec);
+        1'000'000'000 * static_cast<std::uint64_t>(twist_queue[twist_index].header.stamp.sec) +
+        static_cast<std::uint64_t>(twist_queue[twist_index].header.stamp.nanosec);
       v.x() = twist_queue[twist_index].twist.twist.linear.x;
       v.y() = twist_queue[twist_index].twist.twist.linear.y;
       v.z() = twist_queue[twist_index].twist.twist.linear.z;
@@ -642,14 +640,15 @@ void CudaPointcloudPreprocessor::setupTwist3DStructs(
 
     if (angular_velocity_index < angular_velocity_queue.size()) {
       angular_velocity_global_stamp_nsec =
-        1'000'000'000 *
-          static_cast<uint64_t>(angular_velocity_queue[angular_velocity_index].header.stamp.sec) +
-        static_cast<uint64_t>(angular_velocity_queue[angular_velocity_index].header.stamp.nanosec);
+        1'000'000'000 * static_cast<std::uint64_t>(
+                          angular_velocity_queue[angular_velocity_index].header.stamp.sec) +
+        static_cast<std::uint64_t>(
+          angular_velocity_queue[angular_velocity_index].header.stamp.nanosec);
       w.x() = angular_velocity_queue[angular_velocity_index].vector.x;
       w.y() = angular_velocity_queue[angular_velocity_index].vector.y;
       w.z() = angular_velocity_queue[angular_velocity_index].vector.z;
     } else {
-      angular_velocity_global_stamp_nsec = std::numeric_limits<uint64_t>::max();
+      angular_velocity_global_stamp_nsec = std::numeric_limits<std::uint64_t>::max();
       w = Eigen::Vector3f::Zero();
     }
 
@@ -671,15 +670,16 @@ void CudaPointcloudPreprocessor::setupTwist3DStructs(
     Eigen::Map<Eigen::Vector3f> w_map(twist.w);
     cum_transform_buffer_map = cum_transform;
 
-    uint64_t twist_global_stamp_nsec = twist_stamp;
+    std::uint64_t twist_global_stamp_nsec = twist_stamp;
     assert(twist_global_stamp_nsec > pointcloud_stamp_nsec);  // by construction
-    uint32_t twist_from_pointcloud_start_nsec = twist_global_stamp_nsec - pointcloud_stamp_nsec;
+    std::uint32_t twist_from_pointcloud_start_nsec =
+      twist_global_stamp_nsec - pointcloud_stamp_nsec;
 
     twist.stamp_nsec = twist_from_pointcloud_start_nsec;
     v_map = v;
     w_map = w;
     twist.last_stamp_nsec = last_stamp_nsec;
-    host_twist_structs.push_back(twist);
+    host_twist_3d_structs_.push_back(twist);
 
     double dt_seconds = 1e-9 * (twist.stamp_nsec - last_stamp_nsec);
     last_stamp_nsec = twist.stamp_nsec;
@@ -689,30 +689,154 @@ void CudaPointcloudPreprocessor::setupTwist3DStructs(
   }
 
   // Copy to device
-  device_twist_3d_structs_ = host_twist_structs;
+  device_twist_3d_structs_.resize(host_twist_3d_structs_.size());
+  cudaMemcpyAsync(
+    thrust::raw_pointer_cast(device_twist_3d_structs_.data()), host_twist_3d_structs_.data(),
+    host_twist_3d_structs_.size() * sizeof(TwistStruct3D), cudaMemcpyHostToDevice, stream_);
+}
+
+std::size_t CudaPointcloudPreprocessor::querySortWorkspace(
+  int num_items, int num_segments, int * offsets_device, std::uint32_t * keys_in_device,
+  std::uint32_t * keys_out_device)
+{
+  // Determine temporary device storage requirements
+  void * temp_storage = nullptr;
+  size_t temp_storage_bytes = 0;
+  cub::DeviceSegmentedRadixSort::SortKeys(
+    temp_storage, temp_storage_bytes, keys_in_device, keys_out_device, num_items, num_segments,
+    offsets_device, offsets_device + 1);
+
+  return temp_storage_bytes;
+}
+
+void CudaPointcloudPreprocessor::organizePointcloud()
+{
+  cudaMemsetAsync(
+    thrust::raw_pointer_cast(device_ring_index_.data()), 0, num_rings_ * sizeof(std::int32_t),
+    stream_);
+  cudaMemsetAsync(
+    thrust::raw_pointer_cast(device_indexes_tensor_.data()), 0xFF,
+    num_organized_points_ * sizeof(std::uint32_t), stream_);
+
+  if (num_raw_points_ == 0) {
+    return;
+  }
+
+  const int raw_points_blocks_per_grid =
+    (num_raw_points_ + threads_per_block_ - 1) / threads_per_block_;
+
+  organizeKernel<<<raw_points_blocks_per_grid, threads_per_block_, 0, stream_>>>(
+    thrust::raw_pointer_cast(device_input_points_.data()),
+    thrust::raw_pointer_cast(device_indexes_tensor_.data()),
+    thrust::raw_pointer_cast(device_ring_index_.data()), num_rings_,
+    thrust::raw_pointer_cast(device_max_ring_.data()), max_points_per_ring_,
+    thrust::raw_pointer_cast(device_max_points_per_ring_.data()), num_raw_points_);
+
+  std::int32_t max_ring_value;
+  std::int32_t max_points_per_ring;
+
+  cudaMemcpyAsync(
+    &max_ring_value, thrust::raw_pointer_cast(device_max_ring_.data()), sizeof(std::int32_t),
+    cudaMemcpyDeviceToHost, stream_);
+  cudaMemcpyAsync(
+    &max_points_per_ring, thrust::raw_pointer_cast(device_max_points_per_ring_.data()),
+    sizeof(std::int32_t), cudaMemcpyDeviceToHost, stream_);
+  cudaStreamSynchronize(stream_);
+
+  if (max_ring_value >= num_rings_ || max_points_per_ring > max_points_per_ring_) {
+    num_rings_ = max_ring_value + 1;
+    max_points_per_ring_ = std::max((max_points_per_ring + 511) / 512 * 512, 512);
+    num_organized_points_ = num_rings_ * max_points_per_ring_;
+
+    device_ring_index_.resize(num_rings_);
+    device_indexes_tensor_.resize(num_organized_points_);
+    device_sorted_indexes_tensor_.resize(num_organized_points_);
+    device_segment_offsets_.resize(num_rings_ + 1);
+    device_organized_points_.resize(num_organized_points_);
+
+    device_transformed_points_.resize(num_organized_points_);
+    device_crop_mask_.resize(num_organized_points_);
+    device_ring_outlier_mask_.resize(num_organized_points_);
+    device_indices_.resize(num_organized_points_);
+
+    preallocateOutput();
+
+    std::vector<std::int32_t> segment_offsets_host(num_rings_ + 1);
+    for (std::size_t i = 0; i < num_rings_ + 1; i++) {
+      segment_offsets_host[i] = i * max_points_per_ring_;
+    }
+
+    cudaMemcpyAsync(
+      thrust::raw_pointer_cast(device_segment_offsets_.data()), segment_offsets_host.data(),
+      (num_rings_ + 1) * sizeof(std::int32_t), cudaMemcpyHostToDevice, stream_);
+
+    cudaMemsetAsync(
+      thrust::raw_pointer_cast(device_ring_index_.data()), 0, num_rings_ * sizeof(std::int32_t),
+      stream_);
+    cudaMemsetAsync(
+      thrust::raw_pointer_cast(device_indexes_tensor_.data()), 0xFF,
+      num_organized_points_ * sizeof(std::int32_t), stream_);
+
+    sort_workspace_bytes_ = querySortWorkspace(
+      num_organized_points_, num_rings_, thrust::raw_pointer_cast(device_segment_offsets_.data()),
+      thrust::raw_pointer_cast(device_indexes_tensor_.data()),
+      thrust::raw_pointer_cast(device_sorted_indexes_tensor_.data()));
+    device_sort_workspace_.resize(sort_workspace_bytes_);
+
+    organizeKernel<<<raw_points_blocks_per_grid, threads_per_block_, 0, stream_>>>(
+      thrust::raw_pointer_cast(device_input_points_.data()),
+      thrust::raw_pointer_cast(device_indexes_tensor_.data()),
+      thrust::raw_pointer_cast(device_ring_index_.data()), num_rings_,
+      thrust::raw_pointer_cast(device_max_ring_.data()), max_points_per_ring_,
+      thrust::raw_pointer_cast(device_max_points_per_ring_.data()), num_raw_points_);
+  }
+
+  if (num_organized_points_ == num_rings_) {
+    cudaMemcpyAsync(
+      thrust::raw_pointer_cast(device_sorted_indexes_tensor_.data()),
+      thrust::raw_pointer_cast(device_indexes_tensor_.data()),
+      num_organized_points_ * sizeof(std::uint32_t), cudaMemcpyDeviceToDevice, stream_);
+  } else {
+    cub::DeviceSegmentedRadixSort::SortKeys(
+      reinterpret_cast<void *>(thrust::raw_pointer_cast(device_sort_workspace_.data())),
+      sort_workspace_bytes_, thrust::raw_pointer_cast(device_indexes_tensor_.data()),
+      thrust::raw_pointer_cast(device_sorted_indexes_tensor_.data()), num_organized_points_,
+      num_rings_, thrust::raw_pointer_cast(device_segment_offsets_.data()),
+      thrust::raw_pointer_cast(device_segment_offsets_.data()) + 1, 0, sizeof(std::uint32_t) * 8,
+      stream_);
+  }
+
+  const int organized_points_blocks_per_grid =
+    (num_organized_points_ + threads_per_block_ - 1) / threads_per_block_;
+  gatherKernel<<<organized_points_blocks_per_grid, threads_per_block_, 0, stream_>>>(
+    thrust::raw_pointer_cast(device_input_points_.data()),
+    thrust::raw_pointer_cast(device_sorted_indexes_tensor_.data()),
+    thrust::raw_pointer_cast(device_organized_points_.data()), num_rings_, max_points_per_ring_);
 }
 
 std::unique_ptr<cuda_blackboard::CudaPointCloud2> CudaPointcloudPreprocessor::process(
-  const cuda_blackboard::CudaPointCloud2 & input_pointcloud_msg,
+  const sensor_msgs::msg::PointCloud2::ConstSharedPtr input_pointcloud_msg_ptr,
   const geometry_msgs::msg::TransformStamped & transform_msg,
   const std::deque<geometry_msgs::msg::TwistWithCovarianceStamped> & twist_queue,
-  const std::deque<geometry_msgs::msg::Vector3Stamped> & angular_velocity_queue)
+  const std::deque<geometry_msgs::msg::Vector3Stamped> & angular_velocity_queue,
+  const std::uint32_t first_point_rel_stamp_nsec)
 {
-  auto frame_id = input_pointcloud_msg.header.frame_id;
+  auto frame_id = input_pointcloud_msg_ptr->header.frame_id;
+  num_raw_points_ = input_pointcloud_msg_ptr->width * input_pointcloud_msg_ptr->height;
+  num_organized_points_ = num_rings_ * max_points_per_ring_;
 
-  auto num_input_points = input_pointcloud_msg.width * input_pointcloud_msg.height;
-  if (
-    input_pointcloud_msg.width * input_pointcloud_msg.height > max_rings_ * max_points_per_ring_) {
-    max_rings_ = input_pointcloud_msg.height;
-    max_points_per_ring_ = input_pointcloud_msg.width;
+  if (num_raw_points_ > device_input_points_.size()) {
+    std::size_t new_capacity = (num_raw_points_ + 1024) / 1024 * 1024;
+    device_input_points_.resize(new_capacity);
+  }
 
-    device_transformed_points_.resize(num_input_points);
-    device_crop_mask_.resize(num_input_points);
-    device_ring_outlier_mask_.resize(num_input_points);
-    device_indices_.resize(num_input_points);
+  cudaMemcpyAsync(
+    thrust::raw_pointer_cast(device_input_points_.data()), input_pointcloud_msg_ptr->data.data(),
+    num_raw_points_ * sizeof(InputPointType), cudaMemcpyHostToDevice, stream_);
 
-    preallocateOutput();
-  }
+  cudaStreamSynchronize(stream_);
+
+  organizePointcloud();
 
   tf2::Quaternion rotation_quaternion(
     transform_msg.transform.rotation.x, transform_msg.transform.rotation.y,
@@ -735,15 +859,19 @@ std::unique_ptr<cuda_blackboard::CudaPointCloud2> CudaPointcloudPreprocessor::pr
   transform_struct.m33 = static_cast<float>(rotation_matrix[2][2]);
 
   // Twist preprocessing
-  const InputPointType * device_input_points =
-    reinterpret_cast<const InputPointType *>(input_pointcloud_msg.data.get());
+  std::uint64_t pointcloud_stamp_nsec = 1'000'000'000 * input_pointcloud_msg_ptr->header.stamp.sec +
+                                        input_pointcloud_msg_ptr->header.stamp.nanosec;
 
   if (use_3d_undistortion_) {
-    setupTwist3DStructs(input_pointcloud_msg, twist_queue, angular_velocity_queue);
+    setupTwist3DStructs(
+      twist_queue, angular_velocity_queue, pointcloud_stamp_nsec, first_point_rel_stamp_nsec);
   } else {
-    setupTwist2DStructs(input_pointcloud_msg, twist_queue, angular_velocity_queue);
+    setupTwist2DStructs(
+      twist_queue, angular_velocity_queue, pointcloud_stamp_nsec, first_point_rel_stamp_nsec);
   }
 
+  // check_error(frame_id, "After setup");
+
   // Obtain raw pointers for the kernels
   TwistStruct2D * device_twist_2d_structs =
     thrust::raw_pointer_cast(device_twist_2d_structs_.data());
@@ -752,60 +880,64 @@ std::unique_ptr<cuda_blackboard::CudaPointCloud2> CudaPointcloudPreprocessor::pr
   InputPointType * device_transformed_points =
     thrust::raw_pointer_cast(device_transformed_points_.data());
   std::uint32_t * device_crop_mask = thrust::raw_pointer_cast(device_crop_mask_.data());
-  uint32_t * device_ring_outlier_mask = thrust::raw_pointer_cast(device_ring_outlier_mask_.data());
-  uint32_t * device_indices = thrust::raw_pointer_cast(device_indices_.data());
+  std::uint32_t * device_ring_outlier_mask =
+    thrust::raw_pointer_cast(device_ring_outlier_mask_.data());
+  std::uint32_t * device_indices = thrust::raw_pointer_cast(device_indices_.data());
 
-  const int blocks_per_grid = (num_input_points + threads_per_block_ - 1) / threads_per_block_;
+  const int blocks_per_grid = (num_organized_points_ + threads_per_block_ - 1) / threads_per_block_;
 
   transformPointsKernel<<<blocks_per_grid, threads_per_block_, 0, stream_>>>(
-    device_input_points, device_transformed_points, num_input_points, transform_struct);
+    thrust::raw_pointer_cast(device_organized_points_.data()), device_transformed_points,
+    num_organized_points_, transform_struct);
 
   int crop_box_blocks_per_grid = std::min(blocks_per_grid, max_blocks_per_grid_);
   if (host_crop_box_structs_.size() > 0) {
     cropBoxKernel<<<crop_box_blocks_per_grid, threads_per_block_, 0, stream_>>>(
-      device_transformed_points, device_crop_mask, num_input_points,
+      device_transformed_points, device_crop_mask, num_organized_points_,
       thrust::raw_pointer_cast(device_crop_box_structs_.data()), host_crop_box_structs_.size());
   } else {
-    thrust::fill(thrust::device, device_crop_mask, device_crop_mask + num_input_points, 1);
+    thrust::fill(thrust::device, device_crop_mask, device_crop_mask + num_organized_points_, 1);
   }
 
   if (use_3d_undistortion_ && device_twist_3d_structs_.size() > 0) {
     undistort3dKernel<<<blocks_per_grid, threads_per_block_, 0, stream_>>>(
-      device_transformed_points, num_input_points, device_twist_3d_structs,
+      device_transformed_points, num_organized_points_, device_twist_3d_structs,
       device_twist_3d_structs_.size());
   } else if (!use_3d_undistortion_ && device_twist_2d_structs_.size() > 0) {
     undistort2dKernel<<<blocks_per_grid, threads_per_block_, 0, stream_>>>(
-      device_transformed_points, num_input_points, device_twist_2d_structs,
+      device_transformed_points, num_organized_points_, device_twist_2d_structs,
       device_twist_2d_structs_.size());
   }
 
   ringOutlierFilterKernel<<<blocks_per_grid, threads_per_block_, 0, stream_>>>(
-    device_transformed_points, device_ring_outlier_mask, max_rings_, max_points_per_ring_,
+    device_transformed_points, device_ring_outlier_mask, num_rings_, max_points_per_ring_,
     ring_outlier_parameters_.distance_ratio,
     ring_outlier_parameters_.object_length_threshold *
       ring_outlier_parameters_.object_length_threshold,
     ring_outlier_parameters_.num_points_threshold);
 
   combineMasksKernel<<<blocks_per_grid, threads_per_block_, 0, stream_>>>(
-    device_crop_mask, device_ring_outlier_mask, num_input_points, device_ring_outlier_mask);
-
-  cudaStreamSynchronize(stream_);
+    device_crop_mask, device_ring_outlier_mask, num_organized_points_, device_ring_outlier_mask);
 
   thrust::inclusive_scan(
-    thrust::device, device_ring_outlier_mask, device_ring_outlier_mask + num_input_points,
+    thrust::device, device_ring_outlier_mask, device_ring_outlier_mask + num_organized_points_,
     device_indices);
 
-  uint32_t num_output_points;
-  cudaMemcpy(
-    &num_output_points, device_indices + num_input_points - 1, sizeof(uint32_t),
-    cudaMemcpyDeviceToHost);
+  std::uint32_t num_output_points;
+  cudaMemcpyAsync(
+    &num_output_points, device_indices + num_organized_points_ - 1, sizeof(std::uint32_t),
+    cudaMemcpyDeviceToHost, stream_);
+  cudaStreamSynchronize(stream_);
 
   if (num_output_points > 0) {
-    extractInputPointsToOutputPoints_indicesKernel<<<blocks_per_grid, threads_per_block_>>>(
-      device_transformed_points, device_ring_outlier_mask, device_indices, num_input_points,
+    extractInputPointsToOutputPoints_indicesKernel<<<
+      blocks_per_grid, threads_per_block_, 0, stream_>>>(
+      device_transformed_points, device_ring_outlier_mask, device_indices, num_organized_points_,
       reinterpret_cast<OutputPointType *>(output_pointcloud_ptr_->data.get()));
   }
 
+  cudaStreamSynchronize(stream_);
+
   // Copy the transformed points back
   output_pointcloud_ptr_->row_step = num_output_points * sizeof(OutputPointType);
   output_pointcloud_ptr_->width = num_output_points;
@@ -813,9 +945,9 @@ std::unique_ptr<cuda_blackboard::CudaPointCloud2> CudaPointcloudPreprocessor::pr
 
   output_pointcloud_ptr_->fields = point_fields_;
   output_pointcloud_ptr_->is_dense = true;
-  output_pointcloud_ptr_->is_bigendian = input_pointcloud_msg.is_bigendian;
+  output_pointcloud_ptr_->is_bigendian = input_pointcloud_msg_ptr->is_bigendian;
   output_pointcloud_ptr_->point_step = sizeof(OutputPointType);
-  output_pointcloud_ptr_->header.stamp = input_pointcloud_msg.header.stamp;
+  output_pointcloud_ptr_->header.stamp = input_pointcloud_msg_ptr->header.stamp;
 
   return std::move(output_pointcloud_ptr_);
 }
diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp
index 1aa634d8807dd..b2db15491f369 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp
@@ -19,6 +19,7 @@
 #include <autoware/point_types/types.hpp>
 
 #include <geometry_msgs/msg/transform_stamped.hpp>
+#include <sensor_msgs/point_cloud2_iterator.hpp>
 #include <tf2_geometry_msgs/tf2_geometry_msgs.hpp>
 
 #include <cuda_runtime.h>
@@ -27,8 +28,6 @@
 #include <pcl/point_types.h>
 
 #include <chrono>
-#include <chrono>  // TODO(knzo25): remove this
-#include <limits>  // TODO(knzo25): remove this
 #include <string>
 #include <vector>
 
@@ -126,32 +125,6 @@ CudaPointcloudPreprocessorNode::CudaPointcloudPreprocessorNode(
     debug_publisher_ = std::make_unique<DebugPublisher>(this, "cuda_pointcloud_preprocessor");
     stop_watch_ptr_->tic("processing_time");
   }
-
-  // New organized pointcloud method
-  test_num_rings_ = 1;
-  test_max_points_per_ring_ = 1;
-  ring_index_device_ = cuda_blackboard::make_unique<std::int32_t[]>(test_num_rings_);
-  indexes_tensor_device_ =
-    cuda_blackboard::make_unique<std::uint32_t[]>(test_num_rings_ * test_max_points_per_ring_);
-  sorted_indexes_tensor_device_ =
-    cuda_blackboard::make_unique<std::uint32_t[]>(test_num_rings_ * test_max_points_per_ring_);
-  segment_offsets_device_ = cuda_blackboard::make_unique<std::int32_t[]>(test_num_rings_ + 1);
-  max_rings_device_ = cuda_blackboard::make_unique<std::int32_t>();
-  max_points_per_ring_device_ = cuda_blackboard::make_unique<std::int32_t>();
-  organized_points_device_ =
-    cuda_blackboard::make_unique<InputPointType[]>(test_num_rings_ * test_max_points_per_ring_);
-
-  cudaStreamCreate(&stream_);
-  cudaMemsetAsync(max_rings_device_.get(), 0, sizeof(std::int32_t), stream_);
-  cudaMemsetAsync(max_points_per_ring_device_.get(), 0, sizeof(std::int32_t), stream_);
-
-  sort_workspace_bytes_ = querySortWorkspace(
-    test_num_rings_ * test_max_points_per_ring_, test_num_rings_, segment_offsets_device_.get(),
-    indexes_tensor_device_.get(), sorted_indexes_tensor_device_.get());
-  sort_workspace_device_ = cuda_blackboard::make_unique<std::uint8_t[]>(sort_workspace_bytes_);
-
-  capacity_ = 8192;
-  points_device_ = cuda_blackboard::make_unique<InputPointType[]>(capacity_);
 }
 
 bool CudaPointcloudPreprocessorNode::getTransform(
@@ -231,356 +204,9 @@ void CudaPointcloudPreprocessorNode::imuCallback(
   }
 }
 
-void CudaPointcloudPreprocessorNode::estimatePointcloudRingInfo(
-  const sensor_msgs::msg::PointCloud2::ConstSharedPtr input_pointcloud_msg_ptr)
-{
-  const autoware::point_types::PointXYZIRCAEDT * input_buffer =
-    reinterpret_cast<const autoware::point_types::PointXYZIRCAEDT *>(
-      input_pointcloud_msg_ptr->data.data());
-
-  std::size_t max_ring = 0;
-
-  for (std::size_t i = 0; i < input_pointcloud_msg_ptr->width * input_pointcloud_msg_ptr->height;
-       i++) {
-    const autoware::point_types::PointXYZIRCAEDT & point = input_buffer[i];
-    const std::size_t ring = static_cast<std::size_t>(point.channel);
-    max_ring = std::max(max_ring, ring);
-  }
-
-  // Set max rings to the next power of two
-  num_rings_ = std::pow(2, std::ceil(std::log2(max_ring + 1)));
-  num_rings_ = std::max(num_rings_, static_cast<std::size_t>(16));
-  std::vector<std::size_t> ring_points(num_rings_, 0);
-
-  for (std::size_t i = 0; i < input_pointcloud_msg_ptr->width * input_pointcloud_msg_ptr->height;
-       i++) {
-    const autoware::point_types::PointXYZIRCAEDT & point = input_buffer[i];
-    const std::size_t ring = point.channel;
-    ring_points[ring]++;
-  }
-
-  // Set max points per ring to the next multiple of 512
-  max_points_per_ring_ = *std::max_element(ring_points.begin(), ring_points.end());
-  max_points_per_ring_ = std::max(max_points_per_ring_, static_cast<std::size_t>(512));
-  max_points_per_ring_ = (max_points_per_ring_ + 511) / 512 * 512;
-
-  next_ring_index_.resize(num_rings_);
-  std::fill(next_ring_index_.begin(), next_ring_index_.end(), 0);
-  host_buffer_ = cuda_blackboard::make_host_unique<autoware::point_types::PointXYZIRCAEDT[]>(
-    num_rings_ * max_points_per_ring_);
-
-  input_pointcloud_ptr_ = std::make_shared<cuda_blackboard::CudaPointCloud2>();
-  input_pointcloud_ptr_->data = cuda_blackboard::make_unique<std::uint8_t[]>(
-    num_rings_ * max_points_per_ring_ * sizeof(autoware::point_types::PointXYZIRCAEDT));
-  input_pointcloud_ptr_->width = max_points_per_ring_;
-  input_pointcloud_ptr_->height = num_rings_;
-  input_pointcloud_ptr_->point_step = sizeof(autoware::point_types::PointXYZIRCAEDT);
-  input_pointcloud_ptr_->row_step =
-    max_points_per_ring_ * sizeof(autoware::point_types::PointXYZIRCAEDT);
-  input_pointcloud_ptr_->is_dense = input_pointcloud_msg_ptr->is_dense;
-  input_pointcloud_ptr_->fields = input_pointcloud_msg_ptr->fields;
-  input_pointcloud_ptr_->header = input_pointcloud_msg_ptr->header;
-
-  RCLCPP_INFO_STREAM(
-    get_logger(), "Estimated rings: " << num_rings_
-                                      << ", max_points_per_ring: " << max_points_per_ring_
-                                      << ". This should only be done during the first iterations. "
-                                         "Otherwise, performance will be affected.");
-}
-
-bool CudaPointcloudPreprocessorNode::orderPointcloud(
-  const sensor_msgs::msg::PointCloud2::ConstSharedPtr input_pointcloud_msg_ptr)
-{
-  const autoware::point_types::PointXYZIRCAEDT * input_buffer =
-    reinterpret_cast<const autoware::point_types::PointXYZIRCAEDT *>(
-      input_pointcloud_msg_ptr->data.data());
-
-  bool ring_overflow = false;
-  bool point_overflow = false;
-
-  autoware::point_types::PointXYZIRCAEDT * buffer = host_buffer_.get();
-
-  for (std::size_t i = 0; i < input_pointcloud_msg_ptr->width * input_pointcloud_msg_ptr->height;
-       i++) {
-    const autoware::point_types::PointXYZIRCAEDT & point = input_buffer[i];
-    const std::size_t raw_ring = point.channel;
-    const std::size_t ring = raw_ring % num_rings_;
-
-    const std::size_t raw_index = next_ring_index_[ring];
-    const std::size_t index = raw_index % max_points_per_ring_;
-
-    ring_overflow |= raw_ring >= num_rings_;
-    point_overflow |= raw_index >= max_points_per_ring_;
-
-    buffer[ring * max_points_per_ring_ + index] = point;
-    next_ring_index_[ring] = raw_index + 1;
-  }
-
-  return !ring_overflow && !point_overflow;
-}
-
 void CudaPointcloudPreprocessorNode::pointcloudCallback(
   const sensor_msgs::msg::PointCloud2::ConstSharedPtr input_pointcloud_msg_ptr)
 {
-  // NEW Cuda version
-  if (input_pointcloud_msg_ptr->width * input_pointcloud_msg_ptr->height > capacity_) {
-    capacity_ = input_pointcloud_msg_ptr->width * input_pointcloud_msg_ptr->height;
-    capacity_ = (capacity_ + 1024) / 1024 * 1024;
-    points_device_ = cuda_blackboard::make_unique<InputPointType[]>(capacity_);
-  }
-
-  double cuda_t0 = 1e-6 * std::chrono::duration_cast<std::chrono::nanoseconds>(
-                            std::chrono::high_resolution_clock::now().time_since_epoch())
-                            .count();
-
-  cudaMemcpyAsync(
-    points_device_.get(), input_pointcloud_msg_ptr->data.data(),
-    input_pointcloud_msg_ptr->width * input_pointcloud_msg_ptr->height * sizeof(InputPointType),
-    cudaMemcpyHostToDevice, stream_);
-  cudaStreamSynchronize(stream_);
-
-  double cuda_t1 = 1e-6 * std::chrono::duration_cast<std::chrono::nanoseconds>(
-                            std::chrono::high_resolution_clock::now().time_since_epoch())
-                            .count();
-
-  // Estimate the number of rings and max points per ring
-  cudaMemsetAsync(ring_index_device_.get(), 0, test_num_rings_ * sizeof(std::int32_t), stream_);
-  cudaMemsetAsync(
-    indexes_tensor_device_.get(), 0xFF,
-    test_num_rings_ * test_max_points_per_ring_ * sizeof(std::uint32_t), stream_);
-  organizeLaunch(
-    points_device_.get(), indexes_tensor_device_.get(), ring_index_device_.get(), test_num_rings_,
-    max_rings_device_.get(), test_max_points_per_ring_, max_points_per_ring_device_.get(),
-    input_pointcloud_msg_ptr->width * input_pointcloud_msg_ptr->height, stream_);
-  cudaStreamSynchronize(stream_);
-
-  std::int32_t max_ring_value;
-  std::int32_t max_points_per_ring;
-  cudaMemcpyAsync(
-    &max_ring_value, max_rings_device_.get(), sizeof(std::int32_t), cudaMemcpyDeviceToHost,
-    stream_);
-  cudaMemcpyAsync(
-    &max_points_per_ring, max_points_per_ring_device_.get(), sizeof(std::int32_t),
-    cudaMemcpyDeviceToHost, stream_);
-  cudaStreamSynchronize(stream_);
-
-  if (max_ring_value >= test_num_rings_ || max_points_per_ring > test_max_points_per_ring_) {
-    RCLCPP_WARN(
-      get_logger(), "Current ring num: %d, Current Max points per ring: %d", test_num_rings_,
-      test_max_points_per_ring_);
-    test_num_rings_ = max_ring_value + 1;
-    test_max_points_per_ring_ =
-      std::max((max_points_per_ring + 511) / 512 * 512, 512);  // Re think this equation
-    ring_index_device_ = cuda_blackboard::make_unique<std::int32_t[]>(test_num_rings_);
-    indexes_tensor_device_ =
-      cuda_blackboard::make_unique<std::uint32_t[]>(test_num_rings_ * test_max_points_per_ring_);
-    sorted_indexes_tensor_device_ =
-      cuda_blackboard::make_unique<std::uint32_t[]>(test_num_rings_ * test_max_points_per_ring_);
-    segment_offsets_device_ = cuda_blackboard::make_unique<std::int32_t[]>(test_num_rings_ + 1);
-    organized_points_device_ =
-      cuda_blackboard::make_unique<InputPointType[]>(test_num_rings_ * test_max_points_per_ring_);
-    RCLCPP_WARN(
-      get_logger(), "NEW Max ring value: %d, Max points per ring: %d", max_ring_value,
-      test_max_points_per_ring_);
-
-    std::vector<std::int32_t> segment_offsets_host(test_num_rings_ + 1);
-    for (std::size_t i = 0; i < test_num_rings_ + 1; i++) {
-      segment_offsets_host[i] = i * test_max_points_per_ring_;
-    }
-    cudaMemcpyAsync(
-      segment_offsets_device_.get(), segment_offsets_host.data(),
-      (test_num_rings_ + 1) * sizeof(std::int32_t), cudaMemcpyHostToDevice, stream_);
-
-    cudaMemsetAsync(ring_index_device_.get(), 0, test_num_rings_ * sizeof(std::int32_t), stream_);
-    cudaMemsetAsync(
-      indexes_tensor_device_.get(), 0xFF,
-      test_num_rings_ * test_max_points_per_ring_ * sizeof(std::int32_t), stream_);
-
-    sort_workspace_bytes_ = querySortWorkspace(
-      test_num_rings_ * test_max_points_per_ring_, test_num_rings_, segment_offsets_device_.get(),
-      indexes_tensor_device_.get(), sorted_indexes_tensor_device_.get());
-    sort_workspace_device_ = cuda_blackboard::make_unique<std::uint8_t[]>(sort_workspace_bytes_);
-
-    organizeLaunch(
-      points_device_.get(), indexes_tensor_device_.get(), ring_index_device_.get(), test_num_rings_,
-      max_rings_device_.get(), test_max_points_per_ring_, max_points_per_ring_device_.get(),
-      input_pointcloud_msg_ptr->width * input_pointcloud_msg_ptr->height, stream_);
-    // cudaStreamSynchronize(stream_);
-  }
-
-  /* std::vector<std::int32_t> ring_index_host(test_num_rings_);
-  cudaMemcpyAsync(
-    ring_index_host.data(), ring_index_device_.get(), test_num_rings_ * sizeof(std::int32_t),
-  cudaMemcpyDeviceToHost, stream_); cudaStreamSynchronize(stream_);
-
-  for (std::size_t i = 0; i < test_num_rings_; i++) {
-    if (ring_index_host[i] > 0) {
-      std::cout << "Ring id=" << i << " has " << ring_index_host[i] << " points" << std::endl;
-    }
-  } */
-
-  sortLaunch(
-    test_num_rings_ * test_max_points_per_ring_, test_num_rings_, segment_offsets_device_.get(),
-    indexes_tensor_device_.get(), sorted_indexes_tensor_device_.get(), sort_workspace_device_.get(),
-    sort_workspace_bytes_, stream_);
-  cudaStreamSynchronize(stream_);
-
-  /* std::vector<std::uint32_t> indexes_tensor_host(test_num_rings_ * test_max_points_per_ring_);
-  cudaMemcpyAsync(
-    indexes_tensor_host.data(), sorted_indexes_tensor_device_.get(),
-    test_num_rings_ * test_max_points_per_ring_ * sizeof(std::uint32_t), cudaMemcpyDeviceToHost,
-  stream_); cudaStreamSynchronize(stream_);
-
-  if (test_num_rings_ < 64) {
-    for (std::size_t i = 0; i < test_num_rings_; i++) {
-      std::cout << "Ring id=" << i << "\n\t";
-      for (std::size_t j = 0; j < test_max_points_per_ring_; j++) {
-        if (indexes_tensor_host[i * test_max_points_per_ring_ + j] <
-  std::numeric_limits<std::uint32_t>::max()) { std::cout << indexes_tensor_host[i *
-  test_max_points_per_ring_ + j] << " ";
-        }
-      }
-      std::cout << std::endl << std::flush;
-    }
-  } */
-
-  gatherLaunch(
-    points_device_.get(), sorted_indexes_tensor_device_.get(), organized_points_device_.get(),
-    test_num_rings_, test_max_points_per_ring_, stream_);
-  cudaStreamSynchronize(
-    stream_);  // this will not be needed in the final implementation. for now we benchmark
-
-  double cuda_t2 = 1e-6 * std::chrono::duration_cast<std::chrono::nanoseconds>(
-                            std::chrono::high_resolution_clock::now().time_since_epoch())
-                            .count();
-
-  if (iteration_ >= 0) {
-    organize_time_cuda_ms_ += (cuda_t1 - cuda_t0);
-    copy_time_cuda_ms_ += (cuda_t2 - cuda_t1);
-  }
-
-  stop_watch_ptr_->toc("processing_time", true);
-
-  // TODO(knzo25): check the pointcloud layout at least once
-
-  assert(input_pointcloud_msg_ptr->point_step == sizeof(autoware::point_types::PointXYZIRCAEDT));
-
-  if (num_rings_ == 0 || max_points_per_ring_ == 0) {
-    estimatePointcloudRingInfo(input_pointcloud_msg_ptr);
-  }
-
-  double t1 = 1e-6 * std::chrono::duration_cast<std::chrono::nanoseconds>(
-                       std::chrono::high_resolution_clock::now().time_since_epoch())
-                       .count();
-  if (!orderPointcloud(input_pointcloud_msg_ptr)) {
-    estimatePointcloudRingInfo(input_pointcloud_msg_ptr);
-    orderPointcloud(input_pointcloud_msg_ptr);
-  }
-  double t2 = 1e-6 * std::chrono::duration_cast<std::chrono::nanoseconds>(
-                       std::chrono::high_resolution_clock::now().time_since_epoch())
-                       .count();
-
-  // Copy to cuda memory
-  cudaMemcpy(
-    input_pointcloud_ptr_->data.get(), host_buffer_.get(),
-    num_rings_ * max_points_per_ring_ * sizeof(autoware::point_types::PointXYZIRCAEDT),
-    cudaMemcpyHostToDevice);
-
-  double t3 = 1e-6 * std::chrono::duration_cast<std::chrono::nanoseconds>(
-                       std::chrono::high_resolution_clock::now().time_since_epoch())
-                       .count();
-
-  if (iteration_ >= 0) {
-    organize_time_ms_ += (t2 - t1);
-    copy_time_ms_ += (t3 - t2);
-  }
-
-  // Test -> this would replace the cpu version
-  cudaMemcpyAsync(
-    input_pointcloud_ptr_->data.get(), organized_points_device_.get(),
-    test_num_rings_ * test_max_points_per_ring_ * sizeof(autoware::point_types::PointXYZIRCAEDT),
-    cudaMemcpyHostToDevice, stream_);
-  input_pointcloud_ptr_->height = test_num_rings_;
-  input_pointcloud_ptr_->width = test_max_points_per_ring_;
-  cudaStreamSynchronize(stream_);
-
-  // TEST CODE
-  {
-    std::vector<autoware::point_types::PointXYZIRCAEDT> host_buffer_test(
-      test_num_rings_ * test_max_points_per_ring_);
-    cudaMemcpyAsync(
-      host_buffer_test.data(), organized_points_device_.get(),
-      test_num_rings_ * test_max_points_per_ring_ * sizeof(autoware::point_types::PointXYZIRCAEDT),
-      cudaMemcpyDeviceToHost, stream_);
-    cudaStreamSynchronize(stream_);
-
-    if (
-      test_num_rings_ > 1 && test_max_points_per_ring_ == max_points_per_ring_ &&
-      test_num_rings_ == num_rings_) {
-      for (std::size_t i = 0; i < num_rings_; i++) {
-        for (std::size_t j = 0; j < max_points_per_ring_; j++) {
-          // Compare the implementations
-          if (
-            (host_buffer_.get()[i * max_points_per_ring_ + j].x !=
-               host_buffer_test[i * max_points_per_ring_ + j].x ||
-             host_buffer_.get()[i * max_points_per_ring_ + j].y !=
-               host_buffer_test[i * max_points_per_ring_ + j].y ||
-             host_buffer_.get()[i * max_points_per_ring_ + j].z !=
-               host_buffer_test[i * max_points_per_ring_ + j].z) &&
-            host_buffer_test[i * max_points_per_ring_ + j].distance > 0) {
-            RCLCPP_ERROR(get_logger(), "Mismatch at ring=%d, point=%d", i, j);
-            RCLCPP_ERROR(
-              get_logger(), "\thost_buffer_test: x=%.2f y=%.2f z=%.2f channel=%d distance=%.2f",
-              host_buffer_test[i * max_points_per_ring_ + j].x,
-              host_buffer_test[i * max_points_per_ring_ + j].y,
-              host_buffer_test[i * max_points_per_ring_ + j].z,
-              host_buffer_test[i * max_points_per_ring_ + j].channel,
-              host_buffer_test[i * max_points_per_ring_ + j].distance);
-            RCLCPP_ERROR(
-              get_logger(), "\thost_buffer: x=%.2f y=%.2f z=%.2f channel=%d distance=%.2f",
-              host_buffer_.get()[i * max_points_per_ring_ + j].x,
-              host_buffer_.get()[i * max_points_per_ring_ + j].y,
-              host_buffer_.get()[i * max_points_per_ring_ + j].z,
-              host_buffer_.get()[i * max_points_per_ring_ + j].channel,
-              host_buffer_.get()[i * max_points_per_ring_ + j].distance);
-          }
-        }
-      }
-    }
-  }
-
-  input_pointcloud_ptr_->header = input_pointcloud_msg_ptr->header;
-
-  cudaPointcloudCallback(input_pointcloud_ptr_);
-
-  if (debug_publisher_) {
-    const double processing_time_ms = stop_watch_ptr_->toc("processing_time", true);
-    debug_publisher_->publish<tier4_debug_msgs::msg::Float64Stamped>(
-      "debug/processing_time_ms", processing_time_ms);
-
-    double now_stamp_seconds = rclcpp::Time(this->get_clock()->now()).seconds();
-    double cloud_stamp_seconds = rclcpp::Time(input_pointcloud_msg_ptr->header.stamp).seconds();
-
-    debug_publisher_->publish<tier4_debug_msgs::msg::Float64Stamped>(
-      "debug/latency_ms", 1000.f * (now_stamp_seconds - cloud_stamp_seconds));
-  }
-
-  // Clear indexes
-  std::fill(next_ring_index_.begin(), next_ring_index_.end(), 0);
-
-  // Clear pointcloud buffer
-  auto host_buffer = host_buffer_.get();
-  for (std::size_t i = 0; i < num_rings_ * max_points_per_ring_; i++) {
-    host_buffer[i] = autoware::point_types::PointXYZIRCAEDT{};
-  }
-}
-
-void CudaPointcloudPreprocessorNode::cudaPointcloudCallback(
-  const std::shared_ptr<const cuda_blackboard::CudaPointCloud2> input_pointcloud_msg_ptr)
-{
-  double t1 = 1e-6 * std::chrono::duration_cast<std::chrono::nanoseconds>(
-                       std::chrono::high_resolution_clock::now().time_since_epoch())
-                       .count();
   static_assert(
     sizeof(InputPointType) == sizeof(autoware::point_types::PointXYZIRCAEDT),
     "PointStruct and PointXYZIRCAEDT must have the same size");
@@ -588,13 +214,14 @@ void CudaPointcloudPreprocessorNode::cudaPointcloudCallback(
   stop_watch_ptr_->toc("processing_time", true);
 
   // Make sure that the first twist is newer than the first point
-  InputPointType first_point;
-  cudaMemcpy(
-    &first_point, input_pointcloud_msg_ptr->data.get(), sizeof(InputPointType),
-    cudaMemcpyDeviceToHost);
+  sensor_msgs::PointCloud2ConstIterator<std::uint32_t> iter_stamp(
+    *input_pointcloud_msg_ptr, "time_stamp");
+
+  auto num_points = input_pointcloud_msg_ptr->width * input_pointcloud_msg_ptr->height;
+  std::uint32_t first_point_rel_stamp = num_points > 0 ? *iter_stamp : 0;
   double first_point_stamp = input_pointcloud_msg_ptr->header.stamp.sec +
                              input_pointcloud_msg_ptr->header.stamp.nanosec * 1e-9 +
-                             first_point.time_stamp * 1e-9;
+                             first_point_rel_stamp * 1e-9;
 
   while (twist_queue_.size() > 1 &&
          rclcpp::Time(twist_queue_.front().header.stamp).seconds() < first_point_stamp) {
@@ -617,42 +244,11 @@ void CudaPointcloudPreprocessorNode::cudaPointcloudCallback(
     return;
   }
 
-  double t2 = 1e-6 * std::chrono::duration_cast<std::chrono::nanoseconds>(
-                       std::chrono::high_resolution_clock::now().time_since_epoch())
-                       .count();
-
   auto output_pointcloud_ptr = cuda_pointcloud_preprocessor_->process(
-    *input_pointcloud_msg_ptr, transform_msg, twist_queue_, angular_velocity_queue_);
+    input_pointcloud_msg_ptr, transform_msg, twist_queue_, angular_velocity_queue_,
+    first_point_rel_stamp);
   output_pointcloud_ptr->header.frame_id = base_frame_;
 
-  double t3 = 1e-6 * std::chrono::duration_cast<std::chrono::nanoseconds>(
-                       std::chrono::high_resolution_clock::now().time_since_epoch())
-                       .count();
-
-  if (iteration_ >= 0) {
-    processing_time_ms_ += (t2 - t1);
-    kernel_time_ms_ += (t3 - t2);
-  }
-  iteration_++;
-
-  if (num_rings_ > 64) {
-    RCLCPP_INFO(
-      get_logger(),
-      "Organize time: %f ms, Copy time: %f ms, Process time: %f ms, Kernel time: %f ms Total time: "
-      "%f ms",
-      organize_time_ms_ / iteration_, copy_time_ms_ / iteration_, processing_time_ms_ / iteration_,
-      kernel_time_ms_ / iteration_,
-      (organize_time_ms_ + copy_time_ms_ + processing_time_ms_ + kernel_time_ms_) / iteration_);
-    RCLCPP_INFO(
-      get_logger(),
-      "\t Cuda Organize time: %f ms, Cuda Copy time: %f ms, Cuda Process time: %f ms, Cuda Kernel "
-      "time: %f ms Total time: %f ms",
-      organize_time_cuda_ms_ / iteration_, copy_time_cuda_ms_ / iteration_,
-      processing_time_ms_ / iteration_, kernel_time_ms_ / iteration_,
-      (organize_time_cuda_ms_ + copy_time_cuda_ms_ + processing_time_ms_ + kernel_time_ms_) /
-        iteration_);
-  }
-
   // Publish
   pub_->publish(std::move(output_pointcloud_ptr));
 

From c55429062aca9be4658adfb6977a454693076d29 Mon Sep 17 00:00:00 2001
From: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>
Date: Wed, 5 Feb 2025 16:36:41 +0900
Subject: [PATCH 27/38] chore: removed redundant ternay operator

Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>
---
 .../cuda_pointcloud_preprocessor.cu                       | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu
index d49f6eeccce42..ffc2216006d74 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu
@@ -334,11 +334,9 @@ __global__ void ringOutlierFilterKernel(
   const float y = left_point.y - right_point.y;
   const float z = left_point.z - right_point.z;
 
-  output_mask[j * max_points_per_ring + i] =
-    ((walk_size > num_points_threshold) ||
-     (x * x + y * y + z * z >= object_length_threshold_squared))
-      ? 1
-      : 0;
+  output_mask[j * max_points_per_ring + i] = static_cast<std::uint32_t>(
+    (walk_size > num_points_threshold) ||
+    (x * x + y * y + z * z >= object_length_threshold_squared));
 }
 
 __global__ void transformPointTypeKernel(

From d6ad9db83987f4e7f9cc4283b3787909d237562c Mon Sep 17 00:00:00 2001
From: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>
Date: Wed, 5 Feb 2025 16:52:21 +0900
Subject: [PATCH 28/38] chore: added a temporary memory check. the check will
 be unified in a later PR

Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>
---
 .../cuda_pointcloud_preprocessor/memory.hpp   | 91 +++++++++++++++++++
 .../cuda_pointcloud_preprocessor_node.cpp     |  6 ++
 2 files changed, 97 insertions(+)
 create mode 100644 sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/memory.hpp

diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/memory.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/memory.hpp
new file mode 100644
index 0000000000000..9482f170be550
--- /dev/null
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/memory.hpp
@@ -0,0 +1,91 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef AUTOWARE__CUDA_POINTCLOUD_PREPROCESSOR__MEMORY_HPP_
+#define AUTOWARE__CUDA_POINTCLOUD_PREPROCESSOR__MEMORY_HPP_
+
+#include "autoware/cuda_pointcloud_preprocessor/point_types.hpp"
+
+#include <sensor_msgs/msg/point_cloud2.hpp>
+
+#include <vector>
+
+namespace autoware::cuda_pointcloud_preprocessor
+{
+
+// TODO(knzo25): this checks already exists within the pointcloud_preprocessor. After this PR is
+// merged I will move it somewhere all classes that require checks can access it
+bool is_data_layout_compatible_with_point_xyzircaedt(
+  const std::vector<sensor_msgs::msg::PointField> & fields)
+{
+  if (fields.size() != 10) {
+    return false;
+  }
+  bool same_layout = true;
+  const auto & field_x = fields.at(0);
+  same_layout &= field_x.name == "x";
+  same_layout &= field_x.offset == offsetof(InputPointType, x);
+  same_layout &= field_x.datatype == sensor_msgs::msg::PointField::FLOAT32;
+  same_layout &= field_x.count == 1;
+  const auto & field_y = fields.at(1);
+  same_layout &= field_y.name == "y";
+  same_layout &= field_y.offset == offsetof(InputPointType, y);
+  same_layout &= field_y.datatype == sensor_msgs::msg::PointField::FLOAT32;
+  same_layout &= field_y.count == 1;
+  const auto & field_z = fields.at(2);
+  same_layout &= field_z.name == "z";
+  same_layout &= field_z.offset == offsetof(InputPointType, z);
+  same_layout &= field_z.datatype == sensor_msgs::msg::PointField::FLOAT32;
+  same_layout &= field_z.count == 1;
+  const auto & field_intensity = fields.at(3);
+  same_layout &= field_intensity.name == "intensity";
+  same_layout &= field_intensity.offset == offsetof(InputPointType, intensity);
+  same_layout &= field_intensity.datatype == sensor_msgs::msg::PointField::UINT8;
+  same_layout &= field_intensity.count == 1;
+  const auto & field_return_type = fields.at(4);
+  same_layout &= field_return_type.name == "return_type";
+  same_layout &= field_return_type.offset == offsetof(InputPointType, return_type);
+  same_layout &= field_return_type.datatype == sensor_msgs::msg::PointField::UINT8;
+  same_layout &= field_return_type.count == 1;
+  const auto & field_ring = fields.at(5);
+  same_layout &= field_ring.name == "channel";
+  same_layout &= field_ring.offset == offsetof(InputPointType, channel);
+  same_layout &= field_ring.datatype == sensor_msgs::msg::PointField::UINT16;
+  same_layout &= field_ring.count == 1;
+  const auto & field_azimuth = fields.at(6);
+  same_layout &= field_azimuth.name == "azimuth";
+  same_layout &= field_azimuth.offset == offsetof(InputPointType, azimuth);
+  same_layout &= field_azimuth.datatype == sensor_msgs::msg::PointField::FLOAT32;
+  same_layout &= field_azimuth.count == 1;
+  const auto & field_elevation = fields.at(7);
+  same_layout &= field_elevation.name == "elevation";
+  same_layout &= field_elevation.offset == offsetof(InputPointType, elevation);
+  same_layout &= field_elevation.datatype == sensor_msgs::msg::PointField::FLOAT32;
+  same_layout &= field_elevation.count == 1;
+  const auto & field_distance = fields.at(8);
+  same_layout &= field_distance.name == "distance";
+  same_layout &= field_distance.offset == offsetof(InputPointType, distance);
+  same_layout &= field_distance.datatype == sensor_msgs::msg::PointField::FLOAT32;
+  same_layout &= field_distance.count == 1;
+  const auto & field_time_stamp = fields.at(9);
+  same_layout &= field_time_stamp.name == "time_stamp";
+  same_layout &= field_time_stamp.offset == offsetof(InputPointType, time_stamp);
+  same_layout &= field_time_stamp.datatype == sensor_msgs::msg::PointField::UINT32;
+  same_layout &= field_time_stamp.count == 1;
+  return same_layout;
+}
+
+}  // namespace autoware::cuda_pointcloud_preprocessor
+
+#endif  // AUTOWARE__CUDA_POINTCLOUD_PREPROCESSOR__MEMORY_HPP_
diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp
index b2db15491f369..bf6cc1f79890c 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp
@@ -14,6 +14,7 @@
 
 #include "autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp"
 
+#include "autoware/cuda_pointcloud_preprocessor/memory.hpp"
 #include "autoware/cuda_pointcloud_preprocessor/point_types.hpp"
 
 #include <autoware/point_types/types.hpp>
@@ -207,6 +208,11 @@ void CudaPointcloudPreprocessorNode::imuCallback(
 void CudaPointcloudPreprocessorNode::pointcloudCallback(
   const sensor_msgs::msg::PointCloud2::ConstSharedPtr input_pointcloud_msg_ptr)
 {
+  if (!is_data_layout_compatible_with_point_xyzircaedt(input_pointcloud_msg_ptr->fields)) {
+    RCLCPP_ERROR(
+      get_logger(), "Input pointcloud data layout is not compatible with PointXYZIRCAEDT");
+  }
+
   static_assert(
     sizeof(InputPointType) == sizeof(autoware::point_types::PointXYZIRCAEDT),
     "PointStruct and PointXYZIRCAEDT must have the same size");

From 303b9ede65c8a748a1dd495df78ee3eda8d1b066 Mon Sep 17 00:00:00 2001
From: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>
Date: Wed, 5 Feb 2025 18:17:02 +0900
Subject: [PATCH 29/38] chore: refactored the structure to avoid large files

Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>
---
 .../CMakeLists.txt                            |   4 +
 .../common_kernels.hpp                        |  45 ++
 .../cuda_pointcloud_preprocessor.hpp          |  94 +--
 .../organize_kernels.hpp                      |  42 ++
 .../outlier_kernels.hpp                       |  32 +
 .../cuda_pointcloud_preprocessor/types.hpp    | 104 ++++
 .../undistort_kernels.hpp                     |  53 ++
 .../common_kernels.cu                         | 128 ++++
 .../cuda_pointcloud_preprocessor.cu           | 579 ++----------------
 .../organize_kernels.cu                       | 108 ++++
 .../outlier_kernels.cu                        |  86 +++
 .../undistort_kernels.cu                      | 335 ++++++++++
 12 files changed, 974 insertions(+), 636 deletions(-)
 create mode 100644 sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/common_kernels.hpp
 create mode 100644 sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/organize_kernels.hpp
 create mode 100644 sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/outlier_kernels.hpp
 create mode 100644 sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/types.hpp
 create mode 100644 sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/undistort_kernels.hpp
 create mode 100644 sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/common_kernels.cu
 create mode 100644 sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/organize_kernels.cu
 create mode 100644 sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/outlier_kernels.cu
 create mode 100644 sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/undistort_kernels.cu

diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/CMakeLists.txt b/sensing/autoware_cuda_pointcloud_preprocessor/CMakeLists.txt
index 8abff042d6a3a..4a4fc505862bb 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/CMakeLists.txt
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/CMakeLists.txt
@@ -36,6 +36,10 @@ list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr -diag-suppress 20012") # c
 
 cuda_add_library(cuda_pointcloud_preprocessor_lib SHARED
   src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu
+  src/cuda_pointcloud_preprocessor/common_kernels.cu
+  src/cuda_pointcloud_preprocessor/organize_kernels.cu
+  src/cuda_pointcloud_preprocessor/outlier_kernels.cu
+  src/cuda_pointcloud_preprocessor/undistort_kernels.cu
 )
 
 target_link_libraries(cuda_pointcloud_preprocessor_lib
diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/common_kernels.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/common_kernels.hpp
new file mode 100644
index 0000000000000..0e21e933569ee
--- /dev/null
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/common_kernels.hpp
@@ -0,0 +1,45 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef AUTOWARE__CUDA_POINTCLOUD_PREPROCESSOR__COMMON_KERNELS_HPP_
+#define AUTOWARE__CUDA_POINTCLOUD_PREPROCESSOR__COMMON_KERNELS_HPP_
+
+#include "autoware/cuda_pointcloud_preprocessor/point_types.hpp"
+#include "autoware/cuda_pointcloud_preprocessor/types.hpp"
+
+#include <cuda_runtime.h>
+
+namespace autoware::cuda_pointcloud_preprocessor
+{
+void transformPointsLaunch(
+  const InputPointType * input_points, InputPointType * output_points, int num_points,
+  TransformStruct transform, int threads_per_block, int blocks_per_grid, cudaStream_t & stream);
+
+void cropBoxLaunch(
+  InputPointType * d_points, std::uint32_t * output_mask, int num_points,
+  const CropBoxParameters * crop_box_parameters_ptr, int num_crop_boxes, int threads_per_block,
+  int blocks_per_grid, cudaStream_t & stream);
+
+void combineMasksLaunch(
+  const std::uint32_t * mask1, const std::uint32_t * mask2, int num_points,
+  std::uint32_t * output_mask, int threads_per_block, int blocks_per_grid, cudaStream_t & stream);
+
+void extractPointsLaunch(
+  InputPointType * input_points, std::uint32_t * masks, std::uint32_t * indices, int num_points,
+  OutputPointType * output_points, int threads_per_block, int blocks_per_grid,
+  cudaStream_t & stream);
+
+}  // namespace autoware::cuda_pointcloud_preprocessor
+
+#endif  // AUTOWARE__CUDA_POINTCLOUD_PREPROCESSOR__COMMON_KERNELS_HPP_
diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp
index 7c4c0113d3618..bf19949ddc795 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp
@@ -16,6 +16,7 @@
 #define AUTOWARE__CUDA_POINTCLOUD_PREPROCESSOR__CUDA_POINTCLOUD_PREPROCESSOR_HPP_
 
 #include "autoware/cuda_pointcloud_preprocessor/point_types.hpp"
+#include "autoware/cuda_pointcloud_preprocessor/types.hpp"
 
 #include <Eigen/Core>
 #include <Eigen/Geometry>
@@ -28,88 +29,12 @@
 
 #include <thrust/device_vector.h>
 
-#include <cmath>
 #include <cstdint>
 #include <deque>
 
 namespace autoware::cuda_pointcloud_preprocessor
 {
 
-struct TwistStruct2D
-{
-  float cum_x;
-  float cum_y;
-  float cum_theta;
-  float cum_cos_theta;
-  float cum_sin_theta;
-  std::uint32_t last_stamp_nsec;  // relative to the start of the pointcloud
-  std::uint32_t stamp_nsec;       // relative to the start of the pointcloud
-  float v_x;
-  float v_theta;
-};
-
-struct TwistStruct3D
-{
-  float cum_transform_buffer[16];
-  std::uint32_t last_stamp_nsec;  // relative to the start of the pointcloud
-  std::uint32_t stamp_nsec;       // relative to the start of the pointcloud
-  float v[3];
-  float w[3];
-};
-
-struct TransformStruct
-{
-  float x;
-  float y;
-  float z;
-  float m11;
-  float m12;
-  float m13;
-  float m21;
-  float m22;
-  float m23;
-  float m31;
-  float m32;
-  float m33;
-};
-
-struct CropBoxParameters
-{
-  float min_x;
-  float max_x;
-  float min_y;
-  float max_y;
-  float min_z;
-  float max_z;
-};
-
-struct RingOutlierFilterParameters
-{
-  float distance_ratio{std::nanf("")};
-  float object_length_threshold{std::nanf("")};
-  std::size_t num_points_threshold{0};
-};
-
-template <typename T>
-class MemoryPoolAllocator
-{
-public:
-  using value_type = T;
-  MemoryPoolAllocator(cudaMemPool_t pool) : m_pool(pool) {}
-
-  T * allocate(std::size_t n)
-  {
-    void * ptr = nullptr;
-    cudaMallocFromPoolAsync(&ptr, n * sizeof(T), m_pool, cudaStreamDefault);
-    return static_cast<T *>(ptr);
-  }
-
-  void deallocate(T * ptr, std::size_t) { cudaFreeAsync(ptr, cudaStreamDefault); }
-
-protected:
-  cudaMemPool_t m_pool;
-};
-
 class CudaPointcloudPreprocessor
 {
 public:
@@ -130,20 +55,6 @@ class CudaPointcloudPreprocessor
     const std::uint32_t first_point_rel_stamp_nsec);
 
 private:
-  void setupTwist2DStructs(
-    const std::deque<geometry_msgs::msg::TwistWithCovarianceStamped> & twist_queue,
-    const std::deque<geometry_msgs::msg::Vector3Stamped> & angular_velocity_queue,
-    const std::uint64_t pointcloud_stamp_nsec, const std::uint32_t first_point_rel_stamp_nsec);
-
-  void setupTwist3DStructs(
-    const std::deque<geometry_msgs::msg::TwistWithCovarianceStamped> & twist_queue,
-    const std::deque<geometry_msgs::msg::Vector3Stamped> & angular_velocity_queue,
-    const std::uint64_t pointcloud_stamp_nsec, const std::uint32_t first_point_rel_stamp_nsec);
-
-  std::size_t querySortWorkspace(
-    int num_items, int num_segments, int * offsets_device, std::uint32_t * keys_in_device,
-    std::uint32_t * keys_out_device);
-
   void organizePointcloud();
 
   CropBoxParameters self_crop_box_parameters_{};
@@ -185,11 +96,8 @@ class CudaPointcloudPreprocessor
   thrust::device_vector<std::uint32_t> device_indices_{};
   thrust::device_vector<TwistStruct2D> device_twist_2d_structs_{};
   thrust::device_vector<TwistStruct3D> device_twist_3d_structs_{};
-
   thrust::device_vector<CropBoxParameters> host_crop_box_structs_{};
   thrust::device_vector<CropBoxParameters> device_crop_box_structs_{};
-  std::vector<TwistStruct2D> host_twist_2d_structs_;
-  std::vector<TwistStruct3D> host_twist_3d_structs_;
 };
 
 }  // namespace autoware::cuda_pointcloud_preprocessor
diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/organize_kernels.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/organize_kernels.hpp
new file mode 100644
index 0000000000000..693a001045d3c
--- /dev/null
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/organize_kernels.hpp
@@ -0,0 +1,42 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef AUTOWARE__CUDA_POINTCLOUD_PREPROCESSOR__ORGANIZE_KERNELS_HPP_
+#define AUTOWARE__CUDA_POINTCLOUD_PREPROCESSOR__ORGANIZE_KERNELS_HPP_
+
+#include "autoware/cuda_pointcloud_preprocessor/point_types.hpp"
+#include "autoware/cuda_pointcloud_preprocessor/types.hpp"
+
+#include <cuda_runtime.h>
+
+namespace autoware::cuda_pointcloud_preprocessor
+{
+
+std::size_t querySortWorkspace(
+  int num_items, int num_segments, int * offsets_device, std::uint32_t * keys_in_device,
+  std::uint32_t * keys_out_device);
+
+void organizeLaunch(
+  const InputPointType * input_points, std::uint32_t * index_tensor, std::int32_t * ring_indexes,
+  std::int32_t initial_max_rings, std::int32_t * output_max_rings,
+  std::int32_t initial_max_points_per_ring, std::int32_t * output_max_points_per_ring,
+  int num_points, int threads_per_block, int blocks_per_grid, cudaStream_t & stream);
+
+void gatherLaunch(
+  const InputPointType * input_points, const std::uint32_t * index_tensor,
+  InputPointType * output_points, int num_rings, int max_points_per_ring, int threads_per_block,
+  int blocks_per_grid, cudaStream_t & stream);
+}  // namespace autoware::cuda_pointcloud_preprocessor
+
+#endif  // AUTOWARE__CUDA_POINTCLOUD_PREPROCESSOR__ORGANIZE_KERNELS_HPP_
diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/outlier_kernels.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/outlier_kernels.hpp
new file mode 100644
index 0000000000000..2010127afdeb5
--- /dev/null
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/outlier_kernels.hpp
@@ -0,0 +1,32 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef AUTOWARE__CUDA_POINTCLOUD_PREPROCESSOR__OUTLIER_KERNELS_HPP_
+#define AUTOWARE__CUDA_POINTCLOUD_PREPROCESSOR__OUTLIER_KERNELS_HPP_
+
+#include "autoware/cuda_pointcloud_preprocessor/point_types.hpp"
+#include "autoware/cuda_pointcloud_preprocessor/types.hpp"
+
+#include <cuda_runtime.h>
+
+namespace autoware::cuda_pointcloud_preprocessor
+{
+void ringOutlierFilterLaunch(
+  const InputPointType * points, std::uint32_t * output_mask, int num_rings,
+  int max_points_per_ring, float distance_ratio, float object_length_threshold_squared,
+  int num_points_threshold, int threads_per_block, int blocks_per_grid, cudaStream_t & stream);
+
+}  // namespace autoware::cuda_pointcloud_preprocessor
+
+#endif  // AUTOWARE__CUDA_POINTCLOUD_PREPROCESSOR__OUTLIER_KERNELS_HPP_
diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/types.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/types.hpp
new file mode 100644
index 0000000000000..4cb50c8c28e38
--- /dev/null
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/types.hpp
@@ -0,0 +1,104 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef AUTOWARE__CUDA_POINTCLOUD_PREPROCESSOR__TYPES_HPP_
+#define AUTOWARE__CUDA_POINTCLOUD_PREPROCESSOR__TYPES_HPP_
+
+#include <cuda_runtime.h>
+
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+
+namespace autoware::cuda_pointcloud_preprocessor
+{
+
+struct TwistStruct2D
+{
+  float cum_x;
+  float cum_y;
+  float cum_theta;
+  float cum_cos_theta;
+  float cum_sin_theta;
+  std::uint32_t last_stamp_nsec;  // relative to the start of the pointcloud
+  std::uint32_t stamp_nsec;       // relative to the start of the pointcloud
+  float v_x;
+  float v_theta;
+};
+
+struct TwistStruct3D
+{
+  float cum_transform_buffer[16];
+  std::uint32_t last_stamp_nsec;  // relative to the start of the pointcloud
+  std::uint32_t stamp_nsec;       // relative to the start of the pointcloud
+  float v[3];
+  float w[3];
+};
+
+struct TransformStruct
+{
+  float x;
+  float y;
+  float z;
+  float m11;
+  float m12;
+  float m13;
+  float m21;
+  float m22;
+  float m23;
+  float m31;
+  float m32;
+  float m33;
+};
+
+struct CropBoxParameters
+{
+  float min_x;
+  float max_x;
+  float min_y;
+  float max_y;
+  float min_z;
+  float max_z;
+};
+
+struct RingOutlierFilterParameters
+{
+  float distance_ratio{std::nanf("")};
+  float object_length_threshold{std::nanf("")};
+  std::size_t num_points_threshold{0};
+};
+
+template <typename T>
+class MemoryPoolAllocator
+{
+public:
+  using value_type = T;
+  MemoryPoolAllocator(cudaMemPool_t pool) : m_pool(pool) {}
+
+  T * allocate(std::size_t n)
+  {
+    void * ptr = nullptr;
+    cudaMallocFromPoolAsync(&ptr, n * sizeof(T), m_pool, cudaStreamDefault);
+    return static_cast<T *>(ptr);
+  }
+
+  void deallocate(T * ptr, std::size_t) { cudaFreeAsync(ptr, cudaStreamDefault); }
+
+protected:
+  cudaMemPool_t m_pool;
+};
+
+}  // namespace autoware::cuda_pointcloud_preprocessor
+
+#endif  // AUTOWARE__CUDA_POINTCLOUD_PREPROCESSOR__TYPES_HPP_
diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/undistort_kernels.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/undistort_kernels.hpp
new file mode 100644
index 0000000000000..4a2f91af12953
--- /dev/null
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/undistort_kernels.hpp
@@ -0,0 +1,53 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef AUTOWARE__CUDA_POINTCLOUD_PREPROCESSOR__UNDISTORT_KERNELS_HPP_
+#define AUTOWARE__CUDA_POINTCLOUD_PREPROCESSOR__UNDISTORT_KERNELS_HPP_
+
+#include "autoware/cuda_pointcloud_preprocessor/point_types.hpp"
+#include "autoware/cuda_pointcloud_preprocessor/types.hpp"
+
+#include <geometry_msgs/msg/twist_with_covariance_stamped.hpp>
+#include <geometry_msgs/msg/vector3_stamped.hpp>
+
+#include <cuda_runtime.h>
+#include <thrust/device_vector.h>
+
+#include <deque>
+
+namespace autoware::cuda_pointcloud_preprocessor
+{
+void undistort2dLaunch(
+  InputPointType * input_points, int num_points, TwistStruct2D * twist_structs, int num_twists,
+  int threads_per_block, int blocks_per_grid, cudaStream_t & stream);
+
+void undistort3dLaunch(
+  InputPointType * input_points, int num_points, TwistStruct3D * twist_structs, int num_twists,
+  int threads_per_block, int blocks_per_grid, cudaStream_t & stream);
+
+void setupTwist2DStructs(
+  const std::deque<geometry_msgs::msg::TwistWithCovarianceStamped> & twist_queue,
+  const std::deque<geometry_msgs::msg::Vector3Stamped> & angular_velocity_queue,
+  const std::uint64_t pointcloud_stamp_nsec, const std::uint32_t first_point_rel_stamp_nsec,
+  thrust::device_vector<TwistStruct2D> & device_twist_2d_structs, cudaStream_t & stream);
+
+void setupTwist3DStructs(
+  const std::deque<geometry_msgs::msg::TwistWithCovarianceStamped> & twist_queue,
+  const std::deque<geometry_msgs::msg::Vector3Stamped> & angular_velocity_queue,
+  const std::uint64_t pointcloud_stamp_nsec, const std::uint32_t first_point_rel_stamp_nsec,
+  thrust::device_vector<TwistStruct3D> & device_twist_3d_structs, cudaStream_t & stream);
+
+}  // namespace autoware::cuda_pointcloud_preprocessor
+
+#endif  // AUTOWARE__CUDA_POINTCLOUD_PREPROCESSOR__UNDISTORT_KERNELS_HPP_
diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/common_kernels.cu b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/common_kernels.cu
new file mode 100644
index 0000000000000..64ed3ca8ab72f
--- /dev/null
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/common_kernels.cu
@@ -0,0 +1,128 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "autoware/cuda_pointcloud_preprocessor/common_kernels.hpp"
+#include "autoware/cuda_pointcloud_preprocessor/point_types.hpp"
+#include "autoware/cuda_pointcloud_preprocessor/types.hpp"
+
+namespace autoware::cuda_pointcloud_preprocessor
+{
+__global__ void transformPointsKernel(
+  const InputPointType * __restrict__ input_points, InputPointType * output_points, int num_points,
+  TransformStruct transform)
+{
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < num_points) {
+    output_points[idx] = input_points[idx];
+
+    const float x = input_points[idx].x;
+    const float y = input_points[idx].y;
+    const float z = input_points[idx].z;
+
+    output_points[idx].x = transform.m11 * x + transform.m12 * y + transform.m13 * z + transform.x;
+    output_points[idx].y = transform.m21 * x + transform.m22 * y + transform.m23 * z + transform.y;
+    output_points[idx].z = transform.m31 * x + transform.m32 * y + transform.m33 * z + transform.z;
+  }
+}
+
+__global__ void cropBoxKernel(
+  InputPointType * __restrict__ d_points, std::uint32_t * __restrict__ output_mask, int num_points,
+  const CropBoxParameters * __restrict__ crop_box_parameters_ptr, int num_crop_boxes)
+{
+  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num_points;
+       idx += blockDim.x * gridDim.x) {
+    const float x = d_points[idx].x;
+    const float y = d_points[idx].y;
+    const float z = d_points[idx].z;
+
+    std::uint32_t mask = 1;
+
+    for (int i = 0; i < num_crop_boxes; i++) {
+      const CropBoxParameters & crop_box_parameters = crop_box_parameters_ptr[i];
+      const float & min_x = crop_box_parameters.min_x;
+      const float & min_y = crop_box_parameters.min_y;
+      const float & min_z = crop_box_parameters.min_z;
+      const float & max_x = crop_box_parameters.max_x;
+      const float & max_y = crop_box_parameters.max_y;
+      const float & max_z = crop_box_parameters.max_z;
+      mask &=
+        (x <= min_x || x >= max_x) || (y <= min_y || y >= max_y) || (z <= min_z || z >= max_z);
+    }
+
+    output_mask[idx] = mask;
+  }
+}
+
+__global__ void combineMasksKernel(
+  const std::uint32_t * __restrict__ mask1, const std::uint32_t * __restrict__ mask2,
+  int num_points, std::uint32_t * __restrict__ output_mask)
+{
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < num_points) {
+    output_mask[idx] = mask1[idx] & mask2[idx];
+  }
+}
+
+__global__ void extractPointsKernel(
+  InputPointType * input_points, std::uint32_t * masks, std::uint32_t * indices, int num_points,
+  OutputPointType * output_points)
+{
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < num_points && masks[idx] == 1) {
+    InputPointType & input_point = input_points[idx];
+    OutputPointType & output_point = output_points[indices[idx] - 1];
+    output_point.x = input_point.x;
+    output_point.y = input_point.y;
+    output_point.z = input_point.z;
+    output_point.intensity = input_point.intensity;
+    output_point.return_type = input_point.return_type;
+    output_point.channel = input_point.channel;
+  }
+}
+
+void transformPointsLaunch(
+  const InputPointType * input_points, InputPointType * output_points, int num_points,
+  TransformStruct transform, int threads_per_block, int blocks_per_grid, cudaStream_t & stream)
+{
+  transformPointsKernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(
+    input_points, output_points, num_points, transform);
+}
+
+void cropBoxLaunch(
+  InputPointType * d_points, std::uint32_t * output_mask, int num_points,
+  const CropBoxParameters * crop_box_parameters_ptr, int num_crop_boxes, int threads_per_block,
+  int blocks_per_grid, cudaStream_t & stream)
+{
+  cropBoxKernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(
+    d_points, output_mask, num_points, crop_box_parameters_ptr, num_crop_boxes);
+}
+
+void combineMasksLaunch(
+  const std::uint32_t * mask1, const std::uint32_t * mask2, int num_points,
+  std::uint32_t * output_mask, int threads_per_block, int blocks_per_grid, cudaStream_t & stream)
+{
+  combineMasksKernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(
+    mask1, mask2, num_points, output_mask);
+}
+
+void extractPointsLaunch(
+  InputPointType * input_points, std::uint32_t * masks, std::uint32_t * indices, int num_points,
+  OutputPointType * output_points, int threads_per_block, int blocks_per_grid,
+  cudaStream_t & stream)
+{
+  extractPointsKernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(
+    input_points, masks, indices, num_points, output_points);
+}
+
+}  // namespace autoware::cuda_pointcloud_preprocessor
diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu
index ffc2216006d74..aac9f560a78ca 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu
@@ -12,8 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "autoware/cuda_pointcloud_preprocessor/common_kernels.hpp"
 #include "autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp"
+#include "autoware/cuda_pointcloud_preprocessor/organize_kernels.hpp"
+#include "autoware/cuda_pointcloud_preprocessor/outlier_kernels.hpp"
 #include "autoware/cuda_pointcloud_preprocessor/point_types.hpp"
+#include "autoware/cuda_pointcloud_preprocessor/types.hpp"
+#include "autoware/cuda_pointcloud_preprocessor/undistort_kernels.hpp"
 
 #include <Eigen/Core>
 #include <Eigen/Dense>
@@ -29,331 +34,6 @@
 namespace autoware::cuda_pointcloud_preprocessor
 {
 
-__host__ __device__ Eigen::Matrix3f skewSymmetric(const Eigen::Vector3f & v)
-{
-  Eigen::Matrix3f m;
-  m << 0, -v.z(), v.y(), v.z(), 0, -v.x(), -v.y(), v.x(), 0;
-  return m;
-}
-
-__host__ __device__ Eigen::Matrix3f leftJacobianSO3(const Eigen::Vector3f & omega)
-{
-  double theta = omega.norm();
-  if (std::abs(theta) < 1e-6) {
-    return Eigen::Matrix3f::Identity();
-  }
-
-  Eigen::Matrix3f Omega = skewSymmetric(omega);
-
-  Eigen::Matrix3f Omega2 = Omega * Omega;
-  double theta2 = theta * theta;
-  double theta3 = theta2 * theta;
-
-  // Rodrigues' formula for Jacobian
-  return Eigen::Matrix3f::Identity() + ((1 - cos(theta)) / theta2) * Omega +
-         ((theta - sin(theta)) / theta3) * Omega2;
-}
-
-__host__ __device__ Eigen::Matrix4f transformationMatrixFromVelocity(
-  const Eigen::Vector3f & linear_velocity, const Eigen::Vector3f & angular_velocity, double dt)
-{
-  Eigen::Matrix3f R = Eigen::AngleAxisf(angular_velocity.norm() * dt, angular_velocity.normalized())
-                        .toRotationMatrix();
-  Eigen::Matrix3f J = leftJacobianSO3(angular_velocity * dt);
-
-  Eigen::Vector3f translation = J * (linear_velocity * dt);
-
-  Eigen::Matrix4f transformation = Eigen::Matrix4f::Identity();
-  transformation.block<3, 3>(0, 0) = R;
-  transformation.block<3, 1>(0, 3) = translation;
-
-  return transformation;
-}
-
-__global__ void organizeKernel(
-  const InputPointType * __restrict__ input_points, std::uint32_t * index_tensor,
-  std::int32_t * ring_indexes, std::int32_t initial_max_rings, std::int32_t * output_max_rings,
-  std::int32_t initial_max_points_per_ring, std::int32_t * output_max_points_per_ring,
-  int num_points)
-{
-  int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx >= num_points) {
-    return;
-  }
-
-  auto ring = input_points[idx].channel;
-
-  if (ring >= initial_max_rings) {
-    atomicMax(output_max_rings, ring);
-    return;
-  }
-
-  int next_offset = atomicAdd(&ring_indexes[ring], 1);
-
-  if (next_offset >= initial_max_points_per_ring) {
-    atomicMax(output_max_points_per_ring, next_offset);
-    return;
-  }
-
-  index_tensor[ring * initial_max_points_per_ring + next_offset] = idx;
-}
-
-__global__ void gatherKernel(
-  const InputPointType * __restrict__ input_points, const std::uint32_t * __restrict__ index_tensor,
-  InputPointType * __restrict__ output_points, int num_rings, int max_points_per_ring)
-{
-  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx >= num_rings * max_points_per_ring) {
-    return;
-  }
-
-  const int ring = idx / max_points_per_ring;
-  const int point = idx % max_points_per_ring;
-
-  const std::uint32_t input_idx = index_tensor[ring * max_points_per_ring + point];
-
-  if (input_idx < std::numeric_limits<std::uint32_t>::max()) {
-    output_points[ring * max_points_per_ring + point] = input_points[input_idx];
-  } else {
-    output_points[ring * max_points_per_ring + point].distance = 0.0f;
-  }
-}
-
-__global__ void transformPointsKernel(
-  const InputPointType * __restrict__ input_points, InputPointType * output_points, int num_points,
-  TransformStruct transform)
-{
-  int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx < num_points) {
-    output_points[idx] = input_points[idx];
-
-    const float x = input_points[idx].x;
-    const float y = input_points[idx].y;
-    const float z = input_points[idx].z;
-
-    output_points[idx].x = transform.m11 * x + transform.m12 * y + transform.m13 * z + transform.x;
-    output_points[idx].y = transform.m21 * x + transform.m22 * y + transform.m23 * z + transform.y;
-    output_points[idx].z = transform.m31 * x + transform.m32 * y + transform.m33 * z + transform.z;
-  }
-}
-
-__global__ void cropBoxKernel(
-  InputPointType * __restrict__ d_points, std::uint32_t * __restrict__ output_mask, int num_points,
-  const CropBoxParameters * __restrict__ crop_box_parameters_ptr, int num_crop_boxes)
-{
-  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num_points;
-       idx += blockDim.x * gridDim.x) {
-    const float x = d_points[idx].x;
-    const float y = d_points[idx].y;
-    const float z = d_points[idx].z;
-
-    std::uint32_t mask = 1;
-
-    for (int i = 0; i < num_crop_boxes; i++) {
-      const CropBoxParameters & crop_box_parameters = crop_box_parameters_ptr[i];
-      const float & min_x = crop_box_parameters.min_x;
-      const float & min_y = crop_box_parameters.min_y;
-      const float & min_z = crop_box_parameters.min_z;
-      const float & max_x = crop_box_parameters.max_x;
-      const float & max_y = crop_box_parameters.max_y;
-      const float & max_z = crop_box_parameters.max_z;
-      mask &=
-        (x <= min_x || x >= max_x) || (y <= min_y || y >= max_y) || (z <= min_z || z >= max_z);
-    }
-
-    output_mask[idx] = mask;
-  }
-}
-
-__global__ void combineMasksKernel(
-  const std::uint32_t * __restrict__ mask1, const std::uint32_t * __restrict__ mask2,
-  int num_points, std::uint32_t * __restrict__ output_mask)
-{
-  int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx < num_points) {
-    output_mask[idx] = mask1[idx] & mask2[idx];
-  }
-}
-
-__global__ void extractInputPointIndicesKernel(
-  InputPointType * input_points, std::uint32_t * masks, std::uint32_t * indices, int num_points,
-  InputPointType * output_points)
-{
-  int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx < num_points && masks[idx] == 1) {
-    output_points[indices[idx] - 1] = input_points[idx];
-  }
-}
-
-__global__ void extractOutputPointIndicesKernel(
-  OutputPointType * input_points, std::uint32_t * masks, std::uint32_t * indices, int num_points,
-  OutputPointType * output_points)
-{
-  int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx < num_points && masks[idx] == 1) {
-    output_points[indices[idx] - 1] = input_points[idx];
-  }
-}
-
-__global__ void extractInputPointsToOutputPoints_indicesKernel(
-  InputPointType * input_points, std::uint32_t * masks, std::uint32_t * indices, int num_points,
-  OutputPointType * output_points)
-{
-  int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx < num_points && masks[idx] == 1) {
-    InputPointType & input_point = input_points[idx];
-    OutputPointType & output_point = output_points[indices[idx] - 1];
-    output_point.x = input_point.x;
-    output_point.y = input_point.y;
-    output_point.z = input_point.z;
-    output_point.intensity = input_point.intensity;
-    output_point.return_type = input_point.return_type;
-    output_point.channel = input_point.channel;
-  }
-}
-
-__global__ void undistort2dKernel(
-  InputPointType * input_points, int num_points, TwistStruct2D * twist_structs, int num_twists)
-{
-  int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx < num_points) {
-    InputPointType & point = input_points[idx];
-
-    // The twist must always be newer than the point ! (or it was the last twist)
-    int twist_index = 0;
-    while (twist_index < num_twists && twist_structs[twist_index].stamp_nsec < point.time_stamp) {
-      twist_index++;
-    }
-
-    twist_index = min(twist_index, num_twists - 1);
-
-    TwistStruct2D twist = twist_structs[twist_index];
-    float x = twist.cum_x;
-    float y = twist.cum_y;
-    float theta = twist.cum_theta;
-
-    double dt_nsec =
-      point.time_stamp > twist.last_stamp_nsec ? point.time_stamp - twist.last_stamp_nsec : 0;
-    double dt = 1e-9 * (dt_nsec);
-
-    theta += twist.v_theta * dt;
-    float d = twist.v_x * dt;
-    x += d * cos(theta);
-    y += d * sin(theta);
-
-    float distorted_x = point.x;
-    float distorted_y = point.y;
-
-    point.x = distorted_x * cos(theta) - distorted_y * sin(theta) + x;
-    point.y = distorted_x * sin(theta) + distorted_y * cos(theta) + y;
-  }
-}
-
-__global__ void undistort3dKernel(
-  InputPointType * input_points, int num_points, TwistStruct3D * twist_structs, int num_twists)
-{
-  int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx < num_points) {
-    InputPointType & point = input_points[idx];
-
-    // The twist must always be newer than the point ! (or it was the last twist)
-    int twist_index = 0;
-    while (twist_index < num_twists && twist_structs[twist_index].stamp_nsec < point.time_stamp) {
-      twist_index++;
-    }
-
-    twist_index = min(twist_index, num_twists - 1);
-
-    TwistStruct3D twist = twist_structs[twist_index];
-    Eigen::Map<Eigen::Matrix4f> cum_transform_buffer_map(twist.cum_transform_buffer);
-    Eigen::Map<Eigen::Vector3f> v_map(twist.v);
-    Eigen::Map<Eigen::Vector3f> w_map(twist.w);
-
-    double dt_nsec =
-      point.time_stamp > twist.last_stamp_nsec ? point.time_stamp - twist.last_stamp_nsec : 0;
-    double dt = 1e-9 * (dt_nsec);
-
-    Eigen::Matrix4f transform =
-      cum_transform_buffer_map * transformationMatrixFromVelocity(v_map, w_map, dt);
-    Eigen::Vector3f p(point.x, point.y, point.z);
-    p = transform.block<3, 3>(0, 0) * p + transform.block<3, 1>(0, 3);
-
-    point.x = p.x();
-    point.y = p.y();
-    point.z = p.z();
-  }
-}
-
-__global__ void ringOutlierFilterKernel(
-  const InputPointType * d_points, std::uint32_t * output_mask, int num_rings,
-  int max_points_per_ring, float distance_ratio, float object_length_threshold_squared,
-  int num_points_threshold)
-{
-  int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  int j = idx / max_points_per_ring;
-  int i = idx % max_points_per_ring;
-
-  if (j >= num_rings || i >= max_points_per_ring) {
-    return;
-  }
-
-  int min_i = max(i - num_points_threshold, 0);
-  int max_i = min(i + num_points_threshold, max_points_per_ring);
-
-  int walk_size = 1;
-  int left_idx = min_i;
-  int right_idx = min_i + 1;
-
-  for (int k = min_i; k < max_i - 1; k++) {
-    const InputPointType & left_point = d_points[j * max_points_per_ring + k];
-    const InputPointType & right_point = d_points[j * max_points_per_ring + k + 1];
-
-    // Find biggest walk that passes through i
-    float azimuth_diff = right_point.azimuth - left_point.azimuth;
-    azimuth_diff = azimuth_diff < 0.f ? azimuth_diff + 2 * M_PI : azimuth_diff;
-
-    if (
-      max(left_point.distance, right_point.distance) <
-        min(left_point.distance, right_point.distance) * distance_ratio &&
-      azimuth_diff < 1.f * M_PI / 180.f) {
-      // Determined to be included in the same walk
-      walk_size++;
-      right_idx++;
-    } else if (k >= i) {
-      break;
-    } else {
-      walk_size = 1;
-      left_idx = k + 1;
-      right_idx = k + 2;  // this is safe since we break if k >= i
-    }
-  }
-
-  const InputPointType & left_point = d_points[j * max_points_per_ring + left_idx];
-  const InputPointType & right_point = d_points[j * max_points_per_ring + right_idx - 1];
-  const float x = left_point.x - right_point.x;
-  const float y = left_point.y - right_point.y;
-  const float z = left_point.z - right_point.z;
-
-  output_mask[j * max_points_per_ring + i] = static_cast<std::uint32_t>(
-    (walk_size > num_points_threshold) ||
-    (x * x + y * y + z * z >= object_length_threshold_squared));
-}
-
-__global__ void transformPointTypeKernel(
-  const InputPointType * device_input_points, int num_points, OutputPointType * device_ouput_points)
-{
-  int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx < num_points) {
-    const InputPointType & input_point = device_input_points[idx];
-    OutputPointType & output_point = device_ouput_points[idx];
-
-    output_point.x = input_point.x;
-    output_point.y = input_point.y;
-    output_point.z = input_point.z;
-    output_point.intensity = (float)input_point.intensity;
-  }
-}
-
 CudaPointcloudPreprocessor::CudaPointcloudPreprocessor()
 {
   sensor_msgs::msg::PointField x_field, y_field, z_field, intensity_field, return_type_field,
@@ -514,199 +194,6 @@ void CudaPointcloudPreprocessor::preallocateOutput()
     num_rings_ * max_points_per_ring_ * sizeof(OutputPointType));
 }
 
-void CudaPointcloudPreprocessor::setupTwist2DStructs(
-  const std::deque<geometry_msgs::msg::TwistWithCovarianceStamped> & twist_queue,
-  const std::deque<geometry_msgs::msg::Vector3Stamped> & angular_velocity_queue,
-  const std::uint64_t pointcloud_stamp_nsec, const std::uint32_t first_point_rel_stamp_nsec)
-{
-  host_twist_2d_structs_.clear();
-  host_twist_2d_structs_.reserve(twist_queue.size() + angular_velocity_queue.size());
-
-  // Twist preprocessing
-  float cum_x = 0;
-  float cum_y = 0;
-  float cum_theta = 0;
-  // All time stamps from now on are in nsec from the "beginning of the pointcloud"
-  std::uint32_t last_stamp_nsec = first_point_rel_stamp_nsec;
-
-  std::size_t twist_index = 0;
-  std::size_t angular_velocity_index = 0;
-
-  for (; twist_index < twist_queue.size() ||
-         angular_velocity_index < angular_velocity_queue.size();) {
-    std::uint64_t twist_stamp, input_twist_global_stamp_nsec, angular_velocity_global_stamp_nsec;
-    float v_x, v_theta;
-
-    if (twist_index < twist_queue.size()) {
-      input_twist_global_stamp_nsec =
-        1'000'000'000 * static_cast<std::uint64_t>(twist_queue[twist_index].header.stamp.sec) +
-        static_cast<std::uint64_t>(twist_queue[twist_index].header.stamp.nanosec);
-      v_x = twist_queue[twist_index].twist.twist.linear.x;
-    } else {
-      input_twist_global_stamp_nsec = std::numeric_limits<std::uint64_t>::max();
-      v_x = 0.0;
-    }
-
-    if (angular_velocity_index < angular_velocity_queue.size()) {
-      angular_velocity_global_stamp_nsec =
-        1'000'000'000 * static_cast<std::uint64_t>(
-                          angular_velocity_queue[angular_velocity_index].header.stamp.sec) +
-        static_cast<std::uint64_t>(
-          angular_velocity_queue[angular_velocity_index].header.stamp.nanosec);
-      v_theta = angular_velocity_queue[angular_velocity_index].vector.z;
-    } else {
-      angular_velocity_global_stamp_nsec = std::numeric_limits<std::uint64_t>::max();
-      v_theta = 0.0;
-    }
-
-    if (input_twist_global_stamp_nsec < angular_velocity_global_stamp_nsec) {
-      twist_stamp = input_twist_global_stamp_nsec;
-      twist_index++;
-    } else if (input_twist_global_stamp_nsec > angular_velocity_global_stamp_nsec) {
-      twist_stamp = angular_velocity_global_stamp_nsec;
-      angular_velocity_index++;
-    } else {
-      twist_index++;
-      angular_velocity_index++;
-    }
-
-    TwistStruct2D twist;
-    twist.cum_x = cum_x;
-    twist.cum_y = cum_y;
-    twist.cum_theta = cum_theta;
-
-    std::uint64_t twist_global_stamp_nsec = twist_stamp;
-    assert(twist_global_stamp_nsec > pointcloud_stamp_nsec);  // by construction
-    std::uint32_t twist_from_pointcloud_start_nsec =
-      twist_global_stamp_nsec - pointcloud_stamp_nsec;
-
-    twist.stamp_nsec = twist_from_pointcloud_start_nsec;
-    twist.v_x = v_x;
-    twist.v_theta = v_theta;
-    twist.last_stamp_nsec = last_stamp_nsec;
-    host_twist_2d_structs_.push_back(twist);
-
-    double dt_seconds = 1e-9 * (twist.stamp_nsec - last_stamp_nsec);
-    last_stamp_nsec = twist.stamp_nsec;
-    cum_theta += v_theta * dt_seconds;
-    float d = twist.v_x * dt_seconds;
-    cum_x += d * cos(cum_theta);
-    cum_y += d * sin(cum_theta);
-  }
-
-  // Copy to device
-  device_twist_2d_structs_.resize(host_twist_2d_structs_.size());
-  cudaMemcpyAsync(
-    thrust::raw_pointer_cast(device_twist_2d_structs_.data()), host_twist_2d_structs_.data(),
-    host_twist_2d_structs_.size() * sizeof(TwistStruct2D), cudaMemcpyHostToDevice, stream_);
-}
-
-void CudaPointcloudPreprocessor::setupTwist3DStructs(
-  const std::deque<geometry_msgs::msg::TwistWithCovarianceStamped> & twist_queue,
-  const std::deque<geometry_msgs::msg::Vector3Stamped> & angular_velocity_queue,
-  const std::uint64_t pointcloud_stamp_nsec, const std::uint32_t first_point_rel_stamp_nsec)
-{
-  // Twist preprocessing
-  host_twist_3d_structs_.clear();
-  host_twist_3d_structs_.reserve(twist_queue.size() + angular_velocity_queue.size());
-
-  Eigen::Matrix4f cum_transform = Eigen::Matrix4f::Identity();
-  Eigen::Vector3f v = Eigen::Vector3f::Zero();
-  Eigen::Vector3f w = Eigen::Vector3f::Zero();
-
-  // All time stamps from now on are in nsec from the "beginning of the pointcloud"
-  std::uint32_t last_stamp_nsec = first_point_rel_stamp_nsec;
-
-  std::size_t twist_index = 0;
-  std::size_t angular_velocity_index = 0;
-
-  for (; twist_index < twist_queue.size() ||
-         angular_velocity_index < angular_velocity_queue.size();) {
-    std::uint64_t twist_stamp, input_twist_global_stamp_nsec, angular_velocity_global_stamp_nsec;
-
-    if (twist_index < twist_queue.size()) {
-      input_twist_global_stamp_nsec =
-        1'000'000'000 * static_cast<std::uint64_t>(twist_queue[twist_index].header.stamp.sec) +
-        static_cast<std::uint64_t>(twist_queue[twist_index].header.stamp.nanosec);
-      v.x() = twist_queue[twist_index].twist.twist.linear.x;
-      v.y() = twist_queue[twist_index].twist.twist.linear.y;
-      v.z() = twist_queue[twist_index].twist.twist.linear.z;
-    } else {
-      input_twist_global_stamp_nsec = std::numeric_limits<uint64_t>::max();
-      v = Eigen::Vector3f::Zero();
-    }
-
-    if (angular_velocity_index < angular_velocity_queue.size()) {
-      angular_velocity_global_stamp_nsec =
-        1'000'000'000 * static_cast<std::uint64_t>(
-                          angular_velocity_queue[angular_velocity_index].header.stamp.sec) +
-        static_cast<std::uint64_t>(
-          angular_velocity_queue[angular_velocity_index].header.stamp.nanosec);
-      w.x() = angular_velocity_queue[angular_velocity_index].vector.x;
-      w.y() = angular_velocity_queue[angular_velocity_index].vector.y;
-      w.z() = angular_velocity_queue[angular_velocity_index].vector.z;
-    } else {
-      angular_velocity_global_stamp_nsec = std::numeric_limits<std::uint64_t>::max();
-      w = Eigen::Vector3f::Zero();
-    }
-
-    if (input_twist_global_stamp_nsec < angular_velocity_global_stamp_nsec) {
-      twist_stamp = input_twist_global_stamp_nsec;
-      twist_index++;
-    } else if (input_twist_global_stamp_nsec > angular_velocity_global_stamp_nsec) {
-      twist_stamp = angular_velocity_global_stamp_nsec;
-      angular_velocity_index++;
-    } else {
-      twist_index++;
-      angular_velocity_index++;
-    }
-
-    TwistStruct3D twist;
-
-    Eigen::Map<Eigen::Matrix4f> cum_transform_buffer_map(twist.cum_transform_buffer);
-    Eigen::Map<Eigen::Vector3f> v_map(twist.v);
-    Eigen::Map<Eigen::Vector3f> w_map(twist.w);
-    cum_transform_buffer_map = cum_transform;
-
-    std::uint64_t twist_global_stamp_nsec = twist_stamp;
-    assert(twist_global_stamp_nsec > pointcloud_stamp_nsec);  // by construction
-    std::uint32_t twist_from_pointcloud_start_nsec =
-      twist_global_stamp_nsec - pointcloud_stamp_nsec;
-
-    twist.stamp_nsec = twist_from_pointcloud_start_nsec;
-    v_map = v;
-    w_map = w;
-    twist.last_stamp_nsec = last_stamp_nsec;
-    host_twist_3d_structs_.push_back(twist);
-
-    double dt_seconds = 1e-9 * (twist.stamp_nsec - last_stamp_nsec);
-    last_stamp_nsec = twist.stamp_nsec;
-
-    auto delta_transform = transformationMatrixFromVelocity(v, w, dt_seconds);
-    cum_transform = cum_transform * delta_transform;
-  }
-
-  // Copy to device
-  device_twist_3d_structs_.resize(host_twist_3d_structs_.size());
-  cudaMemcpyAsync(
-    thrust::raw_pointer_cast(device_twist_3d_structs_.data()), host_twist_3d_structs_.data(),
-    host_twist_3d_structs_.size() * sizeof(TwistStruct3D), cudaMemcpyHostToDevice, stream_);
-}
-
-std::size_t CudaPointcloudPreprocessor::querySortWorkspace(
-  int num_items, int num_segments, int * offsets_device, std::uint32_t * keys_in_device,
-  std::uint32_t * keys_out_device)
-{
-  // Determine temporary device storage requirements
-  void * temp_storage = nullptr;
-  size_t temp_storage_bytes = 0;
-  cub::DeviceSegmentedRadixSort::SortKeys(
-    temp_storage, temp_storage_bytes, keys_in_device, keys_out_device, num_items, num_segments,
-    offsets_device, offsets_device + 1);
-
-  return temp_storage_bytes;
-}
-
 void CudaPointcloudPreprocessor::organizePointcloud()
 {
   cudaMemsetAsync(
@@ -723,12 +210,13 @@ void CudaPointcloudPreprocessor::organizePointcloud()
   const int raw_points_blocks_per_grid =
     (num_raw_points_ + threads_per_block_ - 1) / threads_per_block_;
 
-  organizeKernel<<<raw_points_blocks_per_grid, threads_per_block_, 0, stream_>>>(
+  organizeLaunch(
     thrust::raw_pointer_cast(device_input_points_.data()),
     thrust::raw_pointer_cast(device_indexes_tensor_.data()),
     thrust::raw_pointer_cast(device_ring_index_.data()), num_rings_,
     thrust::raw_pointer_cast(device_max_ring_.data()), max_points_per_ring_,
-    thrust::raw_pointer_cast(device_max_points_per_ring_.data()), num_raw_points_);
+    thrust::raw_pointer_cast(device_max_points_per_ring_.data()), num_raw_points_,
+    threads_per_block_, raw_points_blocks_per_grid, stream_);
 
   std::int32_t max_ring_value;
   std::int32_t max_points_per_ring;
@@ -781,12 +269,13 @@ void CudaPointcloudPreprocessor::organizePointcloud()
       thrust::raw_pointer_cast(device_sorted_indexes_tensor_.data()));
     device_sort_workspace_.resize(sort_workspace_bytes_);
 
-    organizeKernel<<<raw_points_blocks_per_grid, threads_per_block_, 0, stream_>>>(
+    organizeLaunch(
       thrust::raw_pointer_cast(device_input_points_.data()),
       thrust::raw_pointer_cast(device_indexes_tensor_.data()),
       thrust::raw_pointer_cast(device_ring_index_.data()), num_rings_,
       thrust::raw_pointer_cast(device_max_ring_.data()), max_points_per_ring_,
-      thrust::raw_pointer_cast(device_max_points_per_ring_.data()), num_raw_points_);
+      thrust::raw_pointer_cast(device_max_points_per_ring_.data()), num_raw_points_,
+      threads_per_block_, raw_points_blocks_per_grid, stream_);
   }
 
   if (num_organized_points_ == num_rings_) {
@@ -806,10 +295,12 @@ void CudaPointcloudPreprocessor::organizePointcloud()
 
   const int organized_points_blocks_per_grid =
     (num_organized_points_ + threads_per_block_ - 1) / threads_per_block_;
-  gatherKernel<<<organized_points_blocks_per_grid, threads_per_block_, 0, stream_>>>(
+
+  gatherLaunch(
     thrust::raw_pointer_cast(device_input_points_.data()),
     thrust::raw_pointer_cast(device_sorted_indexes_tensor_.data()),
-    thrust::raw_pointer_cast(device_organized_points_.data()), num_rings_, max_points_per_ring_);
+    thrust::raw_pointer_cast(device_organized_points_.data()), num_rings_, max_points_per_ring_,
+    threads_per_block_, organized_points_blocks_per_grid, stream_);
 }
 
 std::unique_ptr<cuda_blackboard::CudaPointCloud2> CudaPointcloudPreprocessor::process(
@@ -862,14 +353,14 @@ std::unique_ptr<cuda_blackboard::CudaPointCloud2> CudaPointcloudPreprocessor::pr
 
   if (use_3d_undistortion_) {
     setupTwist3DStructs(
-      twist_queue, angular_velocity_queue, pointcloud_stamp_nsec, first_point_rel_stamp_nsec);
+      twist_queue, angular_velocity_queue, pointcloud_stamp_nsec, first_point_rel_stamp_nsec,
+      device_twist_3d_structs_, stream_);
   } else {
     setupTwist2DStructs(
-      twist_queue, angular_velocity_queue, pointcloud_stamp_nsec, first_point_rel_stamp_nsec);
+      twist_queue, angular_velocity_queue, pointcloud_stamp_nsec, first_point_rel_stamp_nsec,
+      device_twist_2d_structs_, stream_);
   }
 
-  // check_error(frame_id, "After setup");
-
   // Obtain raw pointers for the kernels
   TwistStruct2D * device_twist_2d_structs =
     thrust::raw_pointer_cast(device_twist_2d_structs_.data());
@@ -884,38 +375,40 @@ std::unique_ptr<cuda_blackboard::CudaPointCloud2> CudaPointcloudPreprocessor::pr
 
   const int blocks_per_grid = (num_organized_points_ + threads_per_block_ - 1) / threads_per_block_;
 
-  transformPointsKernel<<<blocks_per_grid, threads_per_block_, 0, stream_>>>(
+  transformPointsLaunch(
     thrust::raw_pointer_cast(device_organized_points_.data()), device_transformed_points,
-    num_organized_points_, transform_struct);
+    num_organized_points_, transform_struct, threads_per_block_, blocks_per_grid, stream_);
 
   int crop_box_blocks_per_grid = std::min(blocks_per_grid, max_blocks_per_grid_);
   if (host_crop_box_structs_.size() > 0) {
-    cropBoxKernel<<<crop_box_blocks_per_grid, threads_per_block_, 0, stream_>>>(
+    cropBoxLaunch(
       device_transformed_points, device_crop_mask, num_organized_points_,
-      thrust::raw_pointer_cast(device_crop_box_structs_.data()), host_crop_box_structs_.size());
+      thrust::raw_pointer_cast(device_crop_box_structs_.data()), host_crop_box_structs_.size(),
+      crop_box_blocks_per_grid, threads_per_block_, stream_);
   } else {
     thrust::fill(thrust::device, device_crop_mask, device_crop_mask + num_organized_points_, 1);
   }
 
   if (use_3d_undistortion_ && device_twist_3d_structs_.size() > 0) {
-    undistort3dKernel<<<blocks_per_grid, threads_per_block_, 0, stream_>>>(
+    undistort3dLaunch(
       device_transformed_points, num_organized_points_, device_twist_3d_structs,
-      device_twist_3d_structs_.size());
+      device_twist_3d_structs_.size(), threads_per_block_, blocks_per_grid, stream_);
   } else if (!use_3d_undistortion_ && device_twist_2d_structs_.size() > 0) {
-    undistort2dKernel<<<blocks_per_grid, threads_per_block_, 0, stream_>>>(
+    undistort2dLaunch(
       device_transformed_points, num_organized_points_, device_twist_2d_structs,
-      device_twist_2d_structs_.size());
+      device_twist_2d_structs_.size(), threads_per_block_, blocks_per_grid, stream_);
   }
 
-  ringOutlierFilterKernel<<<blocks_per_grid, threads_per_block_, 0, stream_>>>(
+  ringOutlierFilterLaunch(
     device_transformed_points, device_ring_outlier_mask, num_rings_, max_points_per_ring_,
     ring_outlier_parameters_.distance_ratio,
     ring_outlier_parameters_.object_length_threshold *
       ring_outlier_parameters_.object_length_threshold,
-    ring_outlier_parameters_.num_points_threshold);
+    ring_outlier_parameters_.num_points_threshold, threads_per_block_, blocks_per_grid, stream_);
 
-  combineMasksKernel<<<blocks_per_grid, threads_per_block_, 0, stream_>>>(
-    device_crop_mask, device_ring_outlier_mask, num_organized_points_, device_ring_outlier_mask);
+  combineMasksLaunch(
+    device_crop_mask, device_ring_outlier_mask, num_organized_points_, device_ring_outlier_mask,
+    threads_per_block_, blocks_per_grid, stream_);
 
   thrust::inclusive_scan(
     thrust::device, device_ring_outlier_mask, device_ring_outlier_mask + num_organized_points_,
@@ -928,10 +421,10 @@ std::unique_ptr<cuda_blackboard::CudaPointCloud2> CudaPointcloudPreprocessor::pr
   cudaStreamSynchronize(stream_);
 
   if (num_output_points > 0) {
-    extractInputPointsToOutputPoints_indicesKernel<<<
-      blocks_per_grid, threads_per_block_, 0, stream_>>>(
+    extractPointsLaunch(
       device_transformed_points, device_ring_outlier_mask, device_indices, num_organized_points_,
-      reinterpret_cast<OutputPointType *>(output_pointcloud_ptr_->data.get()));
+      reinterpret_cast<OutputPointType *>(output_pointcloud_ptr_->data.get()), threads_per_block_,
+      blocks_per_grid, stream_);
   }
 
   cudaStreamSynchronize(stream_);
diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/organize_kernels.cu b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/organize_kernels.cu
new file mode 100644
index 0000000000000..00ab2ddc1aec5
--- /dev/null
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/organize_kernels.cu
@@ -0,0 +1,108 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "autoware/cuda_pointcloud_preprocessor/organize_kernels.hpp"
+#include "autoware/cuda_pointcloud_preprocessor/point_types.hpp"
+
+#include <cub/cub.cuh>
+
+#include <limits>
+
+namespace autoware::cuda_pointcloud_preprocessor
+{
+
+__global__ void organizeKernel(
+  const InputPointType * __restrict__ input_points, std::uint32_t * index_tensor,
+  std::int32_t * ring_indexes, std::int32_t initial_max_rings, std::int32_t * output_max_rings,
+  std::int32_t initial_max_points_per_ring, std::int32_t * output_max_points_per_ring,
+  int num_points)
+{
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx >= num_points) {
+    return;
+  }
+
+  auto ring = input_points[idx].channel;
+
+  if (ring >= initial_max_rings) {
+    atomicMax(output_max_rings, ring);
+    return;
+  }
+
+  int next_offset = atomicAdd(&ring_indexes[ring], 1);
+
+  if (next_offset >= initial_max_points_per_ring) {
+    atomicMax(output_max_points_per_ring, next_offset);
+    return;
+  }
+
+  index_tensor[ring * initial_max_points_per_ring + next_offset] = idx;
+}
+
+__global__ void gatherKernel(
+  const InputPointType * __restrict__ input_points, const std::uint32_t * __restrict__ index_tensor,
+  InputPointType * __restrict__ output_points, int num_rings, int max_points_per_ring)
+{
+  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx >= num_rings * max_points_per_ring) {
+    return;
+  }
+
+  const int ring = idx / max_points_per_ring;
+  const int point = idx % max_points_per_ring;
+
+  const std::uint32_t input_idx = index_tensor[ring * max_points_per_ring + point];
+
+  if (input_idx < std::numeric_limits<std::uint32_t>::max()) {
+    output_points[ring * max_points_per_ring + point] = input_points[input_idx];
+  } else {
+    output_points[ring * max_points_per_ring + point].distance = 0.0f;
+  }
+}
+
+std::size_t querySortWorkspace(
+  int num_items, int num_segments, int * offsets_device, std::uint32_t * keys_in_device,
+  std::uint32_t * keys_out_device)
+{
+  // Determine temporary device storage requirements
+  void * temp_storage = nullptr;
+  size_t temp_storage_bytes = 0;
+  cub::DeviceSegmentedRadixSort::SortKeys(
+    temp_storage, temp_storage_bytes, keys_in_device, keys_out_device, num_items, num_segments,
+    offsets_device, offsets_device + 1);
+
+  return temp_storage_bytes;
+}
+
+void organizeLaunch(
+  const InputPointType * input_points, std::uint32_t * index_tensor, std::int32_t * ring_indexes,
+  std::int32_t initial_max_rings, std::int32_t * output_max_rings,
+  std::int32_t initial_max_points_per_ring, std::int32_t * output_max_points_per_ring,
+  int num_points, int threads_per_block, int blocks_per_grid, cudaStream_t & stream)
+{
+  organizeKernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(
+    input_points, index_tensor, ring_indexes, initial_max_rings, output_max_rings,
+    initial_max_points_per_ring, output_max_points_per_ring, num_points);
+}
+
+void gatherLaunch(
+  const InputPointType * input_points, const std::uint32_t * index_tensor,
+  InputPointType * output_points, int num_rings, int max_points_per_ring, int threads_per_block,
+  int blocks_per_grid, cudaStream_t & stream)
+{
+  gatherKernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(
+    input_points, index_tensor, output_points, num_rings, max_points_per_ring);
+}
+
+}  // namespace autoware::cuda_pointcloud_preprocessor
diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/outlier_kernels.cu b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/outlier_kernels.cu
new file mode 100644
index 0000000000000..18b116535a32c
--- /dev/null
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/outlier_kernels.cu
@@ -0,0 +1,86 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "autoware/cuda_pointcloud_preprocessor/outlier_kernels.hpp"
+#include "autoware/cuda_pointcloud_preprocessor/point_types.hpp"
+
+namespace autoware::cuda_pointcloud_preprocessor
+{
+
+__global__ void ringOutlierFilterKernel(
+  const InputPointType * points, std::uint32_t * output_mask, int num_rings,
+  int max_points_per_ring, float distance_ratio, float object_length_threshold_squared,
+  int num_points_threshold)
+{
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  int j = idx / max_points_per_ring;
+  int i = idx % max_points_per_ring;
+
+  if (j >= num_rings || i >= max_points_per_ring) {
+    return;
+  }
+
+  int min_i = max(i - num_points_threshold, 0);
+  int max_i = min(i + num_points_threshold, max_points_per_ring);
+
+  int walk_size = 1;
+  int left_idx = min_i;
+  int right_idx = min_i + 1;
+
+  for (int k = min_i; k < max_i - 1; k++) {
+    const InputPointType & left_point = points[j * max_points_per_ring + k];
+    const InputPointType & right_point = points[j * max_points_per_ring + k + 1];
+
+    // Find biggest walk that passes through i
+    float azimuth_diff = right_point.azimuth - left_point.azimuth;
+    azimuth_diff = azimuth_diff < 0.f ? azimuth_diff + 2 * M_PI : azimuth_diff;
+
+    if (
+      max(left_point.distance, right_point.distance) <
+        min(left_point.distance, right_point.distance) * distance_ratio &&
+      azimuth_diff < 1.f * M_PI / 180.f) {
+      // Determined to be included in the same walk
+      walk_size++;
+      right_idx++;
+    } else if (k >= i) {
+      break;
+    } else {
+      walk_size = 1;
+      left_idx = k + 1;
+      right_idx = k + 2;  // this is safe since we break if k >= i
+    }
+  }
+
+  const InputPointType & left_point = points[j * max_points_per_ring + left_idx];
+  const InputPointType & right_point = points[j * max_points_per_ring + right_idx - 1];
+  const float x = left_point.x - right_point.x;
+  const float y = left_point.y - right_point.y;
+  const float z = left_point.z - right_point.z;
+
+  output_mask[j * max_points_per_ring + i] = static_cast<std::uint32_t>(
+    (walk_size > num_points_threshold) ||
+    (x * x + y * y + z * z >= object_length_threshold_squared));
+}
+
+void ringOutlierFilterLaunch(
+  const InputPointType * points, std::uint32_t * output_mask, int num_rings,
+  int max_points_per_ring, float distance_ratio, float object_length_threshold_squared,
+  int num_points_threshold, int threads_per_block, int blocks_per_grid, cudaStream_t & stream)
+{
+  ringOutlierFilterKernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(
+    points, output_mask, num_rings, max_points_per_ring, distance_ratio,
+    object_length_threshold_squared, num_points_threshold);
+}
+
+}  // namespace autoware::cuda_pointcloud_preprocessor
diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/undistort_kernels.cu b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/undistort_kernels.cu
new file mode 100644
index 0000000000000..6767b03aebf56
--- /dev/null
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/undistort_kernels.cu
@@ -0,0 +1,335 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "autoware/cuda_pointcloud_preprocessor/point_types.hpp"
+#include "autoware/cuda_pointcloud_preprocessor/types.hpp"
+#include "autoware/cuda_pointcloud_preprocessor/undistort_kernels.hpp"
+
+#include <Eigen/Core>
+#include <Eigen/Dense>
+#include <Eigen/Geometry>
+
+namespace autoware::cuda_pointcloud_preprocessor
+{
+
+__host__ __device__ Eigen::Matrix3f skewSymmetric(const Eigen::Vector3f & v)
+{
+  Eigen::Matrix3f m;
+  m << 0, -v.z(), v.y(), v.z(), 0, -v.x(), -v.y(), v.x(), 0;
+  return m;
+}
+
+__host__ __device__ Eigen::Matrix3f leftJacobianSO3(const Eigen::Vector3f & omega)
+{
+  double theta = omega.norm();
+  if (std::abs(theta) < 1e-6) {
+    return Eigen::Matrix3f::Identity();
+  }
+
+  Eigen::Matrix3f Omega = skewSymmetric(omega);
+
+  Eigen::Matrix3f Omega2 = Omega * Omega;
+  double theta2 = theta * theta;
+  double theta3 = theta2 * theta;
+
+  // Rodrigues' formula for Jacobian
+  return Eigen::Matrix3f::Identity() + ((1 - cos(theta)) / theta2) * Omega +
+         ((theta - sin(theta)) / theta3) * Omega2;
+}
+
+__host__ __device__ Eigen::Matrix4f transformationMatrixFromVelocity(
+  const Eigen::Vector3f & linear_velocity, const Eigen::Vector3f & angular_velocity, double dt)
+{
+  Eigen::Matrix3f R = Eigen::AngleAxisf(angular_velocity.norm() * dt, angular_velocity.normalized())
+                        .toRotationMatrix();
+  Eigen::Matrix3f J = leftJacobianSO3(angular_velocity * dt);
+
+  Eigen::Vector3f translation = J * (linear_velocity * dt);
+
+  Eigen::Matrix4f transformation = Eigen::Matrix4f::Identity();
+  transformation.block<3, 3>(0, 0) = R;
+  transformation.block<3, 1>(0, 3) = translation;
+
+  return transformation;
+}
+
+__global__ void undistort2dKernel(
+  InputPointType * input_points, int num_points, TwistStruct2D * twist_structs, int num_twists)
+{
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < num_points) {
+    InputPointType & point = input_points[idx];
+
+    // The twist must always be newer than the point ! (or it was the last twist)
+    int twist_index = 0;
+    while (twist_index < num_twists && twist_structs[twist_index].stamp_nsec < point.time_stamp) {
+      twist_index++;
+    }
+
+    twist_index = min(twist_index, num_twists - 1);
+
+    TwistStruct2D twist = twist_structs[twist_index];
+    float x = twist.cum_x;
+    float y = twist.cum_y;
+    float theta = twist.cum_theta;
+
+    double dt_nsec =
+      point.time_stamp > twist.last_stamp_nsec ? point.time_stamp - twist.last_stamp_nsec : 0;
+    double dt = 1e-9 * (dt_nsec);
+
+    theta += twist.v_theta * dt;
+    float d = twist.v_x * dt;
+    x += d * cos(theta);
+    y += d * sin(theta);
+
+    float distorted_x = point.x;
+    float distorted_y = point.y;
+
+    point.x = distorted_x * cos(theta) - distorted_y * sin(theta) + x;
+    point.y = distorted_x * sin(theta) + distorted_y * cos(theta) + y;
+  }
+}
+
+__global__ void undistort3dKernel(
+  InputPointType * input_points, int num_points, TwistStruct3D * twist_structs, int num_twists)
+{
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < num_points) {
+    InputPointType & point = input_points[idx];
+
+    // The twist must always be newer than the point ! (or it was the last twist)
+    int twist_index = 0;
+    while (twist_index < num_twists && twist_structs[twist_index].stamp_nsec < point.time_stamp) {
+      twist_index++;
+    }
+
+    twist_index = min(twist_index, num_twists - 1);
+
+    TwistStruct3D twist = twist_structs[twist_index];
+    Eigen::Map<Eigen::Matrix4f> cum_transform_buffer_map(twist.cum_transform_buffer);
+    Eigen::Map<Eigen::Vector3f> v_map(twist.v);
+    Eigen::Map<Eigen::Vector3f> w_map(twist.w);
+
+    double dt_nsec =
+      point.time_stamp > twist.last_stamp_nsec ? point.time_stamp - twist.last_stamp_nsec : 0;
+    double dt = 1e-9 * (dt_nsec);
+
+    Eigen::Matrix4f transform =
+      cum_transform_buffer_map * transformationMatrixFromVelocity(v_map, w_map, dt);
+    Eigen::Vector3f p(point.x, point.y, point.z);
+    p = transform.block<3, 3>(0, 0) * p + transform.block<3, 1>(0, 3);
+
+    point.x = p.x();
+    point.y = p.y();
+    point.z = p.z();
+  }
+}
+
+void undistort2dLaunch(
+  InputPointType * input_points, int num_points, TwistStruct2D * twist_structs, int num_twists,
+  int threads_per_block, int blocks_per_grid, cudaStream_t & stream)
+{
+  undistort2dKernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(
+    input_points, num_points, twist_structs, num_twists);
+}
+
+void undistort3dLaunch(
+  InputPointType * input_points, int num_points, TwistStruct3D * twist_structs, int num_twists,
+  int threads_per_block, int blocks_per_grid, cudaStream_t & stream)
+{
+  undistort3dKernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(
+    input_points, num_points, twist_structs, num_twists);
+}
+
+void setupTwist2DStructs(
+  const std::deque<geometry_msgs::msg::TwistWithCovarianceStamped> & twist_queue,
+  const std::deque<geometry_msgs::msg::Vector3Stamped> & angular_velocity_queue,
+  const std::uint64_t pointcloud_stamp_nsec, const std::uint32_t first_point_rel_stamp_nsec,
+  thrust::device_vector<TwistStruct2D> & device_twist_2d_structs, cudaStream_t & stream)
+{
+  std::vector<TwistStruct2D> host_twist_2d_structs;
+  host_twist_2d_structs.reserve(twist_queue.size() + angular_velocity_queue.size());
+
+  // Twist preprocessing
+  float cum_x = 0;
+  float cum_y = 0;
+  float cum_theta = 0;
+  // All time stamps from now on are in nsec from the "beginning of the pointcloud"
+  std::uint32_t last_stamp_nsec = first_point_rel_stamp_nsec;
+
+  std::size_t twist_index = 0;
+  std::size_t angular_velocity_index = 0;
+
+  for (; twist_index < twist_queue.size() ||
+         angular_velocity_index < angular_velocity_queue.size();) {
+    std::uint64_t twist_stamp, input_twist_global_stamp_nsec, angular_velocity_global_stamp_nsec;
+    float v_x, v_theta;
+
+    if (twist_index < twist_queue.size()) {
+      input_twist_global_stamp_nsec =
+        1'000'000'000 * static_cast<std::uint64_t>(twist_queue[twist_index].header.stamp.sec) +
+        static_cast<std::uint64_t>(twist_queue[twist_index].header.stamp.nanosec);
+      v_x = twist_queue[twist_index].twist.twist.linear.x;
+    } else {
+      input_twist_global_stamp_nsec = std::numeric_limits<std::uint64_t>::max();
+      v_x = 0.0;
+    }
+
+    if (angular_velocity_index < angular_velocity_queue.size()) {
+      angular_velocity_global_stamp_nsec =
+        1'000'000'000 * static_cast<std::uint64_t>(
+                          angular_velocity_queue[angular_velocity_index].header.stamp.sec) +
+        static_cast<std::uint64_t>(
+          angular_velocity_queue[angular_velocity_index].header.stamp.nanosec);
+      v_theta = angular_velocity_queue[angular_velocity_index].vector.z;
+    } else {
+      angular_velocity_global_stamp_nsec = std::numeric_limits<std::uint64_t>::max();
+      v_theta = 0.0;
+    }
+
+    if (input_twist_global_stamp_nsec < angular_velocity_global_stamp_nsec) {
+      twist_stamp = input_twist_global_stamp_nsec;
+      twist_index++;
+    } else if (input_twist_global_stamp_nsec > angular_velocity_global_stamp_nsec) {
+      twist_stamp = angular_velocity_global_stamp_nsec;
+      angular_velocity_index++;
+    } else {
+      twist_index++;
+      angular_velocity_index++;
+    }
+
+    TwistStruct2D twist;
+    twist.cum_x = cum_x;
+    twist.cum_y = cum_y;
+    twist.cum_theta = cum_theta;
+
+    std::uint64_t twist_global_stamp_nsec = twist_stamp;
+    assert(twist_global_stamp_nsec > pointcloud_stamp_nsec);  // by construction
+    std::uint32_t twist_from_pointcloud_start_nsec =
+      twist_global_stamp_nsec - pointcloud_stamp_nsec;
+
+    twist.stamp_nsec = twist_from_pointcloud_start_nsec;
+    twist.v_x = v_x;
+    twist.v_theta = v_theta;
+    twist.last_stamp_nsec = last_stamp_nsec;
+    host_twist_2d_structs.push_back(twist);
+
+    double dt_seconds = 1e-9 * (twist.stamp_nsec - last_stamp_nsec);
+    last_stamp_nsec = twist.stamp_nsec;
+    cum_theta += v_theta * dt_seconds;
+    float d = twist.v_x * dt_seconds;
+    cum_x += d * cos(cum_theta);
+    cum_y += d * sin(cum_theta);
+  }
+
+  // Copy to device
+  device_twist_2d_structs.resize(host_twist_2d_structs.size());
+  cudaMemcpyAsync(
+    thrust::raw_pointer_cast(device_twist_2d_structs.data()), host_twist_2d_structs.data(),
+    host_twist_2d_structs.size() * sizeof(TwistStruct2D), cudaMemcpyHostToDevice, stream);
+}
+
+void setupTwist3DStructs(
+  const std::deque<geometry_msgs::msg::TwistWithCovarianceStamped> & twist_queue,
+  const std::deque<geometry_msgs::msg::Vector3Stamped> & angular_velocity_queue,
+  const std::uint64_t pointcloud_stamp_nsec, const std::uint32_t first_point_rel_stamp_nsec,
+  thrust::device_vector<TwistStruct3D> & device_twist_3d_structs, cudaStream_t & stream)
+{
+  std::vector<TwistStruct3D> host_twist_3d_structs;
+  host_twist_3d_structs.reserve(twist_queue.size() + angular_velocity_queue.size());
+
+  Eigen::Matrix4f cum_transform = Eigen::Matrix4f::Identity();
+  Eigen::Vector3f v = Eigen::Vector3f::Zero();
+  Eigen::Vector3f w = Eigen::Vector3f::Zero();
+
+  // All time stamps from now on are in nsec from the "beginning of the pointcloud"
+  std::uint32_t last_stamp_nsec = first_point_rel_stamp_nsec;
+
+  std::size_t twist_index = 0;
+  std::size_t angular_velocity_index = 0;
+
+  for (; twist_index < twist_queue.size() ||
+         angular_velocity_index < angular_velocity_queue.size();) {
+    std::uint64_t twist_stamp, input_twist_global_stamp_nsec, angular_velocity_global_stamp_nsec;
+
+    if (twist_index < twist_queue.size()) {
+      input_twist_global_stamp_nsec =
+        1'000'000'000 * static_cast<std::uint64_t>(twist_queue[twist_index].header.stamp.sec) +
+        static_cast<std::uint64_t>(twist_queue[twist_index].header.stamp.nanosec);
+      v.x() = twist_queue[twist_index].twist.twist.linear.x;
+      v.y() = twist_queue[twist_index].twist.twist.linear.y;
+      v.z() = twist_queue[twist_index].twist.twist.linear.z;
+    } else {
+      input_twist_global_stamp_nsec = std::numeric_limits<uint64_t>::max();
+      v = Eigen::Vector3f::Zero();
+    }
+
+    if (angular_velocity_index < angular_velocity_queue.size()) {
+      angular_velocity_global_stamp_nsec =
+        1'000'000'000 * static_cast<std::uint64_t>(
+                          angular_velocity_queue[angular_velocity_index].header.stamp.sec) +
+        static_cast<std::uint64_t>(
+          angular_velocity_queue[angular_velocity_index].header.stamp.nanosec);
+      w.x() = angular_velocity_queue[angular_velocity_index].vector.x;
+      w.y() = angular_velocity_queue[angular_velocity_index].vector.y;
+      w.z() = angular_velocity_queue[angular_velocity_index].vector.z;
+    } else {
+      angular_velocity_global_stamp_nsec = std::numeric_limits<std::uint64_t>::max();
+      w = Eigen::Vector3f::Zero();
+    }
+
+    if (input_twist_global_stamp_nsec < angular_velocity_global_stamp_nsec) {
+      twist_stamp = input_twist_global_stamp_nsec;
+      twist_index++;
+    } else if (input_twist_global_stamp_nsec > angular_velocity_global_stamp_nsec) {
+      twist_stamp = angular_velocity_global_stamp_nsec;
+      angular_velocity_index++;
+    } else {
+      twist_index++;
+      angular_velocity_index++;
+    }
+
+    TwistStruct3D twist;
+
+    Eigen::Map<Eigen::Matrix4f> cum_transform_buffer_map(twist.cum_transform_buffer);
+    Eigen::Map<Eigen::Vector3f> v_map(twist.v);
+    Eigen::Map<Eigen::Vector3f> w_map(twist.w);
+    cum_transform_buffer_map = cum_transform;
+
+    std::uint64_t twist_global_stamp_nsec = twist_stamp;
+    assert(twist_global_stamp_nsec > pointcloud_stamp_nsec);  // by construction
+    std::uint32_t twist_from_pointcloud_start_nsec =
+      twist_global_stamp_nsec - pointcloud_stamp_nsec;
+
+    twist.stamp_nsec = twist_from_pointcloud_start_nsec;
+    v_map = v;
+    w_map = w;
+    twist.last_stamp_nsec = last_stamp_nsec;
+    host_twist_3d_structs.push_back(twist);
+
+    double dt_seconds = 1e-9 * (twist.stamp_nsec - last_stamp_nsec);
+    last_stamp_nsec = twist.stamp_nsec;
+
+    auto delta_transform = transformationMatrixFromVelocity(v, w, dt_seconds);
+    cum_transform = cum_transform * delta_transform;
+  }
+
+  // Copy to device
+  device_twist_3d_structs.resize(host_twist_3d_structs.size());
+  cudaMemcpyAsync(
+    thrust::raw_pointer_cast(device_twist_3d_structs.data()), host_twist_3d_structs.data(),
+    host_twist_3d_structs.size() * sizeof(TwistStruct3D), cudaMemcpyHostToDevice, stream);
+}
+
+}  // namespace autoware::cuda_pointcloud_preprocessor

From 17799e5d0dac816cfb89cce5bbcd5701b618a172 Mon Sep 17 00:00:00 2001
From: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>
Date: Wed, 5 Feb 2025 18:19:32 +0900
Subject: [PATCH 30/38] chore: updated the copyright year

Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>
---
 .../autoware/cuda_pointcloud_preprocessor/common_kernels.hpp    | 2 +-
 .../cuda_pointcloud_preprocessor.hpp                            | 2 +-
 .../cuda_pointcloud_preprocessor_node.hpp                       | 2 +-
 .../include/autoware/cuda_pointcloud_preprocessor/memory.hpp    | 2 +-
 .../autoware/cuda_pointcloud_preprocessor/organize_kernels.hpp  | 2 +-
 .../autoware/cuda_pointcloud_preprocessor/outlier_kernels.hpp   | 2 +-
 .../autoware/cuda_pointcloud_preprocessor/point_types.hpp       | 2 +-
 .../include/autoware/cuda_pointcloud_preprocessor/types.hpp     | 2 +-
 .../autoware/cuda_pointcloud_preprocessor/undistort_kernels.hpp | 2 +-
 .../src/cuda_pointcloud_preprocessor/common_kernels.cu          | 2 +-
 .../cuda_pointcloud_preprocessor.cu                             | 2 +-
 .../cuda_pointcloud_preprocessor_node.cpp                       | 2 +-
 .../src/cuda_pointcloud_preprocessor/organize_kernels.cu        | 2 +-
 .../src/cuda_pointcloud_preprocessor/outlier_kernels.cu         | 2 +-
 .../src/cuda_pointcloud_preprocessor/undistort_kernels.cu       | 2 +-
 15 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/common_kernels.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/common_kernels.hpp
index 0e21e933569ee..d450a206e9d59 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/common_kernels.hpp
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/common_kernels.hpp
@@ -1,4 +1,4 @@
-// Copyright 2024 TIER IV, Inc.
+// Copyright 2025 TIER IV, Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp
index bf19949ddc795..b68c6c5e6162b 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp
@@ -1,4 +1,4 @@
-// Copyright 2024 TIER IV, Inc.
+// Copyright 2025 TIER IV, Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp
index afc2f6cd74de0..22f248eb70704 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp
@@ -1,4 +1,4 @@
-// Copyright 2024 TIER IV, Inc.
+// Copyright 2025 TIER IV, Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/memory.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/memory.hpp
index 9482f170be550..9fb24b8a29c3f 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/memory.hpp
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/memory.hpp
@@ -1,4 +1,4 @@
-// Copyright 2024 TIER IV, Inc.
+// Copyright 2025 TIER IV, Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/organize_kernels.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/organize_kernels.hpp
index 693a001045d3c..82b8a589cc230 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/organize_kernels.hpp
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/organize_kernels.hpp
@@ -1,4 +1,4 @@
-// Copyright 2024 TIER IV, Inc.
+// Copyright 2025 TIER IV, Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/outlier_kernels.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/outlier_kernels.hpp
index 2010127afdeb5..28ec3770d7fb8 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/outlier_kernels.hpp
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/outlier_kernels.hpp
@@ -1,4 +1,4 @@
-// Copyright 2024 TIER IV, Inc.
+// Copyright 2025 TIER IV, Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/point_types.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/point_types.hpp
index 50010ac0f2b67..4518c6cd2a908 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/point_types.hpp
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/point_types.hpp
@@ -1,5 +1,5 @@
 
-// Copyright 2024 TIER IV, Inc.
+// Copyright 2025 TIER IV, Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/types.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/types.hpp
index 4cb50c8c28e38..aed91d2598193 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/types.hpp
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/types.hpp
@@ -1,4 +1,4 @@
-// Copyright 2024 TIER IV, Inc.
+// Copyright 2025 TIER IV, Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/undistort_kernels.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/undistort_kernels.hpp
index 4a2f91af12953..29cd073b700ae 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/undistort_kernels.hpp
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/undistort_kernels.hpp
@@ -1,4 +1,4 @@
-// Copyright 2024 TIER IV, Inc.
+// Copyright 2025 TIER IV, Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/common_kernels.cu b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/common_kernels.cu
index 64ed3ca8ab72f..27d1e05765f16 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/common_kernels.cu
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/common_kernels.cu
@@ -1,4 +1,4 @@
-// Copyright 2024 TIER IV, Inc.
+// Copyright 2025 TIER IV, Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu
index aac9f560a78ca..97ccf028cfdfd 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu
@@ -1,4 +1,4 @@
-// Copyright 2024 TIER IV, Inc.
+// Copyright 2025 TIER IV, Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp
index bf6cc1f79890c..2ff9626f4c1a8 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp
@@ -1,4 +1,4 @@
-// Copyright 2024 TIER IV, Inc.
+// Copyright 2025 TIER IV, Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/organize_kernels.cu b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/organize_kernels.cu
index 00ab2ddc1aec5..ea74709cb1b54 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/organize_kernels.cu
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/organize_kernels.cu
@@ -1,4 +1,4 @@
-// Copyright 2024 TIER IV, Inc.
+// Copyright 2025 TIER IV, Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/outlier_kernels.cu b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/outlier_kernels.cu
index 18b116535a32c..8ce0634ed2a41 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/outlier_kernels.cu
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/outlier_kernels.cu
@@ -1,4 +1,4 @@
-// Copyright 2024 TIER IV, Inc.
+// Copyright 2025 TIER IV, Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/undistort_kernels.cu b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/undistort_kernels.cu
index 6767b03aebf56..551ae43fd983b 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/undistort_kernels.cu
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/undistort_kernels.cu
@@ -1,4 +1,4 @@
-// Copyright 2024 TIER IV, Inc.
+// Copyright 2025 TIER IV, Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.

From f10022444e70452b8dbced7990a5351d36d05d71 Mon Sep 17 00:00:00 2001
From: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>
Date: Wed, 5 Feb 2025 20:09:49 +0900
Subject: [PATCH 31/38] fix: fixed a bug in the undistortion kernel setup.
 validated it comparing it with the baseline

Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>
---
 .../src/cuda_pointcloud_preprocessor/undistort_kernels.cu     | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/undistort_kernels.cu b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/undistort_kernels.cu
index 551ae43fd983b..e9763e990303a 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/undistort_kernels.cu
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/undistort_kernels.cu
@@ -170,11 +170,11 @@ void setupTwist2DStructs(
 
   std::size_t twist_index = 0;
   std::size_t angular_velocity_index = 0;
+  float v_x{0.f}, v_theta{0.f};
 
   for (; twist_index < twist_queue.size() ||
          angular_velocity_index < angular_velocity_queue.size();) {
     std::uint64_t twist_stamp, input_twist_global_stamp_nsec, angular_velocity_global_stamp_nsec;
-    float v_x, v_theta;
 
     if (twist_index < twist_queue.size()) {
       input_twist_global_stamp_nsec =
@@ -183,7 +183,6 @@ void setupTwist2DStructs(
       v_x = twist_queue[twist_index].twist.twist.linear.x;
     } else {
       input_twist_global_stamp_nsec = std::numeric_limits<std::uint64_t>::max();
-      v_x = 0.0;
     }
 
     if (angular_velocity_index < angular_velocity_queue.size()) {
@@ -195,7 +194,6 @@ void setupTwist2DStructs(
       v_theta = angular_velocity_queue[angular_velocity_index].vector.z;
     } else {
       angular_velocity_global_stamp_nsec = std::numeric_limits<std::uint64_t>::max();
-      v_theta = 0.0;
     }
 
     if (input_twist_global_stamp_nsec < angular_velocity_global_stamp_nsec) {

From 6f0fe0c1f1270e1487bcc56f9ff87cdcfd15ac87 Mon Sep 17 00:00:00 2001
From: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>
Date: Thu, 13 Feb 2025 16:42:01 +0900
Subject: [PATCH 32/38] chore: removed unused packages

Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>
---
 .../autoware_cuda_pointcloud_preprocessor/CMakeLists.txt | 9 ++++++---
 .../autoware_cuda_pointcloud_preprocessor/package.xml    | 6 +++---
 2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/CMakeLists.txt b/sensing/autoware_cuda_pointcloud_preprocessor/CMakeLists.txt
index 4a4fc505862bb..e52192cd44b3b 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/CMakeLists.txt
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/CMakeLists.txt
@@ -3,7 +3,6 @@ project(autoware_cuda_pointcloud_preprocessor)
 
 find_package(ament_cmake_auto REQUIRED)
 find_package(CUDA)
-find_package(PCL REQUIRED)
 
 if(NOT ${CUDA_FOUND})
   message(WARNING "cuda was not found, so the autoware_cuda_pointcloud_preprocessor package will not be built.")
@@ -43,20 +42,24 @@ cuda_add_library(cuda_pointcloud_preprocessor_lib SHARED
 )
 
 target_link_libraries(cuda_pointcloud_preprocessor_lib
-  ${PCL_LIBRARIES}
+  ${autoware_pointcloud_preprocessor_TARGETS}
 )
 
 target_include_directories(cuda_pointcloud_preprocessor_lib SYSTEM PRIVATE
+  ${autoware_pointcloud_preprocessor_INCLUDE_DIRS}
   ${autoware_point_types_INCLUDE_DIRS}
   ${cuda_blackboard_INCLUDE_DIRS}
+  ${diagnostic_msgs_INCLUDE_DIRS}
   ${geometry_msgs_INCLUDE_DIRS}
-  ${nebula_common_INCLUDE_DIRS}
   ${rclcpp_INCLUDE_DIRS}
+  ${rclcpp_components_INCLUDE_DIRS}
   ${rcl_interfaces_INCLUDE_DIRS}
   ${sensor_msgs_INCLUDE_DIRS}
   ${tf2_INCLUDE_DIRS}
+  ${tf2_msgs_INCLUDE_DIRS}
 )
 
+
 # Targets
 ament_auto_add_library(cuda_pointcloud_preprocessor SHARED
   src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp
diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/package.xml b/sensing/autoware_cuda_pointcloud_preprocessor/package.xml
index edbd7ff61c874..fecb38c2f48f4 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/package.xml
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/package.xml
@@ -11,15 +11,15 @@
   <buildtool_depend>autoware_cmake</buildtool_depend>
 
   <depend>autoware_point_types</depend>
+  <depend>autoware_pointcloud_preprocessor</depend>
   <depend>autoware_universe_utils</depend>
   <depend>cuda_blackboard</depend>
-  <depend>pcl_conversions</depend>
-  <depend>pcl_ros</depend>
+  <depend>diagnostic_msgs</depend>
   <depend>rclcpp</depend>
   <depend>rclcpp_components</depend>
   <depend>sensor_msgs</depend>
-  <depend>sophus</depend>
   <depend>tf2</depend>
+  <depend>tf2_msgs</depend>
   <depend>tier4_debug_msgs</depend>
 
   <test_depend>ament_clang_format</test_depend>

From 2330cd6f2d8165eb6ec937e49b7b412d67750226 Mon Sep 17 00:00:00 2001
From: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>
Date: Thu, 13 Feb 2025 16:56:33 +0900
Subject: [PATCH 33/38] chore: removed mentions of the removed adapter

Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>
---
 .../README.md                                 |  7 ++--
 .../docs/cuda-organized-pointcloud-adapter.md | 41 -------------------
 .../docs/cuda-pointcloud-preprocessor.md      |  1 -
 3 files changed, 3 insertions(+), 46 deletions(-)
 delete mode 100644 sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-organized-pointcloud-adapter.md

diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/README.md b/sensing/autoware_cuda_pointcloud_preprocessor/README.md
index 3c87883971e7e..0fa5ab7e7c65a 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/README.md
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/README.md
@@ -10,10 +10,9 @@ To alleviate this issue, this package reimplements most of the pipeline presente
 
 A detailed description of each filter's algorithm is available in the following links.
 
-| Filter Name                       | Description                                                                                                                                  | Detail                                            |
-| --------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------- |
-| cuda_organized_pointcloud_adapter | Organizes a pointcloud per ring/channel, so that the memory layout allows parallel processing in cuda                                        | [link](docs/cuda-organized-pointcloud-adapter.md) |
-| cuda_pointcloud_preprocessor      | Implements the cropping, distortion correction, and outlier filtering (ring-based) of the `autoware_pointcloud_preprocessor`'s CPU versions. | [link](docs/cuda-pointcloud-preprocessor.md)      |
+| Filter Name                  | Description                                                                                                                                  | Detail                                       |
+| ---------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------- |
+| cuda_pointcloud_preprocessor | Implements the cropping, distortion correction, and outlier filtering (ring-based) of the `autoware_pointcloud_preprocessor`'s CPU versions. | [link](docs/cuda-pointcloud-preprocessor.md) |
 
 ## (Optional) Future extensions / Unimplemented parts
 
diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-organized-pointcloud-adapter.md b/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-organized-pointcloud-adapter.md
deleted file mode 100644
index 634cdfb3e14f0..0000000000000
--- a/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-organized-pointcloud-adapter.md
+++ /dev/null
@@ -1,41 +0,0 @@
-# cuda_organized_pointcloud_adapter
-
-## Purpose
-
-The node `cuda_pointcloud_preprocessor` expects a 2D pointcloud where every row represents a single channel/ring and each row's points are in non decreasing azimuth order.
-
-To utilize the previously mentioned node, this node provides an adapter to convert standard flat pointclouds (`height` equals to 1) to the required 2D tensor.
-
-In addition, this node uses the `cuda_blackboard`, a cuda transport layer that enables a zero-copy mechanism between GPU and GPU memory for both input and output.
-
-## Inner-workings / Algorithms
-
-To create the required 2D tensor, this node iterates the input pointcloud sequentially, filling the output 2D tensor depending on the input point's channel.
-
-The output tensor's size is also estimated in this node, based on the largest `channel` value and the maximum number of points per channel observed so far.
-
-## Inputs / Outputs
-
-### Input
-
-| Name                 | Type                            | Description               |
-| -------------------- | ------------------------------- | ------------------------- |
-| `~/input/pointcloud` | `sensor_msgs::msg::PointCloud2` | Input pointcloud's topic. |
-
-### Output
-
-| Name                       | Type                                             | Description                              |
-| -------------------------- | ------------------------------------------------ | ---------------------------------------- |
-| `~/output/pointcloud`      | `sensor_msgs::msg::PointCloud2`                  | Processed pointcloud's topic             |
-| `~/output/pointcloud/cuda` | `negotiated_interfaces/msg/NegotiatedTopicsInfo` | Processed pointcloud's negotiation topic |
-
-## Parameters
-
-### Core Parameters
-
-{{ json_to_markdown("sensing/autoware_cuda_pointcloud_preprocessor/schema/cuda_pointcloud_preprocessor.schema.schema.json") }}
-
-## Assumptions / Known limits
-
-- This algorithm assumes that the input points will be in non-decreasing azimuth order (per ring).
-- This node expects that the input pointcloud is flat (`height` equals to 1) and follows the `autoware::point_types::PointXYZIRC` layout defined in the `autoware_point_types` package.
diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-pointcloud-preprocessor.md b/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-pointcloud-preprocessor.md
index 4a7e2d070ccb6..2ba24e6a8fb52 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-pointcloud-preprocessor.md
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-pointcloud-preprocessor.md
@@ -43,4 +43,3 @@ In addition to the individual algorithms previously mentioned, this node uses th
 
 - The CUDA implementations, while following the original CPU ones, will not offer the same numerical results, and small approximations were needed to maximize GPU usage.
 - This node expects that the input pointcloud follows the `autoware::point_types::PointXYZIRCAEDT` layout and the output pointcloud will use the `autoware::point_types::PointXYZIRC` layout defined in the `autoware_point_types` package.
-- The input pointcloud must be a 2D tensor where each row represents a different channel/ring with its points in non-decreasing azimuth order. Invalid points should contain 0-values. The `cuda_organized_pointcloud_adapter` provides such a pointcloud.

From 48b39cd8ec84df413c8ff51344cc254ad32f17a2 Mon Sep 17 00:00:00 2001
From: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>
Date: Fri, 14 Feb 2025 09:21:22 +0900
Subject: [PATCH 34/38] chore: fixed missing autoware prefix

Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>
---
 .../launch/cuda_pointcloud_preprocessor.launch.xml            | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/launch/cuda_pointcloud_preprocessor.launch.xml b/sensing/autoware_cuda_pointcloud_preprocessor/launch/cuda_pointcloud_preprocessor.launch.xml
index e8843bd804bf0..a1da651aa76e2 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/launch/cuda_pointcloud_preprocessor.launch.xml
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/launch/cuda_pointcloud_preprocessor.launch.xml
@@ -4,9 +4,9 @@
   <arg name="input/twist" default="/sensing/vehicle_velocity_converter/twist_with_covariance"/>
   <arg name="output/pointcloud" default="/sensing/lidar/top/test"/>
 
-  <arg name="cuda_pointcloud_preprocessor_param_file" default="$(find-pkg-share cuda_pointcloud_preprocessor)/config/cuda_pointcloud_preprocessor.param.yaml"/>
+  <arg name="cuda_pointcloud_preprocessor_param_file" default="$(find-pkg-share autoware_cuda_pointcloud_preprocessor)/config/cuda_pointcloud_preprocessor.param.yaml"/>
 
-  <node pkg="cuda_pointcloud_preprocessor" exec="cuda_pointcloud_preprocessor_node" name="cuda_pointcloud_preprocessor" output="screen">
+  <node pkg="autoware_cuda_pointcloud_preprocessor" exec="cuda_pointcloud_preprocessor_node" name="cuda_pointcloud_preprocessor" output="screen">
     <remap from="~/input/pointcloud" to="$(var input/pointcloud)"/>
     <remap from="~/input/imu" to="$(var input/imu)"/>
     <remap from="~/input/twist" to="$(var input/twist)"/>

From fa36260cf79670d2bc222261597ea19cf59247e6 Mon Sep 17 00:00:00 2001
From: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>
Date: Fri, 14 Feb 2025 09:25:16 +0900
Subject: [PATCH 35/38] fix: missing assignment in else branch

Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>
---
 .../src/cuda_pointcloud_preprocessor/undistort_kernels.cu        | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/undistort_kernels.cu b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/undistort_kernels.cu
index e9763e990303a..4dce3345b4807 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/undistort_kernels.cu
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/undistort_kernels.cu
@@ -203,6 +203,7 @@ void setupTwist2DStructs(
       twist_stamp = angular_velocity_global_stamp_nsec;
       angular_velocity_index++;
     } else {
+      twist_stamp = input_twist_global_stamp_nsec;
       twist_index++;
       angular_velocity_index++;
     }

From 2b9f654a15aa20dec801b8ab258b6496a2ca6015 Mon Sep 17 00:00:00 2001
From: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>
Date: Fri, 14 Feb 2025 09:29:42 +0900
Subject: [PATCH 36/38] chore: added cuda/nvcc debug flags on debug builds

Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>
---
 sensing/autoware_cuda_pointcloud_preprocessor/CMakeLists.txt | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/CMakeLists.txt b/sensing/autoware_cuda_pointcloud_preprocessor/CMakeLists.txt
index e52192cd44b3b..0e174ee18bbd2 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/CMakeLists.txt
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/CMakeLists.txt
@@ -7,6 +7,9 @@ find_package(CUDA)
 if(NOT ${CUDA_FOUND})
   message(WARNING "cuda was not found, so the autoware_cuda_pointcloud_preprocessor package will not be built.")
   return()
+elseif(CMAKE_BUILD_TYPE STREQUAL "Debug")
+  set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "-g -G")
+  set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-g -G")
 endif()
 
 ament_auto_find_build_dependencies()

From 186fff9d1dac57adf966dc264f964e96a118126c Mon Sep 17 00:00:00 2001
From: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>
Date: Fri, 14 Feb 2025 10:03:12 +0900
Subject: [PATCH 37/38] chore: refactored parameters for the undistortion
 settings

Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>
---
 .../cuda_pointcloud_preprocessor.hpp          |  6 ++++--
 .../cuda_pointcloud_preprocessor.cu           | 20 +++++++++++++------
 .../cuda_pointcloud_preprocessor_node.cpp     |  6 +++++-
 3 files changed, 23 insertions(+), 9 deletions(-)

diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp
index b68c6c5e6162b..d14fc2ec1eb68 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp
@@ -38,12 +38,14 @@ namespace autoware::cuda_pointcloud_preprocessor
 class CudaPointcloudPreprocessor
 {
 public:
+  enum class UndistortionType { Invalid, Undistortion2D, Undistortion3D };
+
   CudaPointcloudPreprocessor();
   ~CudaPointcloudPreprocessor() = default;
 
   void setCropBoxParameters(const std::vector<CropBoxParameters> & crop_box_parameters);
   void setRingOutlierFilterParameters(const RingOutlierFilterParameters & ring_outlier_parameters);
-  void set3DUndistortion(bool value);
+  void setUndistortionType(const UndistortionType & undistortion_type);
 
   void preallocateOutput();
 
@@ -60,7 +62,7 @@ class CudaPointcloudPreprocessor
   CropBoxParameters self_crop_box_parameters_{};
   CropBoxParameters mirror_crop_box_parameters_{};
   RingOutlierFilterParameters ring_outlier_parameters_{};
-  bool use_3d_undistortion_{false};
+  UndistortionType undistortion_type_{UndistortionType::Invalid};
 
   int num_rings_{};
   int max_points_per_ring_{};
diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu
index 97ccf028cfdfd..f9a070ee40173 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu
@@ -182,9 +182,13 @@ void CudaPointcloudPreprocessor::setRingOutlierFilterParameters(
   ring_outlier_parameters_ = ring_outlier_parameters;
 }
 
-void CudaPointcloudPreprocessor::set3DUndistortion(bool use_3d_undistortion)
+void CudaPointcloudPreprocessor::setUndistortionType(const UndistortionType & undistortion_type)
 {
-  use_3d_undistortion_ = use_3d_undistortion;
+  if (undistortion_type == UndistortionType::Invalid) {
+    throw std::runtime_error("Invalid undistortion type");
+  }
+
+  undistortion_type_ = undistortion_type;
 }
 
 void CudaPointcloudPreprocessor::preallocateOutput()
@@ -351,14 +355,16 @@ std::unique_ptr<cuda_blackboard::CudaPointCloud2> CudaPointcloudPreprocessor::pr
   std::uint64_t pointcloud_stamp_nsec = 1'000'000'000 * input_pointcloud_msg_ptr->header.stamp.sec +
                                         input_pointcloud_msg_ptr->header.stamp.nanosec;
 
-  if (use_3d_undistortion_) {
+  if (undistortion_type_ == UndistortionType::Undistortion3D) {
     setupTwist3DStructs(
       twist_queue, angular_velocity_queue, pointcloud_stamp_nsec, first_point_rel_stamp_nsec,
       device_twist_3d_structs_, stream_);
-  } else {
+  } else if (undistortion_type_ == UndistortionType::Undistortion2D) {
     setupTwist2DStructs(
       twist_queue, angular_velocity_queue, pointcloud_stamp_nsec, first_point_rel_stamp_nsec,
       device_twist_2d_structs_, stream_);
+  } else {
+    throw std::runtime_error("Invalid undistortion type");
   }
 
   // Obtain raw pointers for the kernels
@@ -389,11 +395,13 @@ std::unique_ptr<cuda_blackboard::CudaPointCloud2> CudaPointcloudPreprocessor::pr
     thrust::fill(thrust::device, device_crop_mask, device_crop_mask + num_organized_points_, 1);
   }
 
-  if (use_3d_undistortion_ && device_twist_3d_structs_.size() > 0) {
+  if (
+    undistortion_type_ == UndistortionType::Undistortion3D && device_twist_3d_structs_.size() > 0) {
     undistort3dLaunch(
       device_transformed_points, num_organized_points_, device_twist_3d_structs,
       device_twist_3d_structs_.size(), threads_per_block_, blocks_per_grid, stream_);
-  } else if (!use_3d_undistortion_ && device_twist_2d_structs_.size() > 0) {
+  } else if (
+    undistortion_type_ == UndistortionType::Undistortion2D && device_twist_2d_structs_.size() > 0) {
     undistort2dLaunch(
       device_transformed_points, num_organized_points_, device_twist_2d_structs,
       device_twist_2d_structs_.size(), threads_per_block_, blocks_per_grid, stream_);
diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp
index 2ff9626f4c1a8..6c62d9019919f 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp
@@ -112,10 +112,14 @@ CudaPointcloudPreprocessorNode::CudaPointcloudPreprocessorNode(
       sub_options);
   }
 
+  CudaPointcloudPreprocessor::UndistortionType undistortion_type =
+    use_3d_undistortion ? CudaPointcloudPreprocessor::UndistortionType::Undistortion3D
+                        : CudaPointcloudPreprocessor::UndistortionType::Undistortion2D;
+
   cuda_pointcloud_preprocessor_ = std::make_unique<CudaPointcloudPreprocessor>();
   cuda_pointcloud_preprocessor_->setRingOutlierFilterParameters(ring_outlier_filter_parameters);
   cuda_pointcloud_preprocessor_->setCropBoxParameters(crop_box_parameters);
-  cuda_pointcloud_preprocessor_->set3DUndistortion(use_3d_undistortion);
+  cuda_pointcloud_preprocessor_->setUndistortionType(undistortion_type);
 
   // initialize debug tool
   {

From d2505ee9d7ccda6bc014747b7b7d1ecc2bfeec8b Mon Sep 17 00:00:00 2001
From: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>
Date: Fri, 14 Feb 2025 10:03:42 +0900
Subject: [PATCH 38/38] chore: removed unused headers

Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>
---
 .../cuda_pointcloud_preprocessor_node.hpp                        | 1 -
 .../autoware/cuda_pointcloud_preprocessor/organize_kernels.hpp   | 1 -
 .../autoware/cuda_pointcloud_preprocessor/outlier_kernels.hpp    | 1 -
 3 files changed, 3 deletions(-)

diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp
index 22f248eb70704..f8b77217319a6 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp
@@ -38,7 +38,6 @@
 
 #include <deque>
 #include <memory>
-#include <vector>
 
 #define CHECK_OFFSET(structure1, structure2, field)             \
   static_assert(                                                \
diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/organize_kernels.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/organize_kernels.hpp
index 82b8a589cc230..0fe52bfe76409 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/organize_kernels.hpp
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/organize_kernels.hpp
@@ -16,7 +16,6 @@
 #define AUTOWARE__CUDA_POINTCLOUD_PREPROCESSOR__ORGANIZE_KERNELS_HPP_
 
 #include "autoware/cuda_pointcloud_preprocessor/point_types.hpp"
-#include "autoware/cuda_pointcloud_preprocessor/types.hpp"
 
 #include <cuda_runtime.h>
 
diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/outlier_kernels.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/outlier_kernels.hpp
index 28ec3770d7fb8..23d46458edce3 100644
--- a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/outlier_kernels.hpp
+++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/outlier_kernels.hpp
@@ -16,7 +16,6 @@
 #define AUTOWARE__CUDA_POINTCLOUD_PREPROCESSOR__OUTLIER_KERNELS_HPP_
 
 #include "autoware/cuda_pointcloud_preprocessor/point_types.hpp"
-#include "autoware/cuda_pointcloud_preprocessor/types.hpp"
 
 #include <cuda_runtime.h>