From af2e8843a2395f20edcee6cb09047be62628c3eb Mon Sep 17 00:00:00 2001 From: Kenzo Lobos-Tsunekawa Date: Mon, 25 Nov 2024 17:08:54 +0900 Subject: [PATCH 01/38] feat: moved the cuda pointcloud preprocessor and organized from a personal repository Signed-off-by: Kenzo Lobos-Tsunekawa --- .../CMakeLists.txt | 100 +++ .../README.md | 20 + .../cuda_pointcloud_preprocessor.param.yaml | 22 + .../docs/cuda-organized-pointcloud-adapter.md | 41 ++ .../docs/cuda-pointcloud-preprocessor.md | 46 ++ ...cuda_organized_pointcloud_adapter_node.hpp | 77 ++ .../cuda_pointcloud_preprocessor.hpp | 154 ++++ .../cuda_pointcloud_preprocessor_node.hpp | 114 +++ .../point_types.hpp | 50 ++ .../cuda_pointcloud_preprocessor.launch.xml | 16 + .../package.xml | 32 + .../cuda_pointcloud_preprocessor.schema.json | 140 ++++ ...cuda_organized_pointcloud_adapter_node.cpp | 203 +++++ .../cuda_pointcloud_preprocessor.cu | 692 ++++++++++++++++++ .../cuda_pointcloud_preprocessor_node.cpp | 253 +++++++ 15 files changed, 1960 insertions(+) create mode 100644 sensing/autoware_cuda_pointcloud_preprocessor/CMakeLists.txt create mode 100644 sensing/autoware_cuda_pointcloud_preprocessor/README.md create mode 100644 sensing/autoware_cuda_pointcloud_preprocessor/config/cuda_pointcloud_preprocessor.param.yaml create mode 100644 sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-organized-pointcloud-adapter.md create mode 100644 sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-pointcloud-preprocessor.md create mode 100644 sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.hpp create mode 100644 sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp create mode 100644 sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp create mode 100644 sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/point_types.hpp create mode 100644 sensing/autoware_cuda_pointcloud_preprocessor/launch/cuda_pointcloud_preprocessor.launch.xml create mode 100644 sensing/autoware_cuda_pointcloud_preprocessor/package.xml create mode 100644 sensing/autoware_cuda_pointcloud_preprocessor/schema/cuda_pointcloud_preprocessor.schema.json create mode 100644 sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.cpp create mode 100644 sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu create mode 100644 sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/CMakeLists.txt b/sensing/autoware_cuda_pointcloud_preprocessor/CMakeLists.txt new file mode 100644 index 0000000000000..06b1f5d67540f --- /dev/null +++ b/sensing/autoware_cuda_pointcloud_preprocessor/CMakeLists.txt @@ -0,0 +1,100 @@ +cmake_minimum_required(VERSION 3.14) +project(autoware_cuda_pointcloud_preprocessor) + +find_package(ament_cmake_auto REQUIRED) +find_package(CUDA) +find_package(PCL REQUIRED) + +if(NOT ${CUDA_FOUND}) + message(WARNING "cuda was not found, so the autoware_cuda_pointcloud_preprocessor package will not be built.") + return() +endif() + +ament_auto_find_build_dependencies() + +# Default to C++17 +if (NOT CMAKE_CXX_STANDARD) + set(CMAKE_CXX_STANDARD 17) +endif () + +if (CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") + add_compile_options(-Wall -Wextra -Wpedantic -Wunused-function) +endif () + +if(BUILD_TESTING) + find_package(ament_lint_auto REQUIRED) + ament_lint_auto_find_test_dependencies() +endif() + +include_directories( + include + SYSTEM + ${CUDA_INCLUDE_DIRS} +) + +list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr -diag-suppress 20012") + +cuda_add_library(cuda_pointcloud_preprocessor_lib SHARED + src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu +) + +target_link_libraries(cuda_pointcloud_preprocessor_lib + ${PCL_LIBRARIES} +) + +target_include_directories(cuda_pointcloud_preprocessor_lib SYSTEM PRIVATE + ${autoware_point_types_INCLUDE_DIRS} + ${cuda_blackboard_INCLUDE_DIRS} + ${geometry_msgs_INCLUDE_DIRS} + ${nebula_common_INCLUDE_DIRS} + ${rclcpp_INCLUDE_DIRS} + ${rcl_interfaces_INCLUDE_DIRS} + ${sensor_msgs_INCLUDE_DIRS} + ${tf2_INCLUDE_DIRS} +) + +# Targets +ament_auto_add_library(cuda_organized_pointcloud_adapter SHARED + src/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.cpp +) + +target_link_libraries(cuda_organized_pointcloud_adapter + ${CUDA_LIBRARIES} +) + +ament_auto_add_library(cuda_pointcloud_preprocessor SHARED + src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp +) + +target_link_libraries(cuda_pointcloud_preprocessor + ${CUDA_LIBRARIES} + cuda_pointcloud_preprocessor_lib +) + +rclcpp_components_register_node(cuda_organized_pointcloud_adapter + PLUGIN "autoware::cuda_organized_pointcloud_adapter::CudaOrganizedPointcloudAdapterNode" + EXECUTABLE cuda_organized_pointcloud_adapter_node +) + +rclcpp_components_register_node(cuda_pointcloud_preprocessor + PLUGIN "autoware::cuda_pointcloud_preprocessor::CudaPointcloudPreprocessorNode" + EXECUTABLE cuda_pointcloud_preprocessor_node +) + +install(DIRECTORY launch config + DESTINATION share/${PROJECT_NAME} +) + +ament_auto_package() + +# Set ROS_DISTRO macros +set(ROS_DISTRO $ENV{ROS_DISTRO}) +if(${ROS_DISTRO} STREQUAL "rolling") + add_compile_definitions(ROS_DISTRO_ROLLING) +elseif(${ROS_DISTRO} STREQUAL "foxy") + add_compile_definitions(ROS_DISTRO_FOXY) +elseif(${ROS_DISTRO} STREQUAL "galactic") + add_compile_definitions(ROS_DISTRO_GALACTIC) +elseif(${ROS_DISTRO} STREQUAL "humble") + add_compile_definitions(ROS_DISTRO_HUMBLE) +endif() diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/README.md b/sensing/autoware_cuda_pointcloud_preprocessor/README.md new file mode 100644 index 0000000000000..d15082b69a661 --- /dev/null +++ b/sensing/autoware_cuda_pointcloud_preprocessor/README.md @@ -0,0 +1,20 @@ +# autoware_cuda_pointcloud_preprocessor + +## Purpose + +The pointcloud preprocessing implemented in `autoware_pointcloud_preprocessor` has been thoroughly tested in autoware. However, the latency it introduces does not scale well with modern LiDAR devices due to the high number of points they introduce. + +To alleviate this issue, this package reimplements most of the pipeline presented in `autoware_pointcloud_preprocessor` leveraging the use of GPGPUs. In particular, this package makes use of CUDA to provide accelerated versions of the already establishes implementations, while also maintaining compatibility with normal ROS nodes/topics. + +## Inner-workings / Algorithms + +A detailed description of each filter's algorithm is available in the following links. + +| Filter Name | Description | Detail | +| --------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------- | +| cuda_organized_pointcloud_adapter | Organizes a pointcloud per ring/channel, so that the memory layout allows parallel processing in cuda | [link](docs/cuda-organized-pointcloud-adapter.md) | +| cuda_pointcloud_preprocessor | Implements the cropping, distortion correction, and outlier filtering (ring-based) of the `autoware_pointcloud_preprocessor`'s cpu versions. | [link](docs/cuda-pointcloud-preprocessor.md) | + +## (Optional) Future extensions / Unimplemented parts + +The subsample filters implemented in `autoware_pointcloud_preprocessor` will have similar counterparts in this package. diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/config/cuda_pointcloud_preprocessor.param.yaml b/sensing/autoware_cuda_pointcloud_preprocessor/config/cuda_pointcloud_preprocessor.param.yaml new file mode 100644 index 0000000000000..22358cfadf2a7 --- /dev/null +++ b/sensing/autoware_cuda_pointcloud_preprocessor/config/cuda_pointcloud_preprocessor.param.yaml @@ -0,0 +1,22 @@ +/**: + ros__parameters: + + base_frame: base_link + use_imu: true + use_3d_distortion_correction: false + distance_ratio: 1.03 + object_length_threshold: 0.1 + num_points_threshold: 4 + + self_crop.min_x: 1.0 + self_crop.min_y: 1.0 + self_crop.min_z: 1.0 + self_crop.max_x: -1.0 + self_crop.max_y: -1.0 + self_crop.max_z: -1.0 + mirror_crop.min_x: 1.0 + mirror_crop.min_y: 1.0 + mirror_crop.min_z: 1.0 + mirror_crop.max_x: -1.0 + mirror_crop.max_y: -1.0 + mirror_crop.max_z: -1.0 diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-organized-pointcloud-adapter.md b/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-organized-pointcloud-adapter.md new file mode 100644 index 0000000000000..634cdfb3e14f0 --- /dev/null +++ b/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-organized-pointcloud-adapter.md @@ -0,0 +1,41 @@ +# cuda_organized_pointcloud_adapter + +## Purpose + +The node `cuda_pointcloud_preprocessor` expects a 2D pointcloud where every row represents a single channel/ring and each row's points are in non decreasing azimuth order. + +To utilize the previously mentioned node, this node provides an adapter to convert standard flat pointclouds (`height` equals to 1) to the required 2D tensor. + +In addition, this node uses the `cuda_blackboard`, a cuda transport layer that enables a zero-copy mechanism between GPU and GPU memory for both input and output. + +## Inner-workings / Algorithms + +To create the required 2D tensor, this node iterates the input pointcloud sequentially, filling the output 2D tensor depending on the input point's channel. + +The output tensor's size is also estimated in this node, based on the largest `channel` value and the maximum number of points per channel observed so far. + +## Inputs / Outputs + +### Input + +| Name | Type | Description | +| -------------------- | ------------------------------- | ------------------------- | +| `~/input/pointcloud` | `sensor_msgs::msg::PointCloud2` | Input pointcloud's topic. | + +### Output + +| Name | Type | Description | +| -------------------------- | ------------------------------------------------ | ---------------------------------------- | +| `~/output/pointcloud` | `sensor_msgs::msg::PointCloud2` | Processed pointcloud's topic | +| `~/output/pointcloud/cuda` | `negotiated_interfaces/msg/NegotiatedTopicsInfo` | Processed pointcloud's negotiation topic | + +## Parameters + +### Core Parameters + +{{ json_to_markdown("sensing/autoware_cuda_pointcloud_preprocessor/schema/cuda_pointcloud_preprocessor.schema.schema.json") }} + +## Assumptions / Known limits + +- This algorithm assumes that the input points will be in non-decreasing azimuth order (per ring). +- This node expects that the input pointcloud is flat (`height` equals to 1) and follows the `autoware::point_types::PointXYZIRC` layout defined in the `autoware_point_types` package. diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-pointcloud-preprocessor.md b/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-pointcloud-preprocessor.md new file mode 100644 index 0000000000000..459f15569d7ca --- /dev/null +++ b/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-pointcloud-preprocessor.md @@ -0,0 +1,46 @@ +# cuda_pointcloud_preprocessor + +## Purpose + +This node implements all standard pointcloud preprocessing algorithms applied to a single LiDAR's pointcloud in CUDA. +In particular, this node implements: + +- crop boxing (ego-vehicle and ego-vehicle's mirrors) +- distortion correction +- ring-based outlier filtering + +## Inner-workings / Algorithms + +As this node reimplements the functionalities of the CPU-version algorithms, please have a look at the documentations of [crop-box](/sensing/autoware_pointcloud_preprocessor/docs/crop-box-filter.md), [distortion correction](/sensing/autoware_pointcloud_preprocessor/docs/distortion-corrector.md), and [ring-based outlier filter](/sensing/autoware_pointcloud_preprocessor/docs/ring-outlier-filter.md) for more information about these algorithms. + +In addition to the individual algorithms previously mentioned, this node uses the `cuda_blackboard`, a cuda transport layer that enables a zero-copy mechanism between GPU and GPU memory for both input and output. + +## Inputs / Outputs + +### Input + +| Name | Type | Description | +| ------------------------- | ------------------------------------------------ | ----------------------------------------- | +| `~/input/pointcloud` | `sensor_msgs::msg::PointCloud2` | Input pointcloud's topic. | +| `~/input/pointcloud/cuda` | `negotiated_interfaces/msg/NegotiatedTopicsInfo` | Input pointcloud's type negotiation topic | +| `~/input/twist` | `geometry_msgs::msg::TwistWithCovarianceStamped` | Topic of the twist information. | +| `~/input/imu` | `sensor_msgs::msg::Imu` | Topic of the IMU data. | + +### Output + +| Name | Type | Description | +| -------------------------- | ------------------------------------------------ | ---------------------------------------- | +| `~/output/pointcloud` | `sensor_msgs::msg::PointCloud2` | Processed pointcloud's topic | +| `~/output/pointcloud/cuda` | `negotiated_interfaces/msg/NegotiatedTopicsInfo` | Processed pointcloud's negotiation topic | + +## Parameters + +### Core Parameters + +{{ json_to_markdown("sensing/autoware_cuda_pointcloud_preprocessor/schema/cuda_pointcloud_preprocessor.schema.schema.json") }} + +## Assumptions / Known limits + +- The CUDA implementations, while following the original CPU ones, will not offer the same numerical results, and small approximations were needed to maximize the GPU use. +- This node expects that the input pointcloud follows the `autoware::point_types::PointXYZIRCAEDT` layout and the output pointcloud will use the `autoware::point_types::PointXYZIRC` layout defined in the `autoware_point_types` package. +- The input pointcloud must be a 2D tensor where each row represents a different channel/ring with its points in non-decreasing azimuth order. Invalid points should contain 0-values. The `cuda_organized_pointcloud_adapter` provides such a pointcloud. diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.hpp new file mode 100644 index 0000000000000..8930a1f87e63b --- /dev/null +++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.hpp @@ -0,0 +1,77 @@ +// Copyright 2024 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +//    http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef AUTOWARE__CUDA_ORGANIZED_POINTCLOUD_ADAPTER__CUDA_ORGANIZED_POINTCLOUD_ADAPTER_NODE_HPP_ +#define AUTOWARE__CUDA_ORGANIZED_POINTCLOUD_ADAPTER__CUDA_ORGANIZED_POINTCLOUD_ADAPTER_NODE_HPP_ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include +#include +#include +#include + +namespace autoware::cuda_organized_pointcloud_adapter +{ + +class CudaOrganizedPointcloudAdapterNode : public rclcpp::Node +{ +public: + explicit CudaOrganizedPointcloudAdapterNode(const rclcpp::NodeOptions & node_options); + ~CudaOrganizedPointcloudAdapterNode() = default; + +private: + void estimatePointcloudRingInfo( + const sensor_msgs::msg::PointCloud2::ConstSharedPtr input_pointcloud_msg_ptr); + + bool orderPointcloud( + const sensor_msgs::msg::PointCloud2::ConstSharedPtr input_pointcloud_msg_ptr); + + // Callback + void pointcloudCallback( + const sensor_msgs::msg::PointCloud2::ConstSharedPtr input_pointcloud_msg_ptr); + + // Subscriber + rclcpp::Subscription::SharedPtr pointcloud_sub_{}; + + // CUDA pub + std::unique_ptr> pub_; + + std::size_t num_rings_{0}; + std::size_t max_points_per_ring_{0}; + + std::vector next_ring_index_; + std::vector buffer_; + // autoware::point_types::PointXYZIRCAEDT * device_buffer_; + cuda_blackboard::CudaUniquePtr device_buffer_; + + std::unique_ptr> stop_watch_ptr_; + std::unique_ptr debug_publisher_; +}; + +} // namespace autoware::cuda_organized_pointcloud_adapter + +#endif // AUTOWARE__CUDA_ORGANIZED_POINTCLOUD_ADAPTER__CUDA_ORGANIZED_POINTCLOUD_ADAPTER_NODE_HPP_ diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp new file mode 100644 index 0000000000000..b4ae4eecc304e --- /dev/null +++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp @@ -0,0 +1,154 @@ +// Copyright 2024 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +//    http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef AUTOWARE__CUDA_POINTCLOUD_PREPROCESSOR__CUDA_POINTCLOUD_PREPROCESSOR_HPP_ +#define AUTOWARE__CUDA_POINTCLOUD_PREPROCESSOR__CUDA_POINTCLOUD_PREPROCESSOR_HPP_ + +#include "autoware/cuda_pointcloud_preprocessor/point_types.hpp" + +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include +#include + +namespace autoware::cuda_pointcloud_preprocessor +{ + +struct TwistStruct2D +{ + float cum_x; + float cum_y; + float cum_theta; + float cum_cos_theta; + float cum_sin_theta; + uint32_t last_stamp_nsec; // relative to the start of the pointcloud + uint32_t stamp_nsec; // relative to the start of the pointcloud + float vx; + float vtheta; +}; + +struct TwistStruct3D +{ + // Eigen::Affine3f cum_transform; + float cum_transform_buffer[16]; + /* float cum_x; + float cum_y; + float cum_theta; */ + /* float cum_cos_theta; + float cum_sin_theta; */ + uint32_t last_stamp_nsec; // relative to the start of the pointcloud + uint32_t stamp_nsec; // relative to the start of the pointcloud + /* float vx; + float vtheta; */ + float v[3]; + float w[3]; +}; + +struct TransformStruct +{ + float x; + float y; + float z; + float m11; + float m12; + float m13; + float m21; + float m22; + float m23; + float m31; + float m32; + float m33; +}; + +struct CropBoxParameters +{ + float min_x{std::nanf("")}; + float max_x{std::nanf("")}; + float min_y{std::nanf("")}; + float max_y{std::nanf("")}; + float min_z{std::nanf("")}; + float max_z{std::nanf("")}; +}; + +struct RingOutlierFilterParameters +{ + float distance_ratio{std::nanf("")}; + float object_length_threshold{std::nanf("")}; + std::size_t num_points_threshold{0}; +}; + +class CudaPointcloudPreprocessor +{ +public: + CudaPointcloudPreprocessor(); + ~CudaPointcloudPreprocessor() = default; + + void setCropBoxParameters( + const CropBoxParameters & self_crop_box_parameters, + const CropBoxParameters & mirror_crop_box_parameters); + void setRingOutlierFilterParameters(const RingOutlierFilterParameters & ring_outlier_parameters); + void set3DUndistortion(bool value); + + void preallocateOutput(); + + std::unique_ptr process( + const cuda_blackboard::CudaPointCloud2 & input_pointcloud_msg, + const geometry_msgs::msg::TransformStamped & transform_msg, + const std::deque & twist_queue, + const std::deque & angular_velocity_queue); + +private: + void setupTwist2DStructs( + const cuda_blackboard::CudaPointCloud2 & input_pointcloud_msg, + const std::deque & twist_queue, + const std::deque & angular_velocity_queue); + + void setupTwist3DStructs( + const cuda_blackboard::CudaPointCloud2 & input_pointcloud_msg, + const std::deque & twist_queue, + const std::deque & angular_velocity_queue); + + CropBoxParameters self_crop_box_parameters_{}; + CropBoxParameters mirror_crop_box_parameters_{}; + RingOutlierFilterParameters ring_outlier_parameters_{}; + bool use_3d_undistortion_{false}; + + int max_rings_{}; + int max_points_per_ring_{}; + + std::vector point_fields_{}; + std::unique_ptr output_pointcloud_ptr_{}; + + thrust::device_vector device_transformed_points_{}; + thrust::device_vector device_output_points_{}; + thrust::device_vector device_self_crop_mask_{}; + thrust::device_vector device_mirror_crop_mask_{}; + thrust::device_vector device_ring_outlier_mask_{}; + thrust::device_vector device_indices_{}; + thrust::device_vector device_twist_2d_structs_{}; + thrust::device_vector device_twist_3d_structs_{}; +}; + +} // namespace autoware::cuda_pointcloud_preprocessor + +#endif // AUTOWARE__CUDA_POINTCLOUD_PREPROCESSOR__CUDA_POINTCLOUD_PREPROCESSOR_HPP_ diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp new file mode 100644 index 0000000000000..5fdf4258eebcb --- /dev/null +++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp @@ -0,0 +1,114 @@ +// Copyright 2024 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +//    http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef AUTOWARE__CUDA_POINTCLOUD_PREPROCESSOR__CUDA_POINTCLOUD_PREPROCESSOR_NODE_HPP_ +#define AUTOWARE__CUDA_POINTCLOUD_PREPROCESSOR__CUDA_POINTCLOUD_PREPROCESSOR_NODE_HPP_ + +#include "autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp" +#include "autoware/cuda_pointcloud_preprocessor/point_types.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +#define CHECK_OFFSET(structure1, structure2, field) \ + static_assert( \ + offsetof(structure1, field) == offsetof(structure2, field), \ + "Offset of " #field " in " #structure1 " does not match expected offset.") + +namespace autoware::cuda_pointcloud_preprocessor +{ + +static_assert(sizeof(InputPointType) == sizeof(autoware::point_types::PointXYZIRCAEDT)); +static_assert(sizeof(OutputPointType) == sizeof(autoware::point_types::PointXYZIRC)); + +CHECK_OFFSET(InputPointType, autoware::point_types::PointXYZIRCAEDT, x); +CHECK_OFFSET(InputPointType, autoware::point_types::PointXYZIRCAEDT, y); +CHECK_OFFSET(InputPointType, autoware::point_types::PointXYZIRCAEDT, z); +CHECK_OFFSET(InputPointType, autoware::point_types::PointXYZIRCAEDT, intensity); +CHECK_OFFSET(InputPointType, autoware::point_types::PointXYZIRCAEDT, return_type); +CHECK_OFFSET(InputPointType, autoware::point_types::PointXYZIRCAEDT, channel); +CHECK_OFFSET(InputPointType, autoware::point_types::PointXYZIRCAEDT, azimuth); +CHECK_OFFSET(InputPointType, autoware::point_types::PointXYZIRCAEDT, elevation); +CHECK_OFFSET(InputPointType, autoware::point_types::PointXYZIRCAEDT, distance); +CHECK_OFFSET(InputPointType, autoware::point_types::PointXYZIRCAEDT, time_stamp); + +CHECK_OFFSET(OutputPointType, autoware::point_types::PointXYZIRCAEDT, x); +CHECK_OFFSET(OutputPointType, autoware::point_types::PointXYZIRCAEDT, y); +CHECK_OFFSET(OutputPointType, autoware::point_types::PointXYZIRCAEDT, z); +CHECK_OFFSET(OutputPointType, autoware::point_types::PointXYZIRCAEDT, intensity); +CHECK_OFFSET(OutputPointType, autoware::point_types::PointXYZIRCAEDT, return_type); +CHECK_OFFSET(OutputPointType, autoware::point_types::PointXYZIRCAEDT, channel); + +class CudaPointcloudPreprocessorNode : public rclcpp::Node +{ +public: + explicit CudaPointcloudPreprocessorNode(const rclcpp::NodeOptions & node_options); + ~CudaPointcloudPreprocessorNode() = default; + +private: + bool getTransform( + const std::string & target_frame, const std::string & source_frame, + tf2::Transform * tf2_transform_ptr); + + // Callback + void pointcloudCallback(const std::shared_ptr cuda_msg); + void twistCallback( + const geometry_msgs::msg::TwistWithCovarianceStamped::ConstSharedPtr pointcloud_msg); + void imuCallback(const sensor_msgs::msg::Imu::ConstSharedPtr imu_msg); + + tf2_ros::Buffer tf2_buffer_; + tf2_ros::TransformListener tf2_listener_; + + std::string base_frame_; + bool imu_tranform_valid_{false}; + std::deque twist_queue_; + std::deque angular_velocity_queue_; + + // Subscriber + rclcpp::Subscription::SharedPtr twist_sub_{}; + rclcpp::Subscription::SharedPtr imu_sub_{}; + + // CUDA pub & sub + std::unique_ptr> pub_; + std::unique_ptr> sub_; + + std::unique_ptr cuda_pointcloud_preprocessor_; + + std::unique_ptr> stop_watch_ptr_; + std::unique_ptr debug_publisher_; +}; + +} // namespace autoware::cuda_pointcloud_preprocessor + +#endif // AUTOWARE__CUDA_POINTCLOUD_PREPROCESSOR__CUDA_POINTCLOUD_PREPROCESSOR_NODE_HPP_ diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/point_types.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/point_types.hpp new file mode 100644 index 0000000000000..50010ac0f2b67 --- /dev/null +++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/point_types.hpp @@ -0,0 +1,50 @@ + +// Copyright 2024 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +//    http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#ifndef AUTOWARE__CUDA_POINTCLOUD_PREPROCESSOR__POINT_TYPES_HPP_ +#define AUTOWARE__CUDA_POINTCLOUD_PREPROCESSOR__POINT_TYPES_HPP_ + +#include + +namespace autoware::cuda_pointcloud_preprocessor +{ + +// Note: We can not use PCL nor uniform initialization here because of thrust +struct OutputPointType +{ + float x; + float y; + float z; + std::uint8_t intensity; + std::uint8_t return_type; + std::uint16_t channel; +}; + +struct InputPointType +{ + float x; + float y; + float z; + std::uint8_t intensity; + std::uint8_t return_type; + std::uint16_t channel; + float azimuth; + float elevation; + float distance; + std::uint32_t time_stamp; +}; + +} // namespace autoware::cuda_pointcloud_preprocessor + +#endif // AUTOWARE__CUDA_POINTCLOUD_PREPROCESSOR__POINT_TYPES_HPP_ diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/launch/cuda_pointcloud_preprocessor.launch.xml b/sensing/autoware_cuda_pointcloud_preprocessor/launch/cuda_pointcloud_preprocessor.launch.xml new file mode 100644 index 0000000000000..e8843bd804bf0 --- /dev/null +++ b/sensing/autoware_cuda_pointcloud_preprocessor/launch/cuda_pointcloud_preprocessor.launch.xml @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/package.xml b/sensing/autoware_cuda_pointcloud_preprocessor/package.xml new file mode 100644 index 0000000000000..edbd7ff61c874 --- /dev/null +++ b/sensing/autoware_cuda_pointcloud_preprocessor/package.xml @@ -0,0 +1,32 @@ + + + autoware_cuda_pointcloud_preprocessor + 0.1.0 + autoware_cuda_pointcloud_preprocessor + Kenzo Lobos-Tsunekawa + Kenzo Lobos-Tsunekawa + Apache License 2.0 + + ament_cmake_auto + autoware_cmake + + autoware_point_types + autoware_universe_utils + cuda_blackboard + pcl_conversions + pcl_ros + rclcpp + rclcpp_components + sensor_msgs + sophus + tf2 + tier4_debug_msgs + + ament_clang_format + ament_lint_auto + ament_lint_common + + + ament_cmake + + diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/schema/cuda_pointcloud_preprocessor.schema.json b/sensing/autoware_cuda_pointcloud_preprocessor/schema/cuda_pointcloud_preprocessor.schema.json new file mode 100644 index 0000000000000..196f455ffe75b --- /dev/null +++ b/sensing/autoware_cuda_pointcloud_preprocessor/schema/cuda_pointcloud_preprocessor.schema.json @@ -0,0 +1,140 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "Parameters for the cuda_pointcloud_preprocessor", + "type": "object", + "definitions": { + "crop_box_filter": { + "type": "object", + "properties": { + "base_frame": { + "type": "string", + "description": "The undistortion algorithm is based on a base frame, which must be the same as the twist frame.", + "default": "base_link" + }, + "use_imu": { + "type": "boolean", + "description": "Use IMU angular velocity, otherwise, use twist angular velocity.", + "default": "true" + }, + "use_3d_distortion_correction": { + "type": "boolean", + "description": "Use 3d distortion correction algorithm, otherwise, use 2d distortion correction algorithm.", + "default": "false" + }, + "distance_ratio": { + "type": "number", + "description": "distance_ratio", + "default": "1.03", + "minimum": 0.0 + }, + "object_length_threshold": { + "type": "number", + "description": "object_length_threshold", + "default": "0.1", + "minimum": 0.0 + }, + "num_points_threshold": { + "type": "integer", + "description": "num_points_threshold", + "default": "4", + "minimum": 0 + }, + "self_crop.min_x": { + "type": "number", + "description": "minimum x-coordinate value for self cropping in meters", + "default": "-1.0" + }, + "self_crop.min_y": { + "type": "number", + "description": "minimum y-coordinate value for self cropping in meters", + "default": "-1.0" + }, + "self_crop.min_z": { + "type": "number", + "description": "minimum z-coordinate value for self cropping in meters", + "default": "-1.0" + }, + "self_crop.max_x": { + "type": "number", + "description": "maximum x-coordinate value for self cropping in meters", + "default": "1.0" + }, + "self_crop.max_y": { + "type": "number", + "description": "maximum y-coordinate value for self cropping in meters", + "default": "1.0" + }, + "self_crop.max_z": { + "type": "number", + "description": "maximum z-coordinate value for self cropping in meters", + "default": "1.0" + }, + "mirror_crop.min_x": { + "type": "number", + "description": "minimum x-coordinate value for mirror cropping in meters", + "default": "-1.0" + }, + "mirror_crop.min_y": { + "type": "number", + "description": "minimum y-coordinate value for mirror cropping in meters", + "default": "-1.0" + }, + "mirror_crop.min_z": { + "type": "number", + "description": "minimum z-coordinate value for mirror cropping in meters", + "default": "-1.0" + }, + "mirror_crop.max_x": { + "type": "number", + "description": "maximum x-coordinate value for mirror cropping in meters", + "default": "1.0" + }, + "mirror_crop.max_y": { + "type": "number", + "description": "maximum y-coordinate value for mirror cropping in meters", + "default": "1.0" + }, + "mirror_crop.max_z": { + "type": "number", + "description": "maximum z-coordinate value for mirror cropping in meters", + "default": "1.0" + } + }, + "required": [ + "base_frame", + "use_imu", + "use_3d_distortion_correction", + "distance_ratio", + "object_length_threshold", + "num_points_threshold", + "self_crop.min_x", + "self_crop.min_y", + "self_crop.min_z", + "self_crop.max_x", + "self_crop.max_y", + "self_crop.max_z", + "mirror_crop.min_x", + "mirror_crop.min_y", + "mirror_crop.min_z", + "mirror_crop.max_x", + "mirror_crop.max_y", + "mirror_crop.max_z" + ], + "additionalProperties": false + } + }, + "properties": { + "/**": { + "type": "object", + "properties": { + "ros__parameters": { + "$ref": "#/definitions/cuda_pointcloud_preprocessor" + } + }, + "required": ["ros__parameters"], + "additionalProperties": false + } + }, + "required": ["/**"], + "additionalProperties": false +} diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.cpp b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.cpp new file mode 100644 index 0000000000000..bd5717fbbcb5e --- /dev/null +++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.cpp @@ -0,0 +1,203 @@ +// Copyright 2024 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +//    http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "autoware/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.hpp" + +#include + +#include + +namespace autoware::cuda_organized_pointcloud_adapter +{ +using sensor_msgs::msg::PointCloud2; + +CudaOrganizedPointcloudAdapterNode::CudaOrganizedPointcloudAdapterNode( + const rclcpp::NodeOptions & node_options) +: Node("cuda_organized_pointcloud_adapter", node_options) +{ + pub_ = + std::make_unique>( + *this, "~/output/pointcloud"); + + rclcpp::SubscriptionOptions sub_options; + sub_options.use_intra_process_comm = rclcpp::IntraProcessSetting::Enable; + + pointcloud_sub_ = this->create_subscription( + "~/input/pointcloud", rclcpp::SensorDataQoS{}.keep_last(1), + std::bind(&CudaOrganizedPointcloudAdapterNode::pointcloudCallback, this, std::placeholders::_1), + sub_options); + + // initialize debug tool + { + using autoware::universe_utils::DebugPublisher; + using autoware::universe_utils::StopWatch; + + stop_watch_ptr_ = std::make_unique>(); + debug_publisher_ = std::make_unique(this, "cuda_organized_pointcloud_adapter"); + stop_watch_ptr_->tic("processing_time"); + } +} + +void CudaOrganizedPointcloudAdapterNode::estimatePointcloudRingInfo( + const sensor_msgs::msg::PointCloud2::ConstSharedPtr input_pointcloud_msg_ptr) +{ + const autoware::point_types::PointXYZIRCAEDT * input_buffer = + reinterpret_cast( + input_pointcloud_msg_ptr->data.data()); + + std::size_t max_ring = 0; + + for (std::size_t i = 0; i < input_pointcloud_msg_ptr->width * input_pointcloud_msg_ptr->height; + i++) { + const autoware::point_types::PointXYZIRCAEDT & point = input_buffer[i]; + const std::size_t ring = static_cast(point.channel); + max_ring = std::max(max_ring, ring); + } + + // Set max rings to the next power of two + num_rings_ = std::pow(2, std::ceil(std::log2(max_ring + 1))); + num_rings_ = std::max(num_rings_, static_cast(16)); + std::vector ring_points(num_rings_, 0); + + for (std::size_t i = 0; i < input_pointcloud_msg_ptr->width * input_pointcloud_msg_ptr->height; + i++) { + const autoware::point_types::PointXYZIRCAEDT & point = input_buffer[i]; + const std::size_t ring = point.channel; + ring_points[ring]++; + } + + // Set max points per ring to the next multiple of 512 + max_points_per_ring_ = *std::max_element(ring_points.begin(), ring_points.end()); + max_points_per_ring_ = std::max(max_points_per_ring_, static_cast(512)); + max_points_per_ring_ = (max_points_per_ring_ + 511) / 512 * 512; + + next_ring_index_.resize(num_rings_); + std::fill(next_ring_index_.begin(), next_ring_index_.end(), 0); + buffer_.resize(num_rings_ * max_points_per_ring_); + + /* if (device_buffer_ != nullptr) { + cudaFree(device_buffer_); + } */ + + device_buffer_ = cuda_blackboard::make_unique( + num_rings_ * max_points_per_ring_ * sizeof(autoware::point_types::PointXYZIRCAEDT)); + + /* cudaMalloc( + device_buffer_, + num_rings_ * max_points_per_ring_ * sizeof(autoware::point_types::PointXYZIRCAEDT)); */ + + RCLCPP_INFO_STREAM( + get_logger(), "Estimated rings: " << num_rings_ + << ", max_points_per_ring: " << max_points_per_ring_ + << ". This should only be done during the first iterations. " + "Otherwise, performance will be affected."); +} + +bool CudaOrganizedPointcloudAdapterNode::orderPointcloud( + const sensor_msgs::msg::PointCloud2::ConstSharedPtr input_pointcloud_msg_ptr) +{ + const autoware::point_types::PointXYZIRCAEDT * input_buffer = + reinterpret_cast( + input_pointcloud_msg_ptr->data.data()); + + bool ring_overflow = false; + bool point_overflow = false; + + for (std::size_t i = 0; i < input_pointcloud_msg_ptr->width * input_pointcloud_msg_ptr->height; + i++) { + const autoware::point_types::PointXYZIRCAEDT & point = input_buffer[i]; + const std::size_t raw_ring = point.channel; + const std::size_t ring = raw_ring % num_rings_; + + const std::size_t raw_index = next_ring_index_[ring]; + const std::size_t index = raw_index % max_points_per_ring_; + + ring_overflow |= raw_ring >= num_rings_; + point_overflow |= raw_index >= max_points_per_ring_; + + buffer_[ring * max_points_per_ring_ + index] = point; + next_ring_index_[ring] = (index + 1) % max_points_per_ring_; + } + + return !ring_overflow && !point_overflow; +} + +void CudaOrganizedPointcloudAdapterNode::pointcloudCallback( + const sensor_msgs::msg::PointCloud2::ConstSharedPtr input_pointcloud_msg_ptr) +{ + stop_watch_ptr_->toc("processing_time", true); + + // TODO(knzo25): check the pointcloud layout at least once + + assert(input_pointcloud_msg_ptr->point_step == sizeof(autoware::point_types::PointXYZIRCAEDT)); + + if (num_rings_ == 0 || max_points_per_ring_ == 0) { + estimatePointcloudRingInfo(input_pointcloud_msg_ptr); + } + + if (!orderPointcloud(input_pointcloud_msg_ptr)) { + estimatePointcloudRingInfo(input_pointcloud_msg_ptr); + orderPointcloud(input_pointcloud_msg_ptr); + } + + // Copy to cuda memory + cudaMemcpy( + device_buffer_.get(), buffer_.data(), + num_rings_ * max_points_per_ring_ * sizeof(autoware::point_types::PointXYZIRCAEDT), + cudaMemcpyHostToDevice); + + auto cuda_pointcloud_msg_ptr = std::make_unique(); + cuda_pointcloud_msg_ptr->width = max_points_per_ring_; + cuda_pointcloud_msg_ptr->height = num_rings_; + cuda_pointcloud_msg_ptr->point_step = sizeof(autoware::point_types::PointXYZIRCAEDT); + cuda_pointcloud_msg_ptr->row_step = + max_points_per_ring_ * sizeof(autoware::point_types::PointXYZIRCAEDT); + cuda_pointcloud_msg_ptr->data = + std::move(device_buffer_); /*reinterpret_cast(device_buffer_)*/ + ; + cuda_pointcloud_msg_ptr->is_dense = input_pointcloud_msg_ptr->is_dense; + cuda_pointcloud_msg_ptr->header = input_pointcloud_msg_ptr->header; + + pub_->publish(std::move(cuda_pointcloud_msg_ptr)); + + if (debug_publisher_) { + const double processing_time_ms = stop_watch_ptr_->toc("processing_time", true); + debug_publisher_->publish( + "debug/processing_time_ms", processing_time_ms); + + double now_stamp_seconds = rclcpp::Time(this->get_clock()->now()).seconds(); + double cloud_stamp_seconds = rclcpp::Time(input_pointcloud_msg_ptr->header.stamp).seconds(); + + debug_publisher_->publish( + "debug/latency_ms", 1000.f * (now_stamp_seconds - cloud_stamp_seconds)); + } + + // Allocate cuda memory + device_buffer_ = cuda_blackboard::make_unique( + num_rings_ * max_points_per_ring_ * sizeof(autoware::point_types::PointXYZIRCAEDT)); + /* cudaMalloc( + &device_buffer_, + num_rings_ * max_points_per_ring_ * sizeof(autoware::point_types::PointXYZIRCAEDT)); */ + // Clear indexes + std::fill(next_ring_index_.begin(), next_ring_index_.end(), 0); + + // Clear pointcloud buffer + std::fill(buffer_.begin(), buffer_.end(), autoware::point_types::PointXYZIRCAEDT{}); +} + +} // namespace autoware::cuda_organized_pointcloud_adapter + +#include "rclcpp_components/register_node_macro.hpp" +RCLCPP_COMPONENTS_REGISTER_NODE( + autoware::cuda_organized_pointcloud_adapter::CudaOrganizedPointcloudAdapterNode) diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu new file mode 100644 index 0000000000000..ad8910aef5b76 --- /dev/null +++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu @@ -0,0 +1,692 @@ +// Copyright 2024 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +//    http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp" +#include "autoware/cuda_pointcloud_preprocessor/point_types.hpp" + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace autoware::cuda_pointcloud_preprocessor +{ + +__host__ __device__ Eigen::Matrix3f skewSymmetric(const Eigen::Vector3f & v) +{ + Eigen::Matrix3f m; + m << 0, -v.z(), v.y(), v.z(), 0, -v.x(), -v.y(), v.x(), 0; + return m; +} + +__host__ __device__ Eigen::Matrix3f leftJacobianSO3(const Eigen::Vector3f & omega) +{ + double theta = omega.norm(); + if (std::abs(theta) < 1e-6) { + return Eigen::Matrix3f::Identity(); + } + + Eigen::Matrix3f Omega = skewSymmetric(omega); + + Eigen::Matrix3f Omega2 = Omega * Omega; + double theta2 = theta * theta; + double theta3 = theta2 * theta; + + // Rodrigues' formula for Jacobian + return Eigen::Matrix3f::Identity() + ((1 - cos(theta)) / theta2) * Omega + + ((theta - sin(theta)) / theta3) * Omega2; +} + +__host__ __device__ Eigen::Matrix4f transformationMatrixFromVelocity( + const Eigen::Vector3f & linear_velocity, const Eigen::Vector3f & angular_velocity, double dt) +{ + Eigen::Matrix3f R = Eigen::AngleAxisf(angular_velocity.norm() * dt, angular_velocity.normalized()) + .toRotationMatrix(); + Eigen::Matrix3f J = leftJacobianSO3(angular_velocity * dt); + + Eigen::Vector3f translation = J * (linear_velocity * dt); + + Eigen::Matrix4f transformation = Eigen::Matrix4f::Identity(); + transformation.block<3, 3>(0, 0) = R; + transformation.block<3, 1>(0, 3) = translation; + + return transformation; +} + +__global__ void transformPointsKernel( + const InputPointType * input_points, InputPointType * output_points, int num_points, + TransformStruct transform) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < num_points) { + output_points[idx] = input_points[idx]; + + const float x = input_points[idx].x; + const float y = input_points[idx].y; + const float z = input_points[idx].z; + + output_points[idx].x = transform.m11 * x + transform.m12 * y + transform.m13 * z + transform.x; + output_points[idx].y = transform.m21 * x + transform.m22 * y + transform.m23 * z + transform.y; + output_points[idx].z = transform.m31 * x + transform.m32 * y + transform.m33 * z + transform.z; + } +} + +__global__ void cropBoxKernel( + InputPointType * d_points, uint32_t * output_mask, int num_points, float min_x, float min_y, + float min_z, float max_x, float max_y, float max_z) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < num_points) { + const float x = d_points[idx].x; + const float y = d_points[idx].y; + const float z = d_points[idx].z; + + if (x > min_x && x < max_x && y > min_y && y < max_y && z > min_z && z < max_z) { + output_mask[idx] = 0; + } else { + output_mask[idx] = 1; + } + } +} + +__global__ void combineMasksKernel( + uint32_t * mask1, uint32_t * mask2, uint32_t * mask3, int num_points, uint32_t * output_mask) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < num_points) { + output_mask[idx] = mask1[idx] & mask2[idx] & mask3[idx]; + } +} + +__global__ void extractInputPointIndicesKernel( + InputPointType * input_points, uint32_t * masks, uint32_t * indices, int num_points, + InputPointType * output_points) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < num_points && masks[idx] == 1) { + output_points[indices[idx] - 1] = input_points[idx]; + } +} + +__global__ void extractOutputPointIndicesKernel( + OutputPointType * input_points, uint32_t * masks, uint32_t * indices, int num_points, + OutputPointType * output_points) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < num_points && masks[idx] == 1) { + output_points[indices[idx] - 1] = input_points[idx]; + } +} + +__global__ void extractInputPointsToOutputPoints_indicesKernel( + InputPointType * input_points, uint32_t * masks, uint32_t * indices, int num_points, + OutputPointType * output_points) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < num_points && masks[idx] == 1) { + InputPointType & input_point = input_points[idx]; + OutputPointType & output_point = output_points[indices[idx] - 1]; + output_point.x = input_point.x; + output_point.y = input_point.y; + output_point.z = input_point.z; + output_point.intensity = input_point.intensity; + output_point.return_type = input_point.return_type; + output_point.channel = input_point.channel; + } +} + +__global__ void undistort2dKernel( + InputPointType * input_points, int num_points, TwistStruct2D * twist_structs, int num_twists) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < num_points) { + InputPointType & point = input_points[idx]; + + // The twist must always be newer than the point ! (or it was the last twist) + int twist_index = 0; + while (twist_index < num_twists && twist_structs[twist_index].stamp_nsec < point.time_stamp) { + twist_index++; + } + + twist_index = min(twist_index, num_twists - 1); + + TwistStruct2D twist = twist_structs[twist_index]; + float x = twist.cum_x; + float y = twist.cum_y; + float theta = twist.cum_theta; + + double dt_nsec = + point.time_stamp > twist.last_stamp_nsec ? point.time_stamp - twist.last_stamp_nsec : 0; + double dt = 1e-9 * (dt_nsec); + + theta += twist.vtheta * dt; + float d = twist.vx * dt; + x += d * cos(theta); + y += d * sin(theta); + + float distorted_x = point.x; + float distorted_y = point.y; + + point.x = distorted_x * cos(theta) - distorted_y * sin(theta) + x; + point.y = distorted_x * sin(theta) + distorted_y * cos(theta) + y; + } +} + +__global__ void undistort3dKernel( + InputPointType * input_points, int num_points, TwistStruct3D * twist_structs, int num_twists) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < num_points) { + InputPointType & point = input_points[idx]; + + // The twist must always be newer than the point ! (or it was the last twist) + int twist_index = 0; + while (twist_index < num_twists && twist_structs[twist_index].stamp_nsec < point.time_stamp) { + twist_index++; + } + + twist_index = min(twist_index, num_twists - 1); + + TwistStruct3D twist = twist_structs[twist_index]; + Eigen::Map cum_transform_buffer_map(twist.cum_transform_buffer); + Eigen::Map v_map(twist.v); + Eigen::Map w_map(twist.w); + + double dt_nsec = + point.time_stamp > twist.last_stamp_nsec ? point.time_stamp - twist.last_stamp_nsec : 0; + double dt = 1e-9 * (dt_nsec); + + Eigen::Matrix4f transform = + cum_transform_buffer_map * transformationMatrixFromVelocity(v_map, w_map, dt); + Eigen::Vector3f p(point.x, point.y, point.z); + p = transform.block<3, 3>(0, 0) * p + transform.block<3, 1>(0, 3); + + point.x = p.x(); + point.y = p.y(); + point.z = p.z(); + } +} + +__global__ void ringOutlierFilterKernel( + const InputPointType * d_points, uint32_t * output_mask, int num_rings, int max_points_per_ring, + float distance_ratio, float object_length_threshold_squared, int num_points_threshold) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int j = idx / max_points_per_ring; + int i = idx % max_points_per_ring; + + if (j >= num_rings || i >= max_points_per_ring) { + return; + } + + int min_i = max(i - num_points_threshold, 0); + int max_i = min(i + num_points_threshold, max_points_per_ring); + + int walk_size = 1; + int left_idx = min_i; + int right_idx = min_i + 1; + + for (int k = min_i; k < max_i - 1; k++) { + const InputPointType & left_point = d_points[j * max_points_per_ring + k]; + const InputPointType & right_point = d_points[j * max_points_per_ring + k + 1]; + + // Find biggest walk that passes through i + float azimuth_diff = right_point.azimuth - left_point.azimuth; + azimuth_diff = azimuth_diff < 0.f ? azimuth_diff + 2 * M_PI : azimuth_diff; + + if ( + max(left_point.distance, right_point.distance) < + min(left_point.distance, right_point.distance) * distance_ratio && + azimuth_diff < 1.f * M_PI / 180.f) { + // Determined to be included in the same walk + walk_size++; + right_idx++; + } else if (k >= i) { + break; + } else { + walk_size = 1; + left_idx = k + 1; + right_idx = k + 2; // this is safe since we break if k >= i + } + } + + const InputPointType & left_point = d_points[j * max_points_per_ring + left_idx]; + const InputPointType & right_point = d_points[j * max_points_per_ring + right_idx - 1]; + const float x = left_point.x - right_point.x; + const float y = left_point.y - right_point.y; + const float z = left_point.z - right_point.z; + + output_mask[j * max_points_per_ring + i] = + ((walk_size > num_points_threshold) || + (x * x + y * y + z * z >= object_length_threshold_squared)) + ? 1 + : 0; +} + +__global__ void transformPointTypeKernel( + const InputPointType * device_input_points, int num_points, OutputPointType * device_ouput_points) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < num_points) { + const InputPointType & input_point = device_input_points[idx]; + OutputPointType & output_point = device_ouput_points[idx]; + + output_point.x = input_point.x; + output_point.y = input_point.y; + output_point.z = input_point.z; + output_point.intensity = (float)input_point.intensity; + } +} + +CudaPointcloudPreprocessor::CudaPointcloudPreprocessor() +{ + sensor_msgs::msg::PointField x_field, y_field, z_field, intensity_field, return_type_field, + channel_field, azimuth_field, elevation_field, distance_field, time_stamp_field; + x_field.name = "x"; + x_field.offset = 0; + x_field.datatype = sensor_msgs::msg::PointField::FLOAT32; + x_field.count = 1; + + y_field.name = "y"; + y_field.offset = 4; + y_field.datatype = sensor_msgs::msg::PointField::FLOAT32; + y_field.count = 1; + + z_field.name = "z"; + z_field.offset = 8; + z_field.datatype = sensor_msgs::msg::PointField::FLOAT32; + z_field.count = 1; + + intensity_field.name = "intensity"; + intensity_field.offset = 12; + intensity_field.datatype = sensor_msgs::msg::PointField::UINT8; + intensity_field.count = 1; + + return_type_field.name = "return_type"; + return_type_field.offset = 13; + return_type_field.datatype = sensor_msgs::msg::PointField::UINT8; + return_type_field.count = 1; + + channel_field.name = "channel"; + channel_field.offset = 14; + channel_field.datatype = sensor_msgs::msg::PointField::UINT16; + channel_field.count = 1; + + static_assert(sizeof(OutputPointType) == 16, "OutputPointType size is not 16 bytes"); + static_assert(offsetof(OutputPointType, x) == 0); + static_assert(offsetof(OutputPointType, y) == 4); + static_assert(offsetof(OutputPointType, z) == 8); + static_assert(offsetof(OutputPointType, intensity) == 12); + static_assert(offsetof(OutputPointType, return_type) == 13); + static_assert(offsetof(OutputPointType, channel) == 14); + + point_fields_.push_back(x_field); + point_fields_.push_back(y_field); + point_fields_.push_back(z_field); + point_fields_.push_back(intensity_field); + point_fields_.push_back(return_type_field); + point_fields_.push_back(channel_field); +} + +void CudaPointcloudPreprocessor::setCropBoxParameters( + const CropBoxParameters & self_crop_box_parameters, + const CropBoxParameters & mirror_crop_box_parameters) +{ + self_crop_box_parameters_ = self_crop_box_parameters; + mirror_crop_box_parameters_ = mirror_crop_box_parameters; +} + +void CudaPointcloudPreprocessor::setRingOutlierFilterParameters( + const RingOutlierFilterParameters & ring_outlier_parameters) +{ + ring_outlier_parameters_ = ring_outlier_parameters; +} + +void CudaPointcloudPreprocessor::set3DUndistortion(bool use_3d_undistortion) +{ + use_3d_undistortion_ = use_3d_undistortion; +} + +void CudaPointcloudPreprocessor::preallocateOutput() +{ + output_pointcloud_ptr_ = std::make_unique(); + output_pointcloud_ptr_->data = cuda_blackboard::make_unique( + max_rings_ * max_points_per_ring_ * sizeof(OutputPointType)); +} + +void CudaPointcloudPreprocessor::setupTwist2DStructs( + const cuda_blackboard::CudaPointCloud2 & input_pointcloud_msg, + const std::deque & twist_queue, + const std::deque & angular_velocity_queue) +{ + const InputPointType * device_input_points = + reinterpret_cast(input_pointcloud_msg.data.get()); + InputPointType first_point; + cudaMemcpy(&first_point, &device_input_points[0], sizeof(InputPointType), cudaMemcpyDeviceToHost); + + // Twist preprocessing + + uint64_t pointcloud_stamp_nsec = 1'000'000'000 * input_pointcloud_msg.header.stamp.sec + + input_pointcloud_msg.header.stamp.nanosec; + + thrust::host_vector host_twist_structs; + + float cum_x = 0; + float cum_y = 0; + float cum_theta = 0; + // All time stamps from now on are in nsec from the "beginning of the pointcloud" + uint32_t last_stamp_nsec = first_point.time_stamp; + + std::size_t twist_index = 0; + std::size_t angular_velocity_index = 0; + + for (; twist_index < twist_queue.size() || + angular_velocity_index < angular_velocity_queue.size();) { + uint64_t twist_stamp, input_twist_global_stamp_nsec, angular_velocity_global_stamp_nsec; + float vx, vtheta; + + if (twist_index < twist_queue.size()) { + input_twist_global_stamp_nsec = + 1'000'000'000 * static_cast(twist_queue[twist_index].header.stamp.sec) + + static_cast(twist_queue[twist_index].header.stamp.nanosec); + vx = twist_queue[twist_index].twist.twist.linear.x; + } else { + input_twist_global_stamp_nsec = std::numeric_limits::max(); + vx = 0.0; + } + + if (angular_velocity_index < angular_velocity_queue.size()) { + angular_velocity_global_stamp_nsec = + 1'000'000'000 * + static_cast(angular_velocity_queue[angular_velocity_index].header.stamp.sec) + + static_cast(angular_velocity_queue[angular_velocity_index].header.stamp.nanosec); + vtheta = angular_velocity_queue[angular_velocity_index].vector.z; + } else { + angular_velocity_global_stamp_nsec = std::numeric_limits::max(); + vtheta = 0.0; + } + + if (input_twist_global_stamp_nsec < angular_velocity_global_stamp_nsec) { + twist_stamp = input_twist_global_stamp_nsec; + twist_index++; + } else if (input_twist_global_stamp_nsec > angular_velocity_global_stamp_nsec) { + twist_stamp = angular_velocity_global_stamp_nsec; + angular_velocity_index++; + } else { + twist_index++; + angular_velocity_index++; + } + + TwistStruct2D twist; + twist.cum_x = cum_x; + twist.cum_y = cum_y; + twist.cum_theta = cum_theta; + + uint64_t twist_global_stamp_nsec = twist_stamp; + assert(twist_global_stamp_nsec > pointcloud_stamp_nsec); // by construction + uint32_t twist_from_pointcloud_start_nsec = twist_global_stamp_nsec - pointcloud_stamp_nsec; + + twist.stamp_nsec = twist_from_pointcloud_start_nsec; + twist.vx = vx; + twist.vtheta = vtheta; + twist.last_stamp_nsec = last_stamp_nsec; + host_twist_structs.push_back(twist); + + double dt_seconds = 1e-9 * (twist.stamp_nsec - last_stamp_nsec); + last_stamp_nsec = twist.stamp_nsec; + cum_theta += vtheta * dt_seconds; + float d = twist.vx * dt_seconds; + cum_x += d * cos(cum_theta); + cum_y += d * sin(cum_theta); + } + + // Copy to device + device_twist_2d_structs_ = host_twist_structs; +} + +void CudaPointcloudPreprocessor::setupTwist3DStructs( + const cuda_blackboard::CudaPointCloud2 & input_pointcloud_msg, + const std::deque & twist_queue, + const std::deque & angular_velocity_queue) +{ + const InputPointType * device_input_points = + reinterpret_cast(input_pointcloud_msg.data.get()); + InputPointType first_point; + cudaMemcpy(&first_point, &device_input_points[0], sizeof(InputPointType), cudaMemcpyDeviceToHost); + + // Twist preprocessing + + uint64_t pointcloud_stamp_nsec = 1'000'000'000 * input_pointcloud_msg.header.stamp.sec + + input_pointcloud_msg.header.stamp.nanosec; + + thrust::host_vector host_twist_structs; + + Eigen::Matrix4f cum_transform = Eigen::Matrix4f::Identity(); + Eigen::Vector3f v = Eigen::Vector3f::Zero(); + Eigen::Vector3f w = Eigen::Vector3f::Zero(); + + // All time stamps from now on are in nsec from the "beginning of the pointcloud" + uint32_t last_stamp_nsec = first_point.time_stamp; + + std::size_t twist_index = 0; + std::size_t angular_velocity_index = 0; + + for (; twist_index < twist_queue.size() || + angular_velocity_index < angular_velocity_queue.size();) { + uint64_t twist_stamp, input_twist_global_stamp_nsec, angular_velocity_global_stamp_nsec; + + if (twist_index < twist_queue.size()) { + input_twist_global_stamp_nsec = + 1'000'000'000 * static_cast(twist_queue[twist_index].header.stamp.sec) + + static_cast(twist_queue[twist_index].header.stamp.nanosec); + v.x() = twist_queue[twist_index].twist.twist.linear.x; + v.y() = twist_queue[twist_index].twist.twist.linear.y; + v.z() = twist_queue[twist_index].twist.twist.linear.z; + } else { + input_twist_global_stamp_nsec = std::numeric_limits::max(); + v = Eigen::Vector3f::Zero(); + } + + if (angular_velocity_index < angular_velocity_queue.size()) { + angular_velocity_global_stamp_nsec = + 1'000'000'000 * + static_cast(angular_velocity_queue[angular_velocity_index].header.stamp.sec) + + static_cast(angular_velocity_queue[angular_velocity_index].header.stamp.nanosec); + w.x() = angular_velocity_queue[angular_velocity_index].vector.x; + w.y() = angular_velocity_queue[angular_velocity_index].vector.y; + w.z() = angular_velocity_queue[angular_velocity_index].vector.z; + } else { + angular_velocity_global_stamp_nsec = std::numeric_limits::max(); + w = Eigen::Vector3f::Zero(); + } + + if (input_twist_global_stamp_nsec < angular_velocity_global_stamp_nsec) { + twist_stamp = input_twist_global_stamp_nsec; + twist_index++; + } else if (input_twist_global_stamp_nsec > angular_velocity_global_stamp_nsec) { + twist_stamp = angular_velocity_global_stamp_nsec; + angular_velocity_index++; + } else { + twist_index++; + angular_velocity_index++; + } + + TwistStruct3D twist; + + Eigen::Map cum_transform_buffer_map(twist.cum_transform_buffer); + Eigen::Map v_map(twist.v); + Eigen::Map w_map(twist.w); + cum_transform_buffer_map = cum_transform; + + uint64_t twist_global_stamp_nsec = twist_stamp; + assert(twist_global_stamp_nsec > pointcloud_stamp_nsec); // by construction + uint32_t twist_from_pointcloud_start_nsec = twist_global_stamp_nsec - pointcloud_stamp_nsec; + + twist.stamp_nsec = twist_from_pointcloud_start_nsec; + v_map = v; + w_map = w; + twist.last_stamp_nsec = last_stamp_nsec; + host_twist_structs.push_back(twist); + + double dt_seconds = 1e-9 * (twist.stamp_nsec - last_stamp_nsec); + last_stamp_nsec = twist.stamp_nsec; + + auto delta_transform = transformationMatrixFromVelocity(v, w, dt_seconds); + cum_transform = cum_transform * delta_transform; + } + + // Copy to device + device_twist_3d_structs_ = host_twist_structs; +} + +std::unique_ptr CudaPointcloudPreprocessor::process( + const cuda_blackboard::CudaPointCloud2 & input_pointcloud_msg, + const geometry_msgs::msg::TransformStamped & transform_msg, + const std::deque & twist_queue, + const std::deque & angular_velocity_queue) +{ + auto frame_id = input_pointcloud_msg.header.frame_id; + + auto num_input_points = input_pointcloud_msg.width * input_pointcloud_msg.height; + if ( + input_pointcloud_msg.width * input_pointcloud_msg.height > max_rings_ * max_points_per_ring_) { + max_rings_ = input_pointcloud_msg.height; + max_points_per_ring_ = input_pointcloud_msg.width; + + device_transformed_points_.resize(num_input_points); + device_self_crop_mask_.resize(num_input_points); + device_mirror_crop_mask_.resize(num_input_points); + device_ring_outlier_mask_.resize(num_input_points); + device_indices_.resize(num_input_points); + + preallocateOutput(); + } + + tf2::Quaternion rotation_quaternion( + transform_msg.transform.rotation.x, transform_msg.transform.rotation.y, + transform_msg.transform.rotation.z, transform_msg.transform.rotation.w); + tf2::Matrix3x3 rotation_matrix; + rotation_matrix.setRotation(rotation_quaternion); + + TransformStruct transform_struct; + transform_struct.x = static_cast(transform_msg.transform.translation.x); + transform_struct.y = static_cast(transform_msg.transform.translation.y); + transform_struct.z = static_cast(transform_msg.transform.translation.z); + transform_struct.m11 = static_cast(rotation_matrix[0][0]); + transform_struct.m12 = static_cast(rotation_matrix[0][1]); + transform_struct.m13 = static_cast(rotation_matrix[0][2]); + transform_struct.m21 = static_cast(rotation_matrix[1][0]); + transform_struct.m22 = static_cast(rotation_matrix[1][1]); + transform_struct.m23 = static_cast(rotation_matrix[1][2]); + transform_struct.m31 = static_cast(rotation_matrix[2][0]); + transform_struct.m32 = static_cast(rotation_matrix[2][1]); + transform_struct.m33 = static_cast(rotation_matrix[2][2]); + + // Twist preprocessing + const InputPointType * device_input_points = + reinterpret_cast(input_pointcloud_msg.data.get()); + + if (use_3d_undistortion_) { + setupTwist3DStructs(input_pointcloud_msg, twist_queue, angular_velocity_queue); + } else { + setupTwist2DStructs(input_pointcloud_msg, twist_queue, angular_velocity_queue); + } + + // Obtain raw pointers for the kernels + TwistStruct2D * device_twist_2d_structs = + thrust::raw_pointer_cast(device_twist_2d_structs_.data()); + TwistStruct3D * device_twist_3d_structs = + thrust::raw_pointer_cast(device_twist_3d_structs_.data()); + InputPointType * device_transformed_points = + thrust::raw_pointer_cast(device_transformed_points_.data()); + uint32_t * device_self_crop_mask = thrust::raw_pointer_cast(device_self_crop_mask_.data()); + uint32_t * device_mirror_crop_mask = thrust::raw_pointer_cast(device_mirror_crop_mask_.data()); + uint32_t * device_ring_outlier_mask = thrust::raw_pointer_cast(device_ring_outlier_mask_.data()); + uint32_t * device_indices = thrust::raw_pointer_cast(device_indices_.data()); + + const int threadsPerBlock = 256; + const int blocksPerGrid = (num_input_points + threadsPerBlock - 1) / threadsPerBlock; + + transformPointsKernel<<>>( + device_input_points, device_transformed_points, num_input_points, transform_struct); + + cropBoxKernel<<>>( + device_transformed_points, device_self_crop_mask, num_input_points, + self_crop_box_parameters_.min_x, self_crop_box_parameters_.min_y, + self_crop_box_parameters_.min_z, self_crop_box_parameters_.max_x, + self_crop_box_parameters_.max_y, self_crop_box_parameters_.max_z); + + cropBoxKernel<<>>( + device_transformed_points, device_mirror_crop_mask, num_input_points, + mirror_crop_box_parameters_.min_x, mirror_crop_box_parameters_.min_y, + mirror_crop_box_parameters_.min_z, mirror_crop_box_parameters_.max_x, + mirror_crop_box_parameters_.max_y, mirror_crop_box_parameters_.max_z); + + if (use_3d_undistortion_ && device_twist_3d_structs_.size() > 0) { + undistort3dKernel<<>>( + device_transformed_points, num_input_points, device_twist_3d_structs, + device_twist_3d_structs_.size()); + } else if (!use_3d_undistortion_ && device_twist_2d_structs_.size() > 0) { + undistort2dKernel<<>>( + device_transformed_points, num_input_points, device_twist_2d_structs, + device_twist_2d_structs_.size()); + } + + ringOutlierFilterKernel<<>>( + device_transformed_points, device_ring_outlier_mask, max_rings_, max_points_per_ring_, + ring_outlier_parameters_.distance_ratio, + ring_outlier_parameters_.object_length_threshold * + ring_outlier_parameters_.object_length_threshold, + ring_outlier_parameters_.num_points_threshold); + + combineMasksKernel<<>>( + device_self_crop_mask, device_mirror_crop_mask, device_ring_outlier_mask, num_input_points, + device_ring_outlier_mask); + + thrust::inclusive_scan( + thrust::device, device_ring_outlier_mask, device_ring_outlier_mask + num_input_points, + device_indices); + + uint32_t num_output_points; + cudaMemcpy( + &num_output_points, device_indices + num_input_points - 1, sizeof(uint32_t), + cudaMemcpyDeviceToHost); + + if (num_output_points > 0) { + extractInputPointsToOutputPoints_indicesKernel<<>>( + device_transformed_points, device_ring_outlier_mask, device_indices, num_input_points, + reinterpret_cast(output_pointcloud_ptr_->data.get())); + } + + // Copy the transformed points back + output_pointcloud_ptr_->row_step = num_output_points * sizeof(OutputPointType); + output_pointcloud_ptr_->width = num_output_points; + output_pointcloud_ptr_->height = 1; + + output_pointcloud_ptr_->fields = point_fields_; + output_pointcloud_ptr_->is_dense = true; + output_pointcloud_ptr_->is_bigendian = input_pointcloud_msg.is_bigendian; + output_pointcloud_ptr_->point_step = sizeof(OutputPointType); + output_pointcloud_ptr_->header.stamp = input_pointcloud_msg.header.stamp; + + return std::move(output_pointcloud_ptr_); +} + +} // namespace autoware::cuda_pointcloud_preprocessor diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp new file mode 100644 index 0000000000000..fdfae020f1e19 --- /dev/null +++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp @@ -0,0 +1,253 @@ +// Copyright 2024 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +//    http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp" + +#include "autoware/cuda_pointcloud_preprocessor/point_types.hpp" + +#include + +#include +#include + +#include +#include +#include +#include + +#include +#include +#include + +namespace autoware::cuda_pointcloud_preprocessor +{ +using sensor_msgs::msg::PointCloud2; + +CudaPointcloudPreprocessorNode::CudaPointcloudPreprocessorNode( + const rclcpp::NodeOptions & node_options) +: Node("cuda_pointcloud_preprocessor", node_options), + tf2_buffer_(this->get_clock()), + tf2_listener_(tf2_buffer_) +{ + using std::placeholders::_1; + + // Parameters + base_frame_ = declare_parameter("base_frame"); + + RingOutlierFilterParameters ring_outlier_filter_parameters; + ring_outlier_filter_parameters.distance_ratio = declare_parameter("distance_ratio"); + ring_outlier_filter_parameters.object_length_threshold = + declare_parameter("object_length_threshold"); + ring_outlier_filter_parameters.num_points_threshold = + declare_parameter("num_points_threshold"); + + CropBoxParameters self_crop_box_parameters, mirror_crop_box_parameters; + self_crop_box_parameters.min_x = declare_parameter("self_crop.min_x"); + self_crop_box_parameters.min_y = declare_parameter("self_crop.min_y"); + self_crop_box_parameters.min_z = declare_parameter("self_crop.min_z"); + self_crop_box_parameters.max_x = declare_parameter("self_crop.max_x"); + self_crop_box_parameters.max_y = declare_parameter("self_crop.max_y"); + self_crop_box_parameters.max_z = declare_parameter("self_crop.max_z"); + + mirror_crop_box_parameters.min_x = declare_parameter("mirror_crop.min_x"); + mirror_crop_box_parameters.min_y = declare_parameter("mirror_crop.min_y"); + mirror_crop_box_parameters.min_z = declare_parameter("mirror_crop.min_z"); + mirror_crop_box_parameters.max_x = declare_parameter("mirror_crop.max_x"); + mirror_crop_box_parameters.max_y = declare_parameter("mirror_crop.max_y"); + mirror_crop_box_parameters.max_z = declare_parameter("mirror_crop.max_z"); + + bool use_3d_undistortion = declare_parameter("use_3d_distortion_correction"); + bool use_imu = declare_parameter("use_imu"); + + // Subscriber + sub_ = + std::make_unique>( + *this, "~/input/pointcloud", false, + std::bind(&CudaPointcloudPreprocessorNode::pointcloudCallback, this, _1)); + + // Publisher + pub_ = + std::make_unique>( + *this, "~/output/pointcloud"); + twist_sub_ = this->create_subscription( + "~/input/twist", 10, + std::bind(&CudaPointcloudPreprocessorNode::twistCallback, this, std::placeholders::_1)); + + if (use_imu) { + imu_sub_ = this->create_subscription( + "~/input/imu", 10, + std::bind(&CudaPointcloudPreprocessorNode::imuCallback, this, std::placeholders::_1)); + } + + cuda_pointcloud_preprocessor_ = std::make_unique(); + cuda_pointcloud_preprocessor_->setRingOutlierFilterParameters(ring_outlier_filter_parameters); + cuda_pointcloud_preprocessor_->setCropBoxParameters( + self_crop_box_parameters, mirror_crop_box_parameters); + cuda_pointcloud_preprocessor_->set3DUndistortion(use_3d_undistortion); + + // initialize debug tool + { + using autoware::universe_utils::DebugPublisher; + using autoware::universe_utils::StopWatch; + + stop_watch_ptr_ = std::make_unique>(); + debug_publisher_ = std::make_unique(this, "cuda_pointcloud_preprocessor"); + stop_watch_ptr_->tic("processing_time"); + } +} + +bool CudaPointcloudPreprocessorNode::getTransform( + const std::string & target_frame, const std::string & source_frame, + tf2::Transform * tf2_transform_ptr) +{ + if (target_frame == source_frame) { + tf2_transform_ptr->setOrigin(tf2::Vector3(0.0, 0.0, 0.0)); + tf2_transform_ptr->setRotation(tf2::Quaternion(0.0, 0.0, 0.0, 1.0)); + return true; + } + + try { + const auto transform_msg = + tf2_buffer_.lookupTransform(target_frame, source_frame, tf2::TimePointZero); + tf2::convert(transform_msg.transform, *tf2_transform_ptr); + } catch (const tf2::TransformException & ex) { + RCLCPP_WARN(get_logger(), "%s", ex.what()); + RCLCPP_ERROR( + get_logger(), "Please publish TF %s to %s", target_frame.c_str(), source_frame.c_str()); + + tf2_transform_ptr->setOrigin(tf2::Vector3(0.0, 0.0, 0.0)); + tf2_transform_ptr->setRotation(tf2::Quaternion(0.0, 0.0, 0.0, 1.0)); + return false; + } + return true; +} + +void CudaPointcloudPreprocessorNode::twistCallback( + const geometry_msgs::msg::TwistWithCovarianceStamped::ConstSharedPtr twist_msg_ptr) +{ + twist_queue_.push_back(*twist_msg_ptr); + + while (!twist_queue_.empty()) { + // for replay rosbag + if ( + rclcpp::Time(twist_queue_.front().header.stamp) > rclcpp::Time(twist_msg_ptr->header.stamp)) { + twist_queue_.pop_front(); + } else if ( // NOLINT + rclcpp::Time(twist_queue_.front().header.stamp) < + rclcpp::Time(twist_msg_ptr->header.stamp) - rclcpp::Duration::from_seconds(1.0)) { + twist_queue_.pop_front(); + } + break; + } +} + +void CudaPointcloudPreprocessorNode::imuCallback( + const sensor_msgs::msg::Imu::ConstSharedPtr imu_msg) +{ + tf2::Transform tf2_imu_link_to_base_link{}; + getTransform(base_frame_, imu_msg->header.frame_id, &tf2_imu_link_to_base_link); + geometry_msgs::msg::TransformStamped::SharedPtr tf_base2imu_ptr = + std::make_shared(); + tf_base2imu_ptr->transform.rotation = tf2::toMsg(tf2_imu_link_to_base_link.getRotation()); + + geometry_msgs::msg::Vector3Stamped angular_velocity; + angular_velocity.vector = imu_msg->angular_velocity; + + geometry_msgs::msg::Vector3Stamped transformed_angular_velocity; + tf2::doTransform(angular_velocity, transformed_angular_velocity, *tf_base2imu_ptr); + transformed_angular_velocity.header = imu_msg->header; + angular_velocity_queue_.push_back(transformed_angular_velocity); + + while (!angular_velocity_queue_.empty()) { + // for rosbag replay + if ( + rclcpp::Time(angular_velocity_queue_.front().header.stamp) > + rclcpp::Time(imu_msg->header.stamp)) { + angular_velocity_queue_.pop_front(); + } else if ( // NOLINT + rclcpp::Time(angular_velocity_queue_.front().header.stamp) < + rclcpp::Time(imu_msg->header.stamp) - rclcpp::Duration::from_seconds(1.0)) { + angular_velocity_queue_.pop_front(); + } + break; + } +} + +void CudaPointcloudPreprocessorNode::pointcloudCallback( + const std::shared_ptr input_pointcloud_msg_ptr) +{ + static_assert( + sizeof(InputPointType) == sizeof(autoware::point_types::PointXYZIRCAEDT), + "PointStruct and PointXYZIRCAEDT must have the same size"); + + stop_watch_ptr_->toc("processing_time", true); + + // Make sure that the first twist is newer than the first point + InputPointType first_point; + cudaMemcpy( + &first_point, input_pointcloud_msg_ptr->data.get(), sizeof(InputPointType), + cudaMemcpyDeviceToHost); + double first_point_stamp = input_pointcloud_msg_ptr->header.stamp.sec + + input_pointcloud_msg_ptr->header.stamp.nanosec * 1e-9 + + first_point.time_stamp * 1e-9; + + while (twist_queue_.size() > 1 && + rclcpp::Time(twist_queue_.front().header.stamp).seconds() < first_point_stamp) { + twist_queue_.pop_front(); + } + + while (angular_velocity_queue_.size() > 1 && + rclcpp::Time(angular_velocity_queue_.front().header.stamp).seconds() < first_point_stamp) { + angular_velocity_queue_.pop_front(); + } + + // Obtain the base link to input pointcloud transform + geometry_msgs::msg::TransformStamped transform_msg; + + try { + transform_msg = tf2_buffer_.lookupTransform( + base_frame_, input_pointcloud_msg_ptr->header.frame_id, tf2::TimePointZero); + } catch (const tf2::TransformException & ex) { + RCLCPP_WARN(get_logger(), "%s", ex.what()); + return; + } + + auto output_pointcloud_ptr = cuda_pointcloud_preprocessor_->process( + *input_pointcloud_msg_ptr, transform_msg, twist_queue_, angular_velocity_queue_); + output_pointcloud_ptr->header.frame_id = base_frame_; + + // Publish + pub_->publish(std::move(output_pointcloud_ptr)); + + if (debug_publisher_) { + const double processing_time_ms = stop_watch_ptr_->toc("processing_time", true); + debug_publisher_->publish( + "debug/processing_time_ms", processing_time_ms); + + double now_stamp_seconds = rclcpp::Time(this->get_clock()->now()).seconds(); + double cloud_stamp_seconds = rclcpp::Time(input_pointcloud_msg_ptr->header.stamp).seconds(); + + debug_publisher_->publish( + "debug/latency_ms", 1000.f * (now_stamp_seconds - cloud_stamp_seconds)); + } + + // Preallocate for next iteration + cuda_pointcloud_preprocessor_->preallocateOutput(); +} + +} // namespace autoware::cuda_pointcloud_preprocessor + +#include "rclcpp_components/register_node_macro.hpp" +RCLCPP_COMPONENTS_REGISTER_NODE( + autoware::cuda_pointcloud_preprocessor::CudaPointcloudPreprocessorNode) From 774e0990b417ceca330fc0893738c7f43f72874d Mon Sep 17 00:00:00 2001 From: Kenzo Lobos-Tsunekawa Date: Mon, 25 Nov 2024 18:05:51 +0900 Subject: [PATCH 02/38] chore: fixed incorrect links Signed-off-by: Kenzo Lobos-Tsunekawa --- .../docs/cuda-pointcloud-preprocessor.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-pointcloud-preprocessor.md b/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-pointcloud-preprocessor.md index 459f15569d7ca..a82d49ba3d6ca 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-pointcloud-preprocessor.md +++ b/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-pointcloud-preprocessor.md @@ -11,7 +11,7 @@ In particular, this node implements: ## Inner-workings / Algorithms -As this node reimplements the functionalities of the CPU-version algorithms, please have a look at the documentations of [crop-box](/sensing/autoware_pointcloud_preprocessor/docs/crop-box-filter.md), [distortion correction](/sensing/autoware_pointcloud_preprocessor/docs/distortion-corrector.md), and [ring-based outlier filter](/sensing/autoware_pointcloud_preprocessor/docs/ring-outlier-filter.md) for more information about these algorithms. +As this node reimplements the functionalities of the CPU-version algorithms, please have a look at the documentations of [crop-box](../../autoware_pointcloud_preprocessor/docs/crop-box-filter.md), [distortion correction](../../sensing/autoware_pointcloud_preprocessor/docs/distortion-corrector.md), and [ring-based outlier filter](../../sensing/autoware_pointcloud_preprocessor/docs/ring-outlier-filter.md) for more information about these algorithms. In addition to the individual algorithms previously mentioned, this node uses the `cuda_blackboard`, a cuda transport layer that enables a zero-copy mechanism between GPU and GPU memory for both input and output. From be04f760fe40da54c03ca7b5546bf37f2012ac5d Mon Sep 17 00:00:00 2001 From: Kenzo Lobos-Tsunekawa Date: Mon, 25 Nov 2024 18:09:23 +0900 Subject: [PATCH 03/38] chore: fixed dead links pt2 Signed-off-by: Kenzo Lobos-Tsunekawa --- .../docs/cuda-pointcloud-preprocessor.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-pointcloud-preprocessor.md b/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-pointcloud-preprocessor.md index a82d49ba3d6ca..69d800794701b 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-pointcloud-preprocessor.md +++ b/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-pointcloud-preprocessor.md @@ -11,7 +11,7 @@ In particular, this node implements: ## Inner-workings / Algorithms -As this node reimplements the functionalities of the CPU-version algorithms, please have a look at the documentations of [crop-box](../../autoware_pointcloud_preprocessor/docs/crop-box-filter.md), [distortion correction](../../sensing/autoware_pointcloud_preprocessor/docs/distortion-corrector.md), and [ring-based outlier filter](../../sensing/autoware_pointcloud_preprocessor/docs/ring-outlier-filter.md) for more information about these algorithms. +As this node reimplements the functionalities of the CPU-version algorithms, please have a look at the documentations of [crop-box](../../autoware_pointcloud_preprocessor/docs/crop-box-filter.md), [distortion correction](../../autoware_pointcloud_preprocessor/docs/distortion-corrector.md), and [ring-based outlier filter](../../autoware_pointcloud_preprocessor/docs/ring-outlier-filter.md) for more information about these algorithms. In addition to the individual algorithms previously mentioned, this node uses the `cuda_blackboard`, a cuda transport layer that enables a zero-copy mechanism between GPU and GPU memory for both input and output. From db02ec78f27b2da0f94b0e06b231cc79881bd25d Mon Sep 17 00:00:00 2001 From: Kenzo Lobos-Tsunekawa Date: Mon, 25 Nov 2024 18:21:44 +0900 Subject: [PATCH 04/38] chore: fixed spelling errors Signed-off-by: Kenzo Lobos-Tsunekawa --- .../CMakeLists.txt | 2 +- .../README.md | 2 +- .../cuda_pointcloud_preprocessor.hpp | 12 ++-------- .../cuda_pointcloud_preprocessor_node.hpp | 1 - .../cuda_pointcloud_preprocessor.cu | 22 +++++++++---------- 5 files changed, 15 insertions(+), 24 deletions(-) diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/CMakeLists.txt b/sensing/autoware_cuda_pointcloud_preprocessor/CMakeLists.txt index 06b1f5d67540f..d09f787a3b277 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/CMakeLists.txt +++ b/sensing/autoware_cuda_pointcloud_preprocessor/CMakeLists.txt @@ -32,7 +32,7 @@ include_directories( ${CUDA_INCLUDE_DIRS} ) -list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr -diag-suppress 20012") +list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr -diag-suppress 20012") # cSpell: ignore expt cuda_add_library(cuda_pointcloud_preprocessor_lib SHARED src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/README.md b/sensing/autoware_cuda_pointcloud_preprocessor/README.md index d15082b69a661..25d29a678b1ce 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/README.md +++ b/sensing/autoware_cuda_pointcloud_preprocessor/README.md @@ -4,7 +4,7 @@ The pointcloud preprocessing implemented in `autoware_pointcloud_preprocessor` has been thoroughly tested in autoware. However, the latency it introduces does not scale well with modern LiDAR devices due to the high number of points they introduce. -To alleviate this issue, this package reimplements most of the pipeline presented in `autoware_pointcloud_preprocessor` leveraging the use of GPGPUs. In particular, this package makes use of CUDA to provide accelerated versions of the already establishes implementations, while also maintaining compatibility with normal ROS nodes/topics. +To alleviate this issue, this package reimplements most of the pipeline presented in `autoware_pointcloud_preprocessor` leveraging the use of GPGPUs. In particular, this package makes use of CUDA to provide accelerated versions of the already establishes implementations, while also maintaining compatibility with normal ROS nodes/topics.