diff --git a/perception/image_projection_based_fusion/CMakeLists.txt b/perception/image_projection_based_fusion/CMakeLists.txt index 0f4cb7112f74a..b6dfbaf402cc8 100644 --- a/perception/image_projection_based_fusion/CMakeLists.txt +++ b/perception/image_projection_based_fusion/CMakeLists.txt @@ -127,6 +127,7 @@ if(TRT_AVAIL AND CUDA_AVAIL AND CUDNN_AVAIL) ament_auto_add_library(pointpainting_lib SHARED src/pointpainting_fusion/node.cpp + src/pointpainting_fusion/pointcloud_densification.cpp src/pointpainting_fusion/pointpainting_trt.cpp src/pointpainting_fusion/voxel_generator.cpp ) diff --git a/perception/image_projection_based_fusion/config/pointpainting.param.yaml b/perception/image_projection_based_fusion/config/pointpainting.param.yaml index 30136bc8f47d0..9227e4fa9319a 100644 --- a/perception/image_projection_based_fusion/config/pointpainting.param.yaml +++ b/perception/image_projection_based_fusion/config/pointpainting.param.yaml @@ -5,6 +5,7 @@ head_onnx_path: "$(var model_path)/pts_backbone_neck_head_$(var model_name).onnx" head_engine_path: "$(var model_path)/pts_backbone_neck_head_$(var model_name).engine" trt_precision: fp16 + cloud_capacity: 2000000 post_process_params: # post-process params circle_nms_dist_threshold: 0.3 diff --git a/perception/image_projection_based_fusion/include/image_projection_based_fusion/pointpainting_fusion/pointcloud_densification.hpp b/perception/image_projection_based_fusion/include/image_projection_based_fusion/pointpainting_fusion/pointcloud_densification.hpp new file mode 100644 index 0000000000000..03609eb18e689 --- /dev/null +++ b/perception/image_projection_based_fusion/include/image_projection_based_fusion/pointpainting_fusion/pointcloud_densification.hpp @@ -0,0 +1,72 @@ +// Copyright 2024 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
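+//
+// Declares a point cloud densification cache for the PointPainting fusion node:
+// recent point clouds are stored together with their past-to-world transforms so
+// that the voxel generator can accumulate several sweeps into a single detector input.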
+ +#ifndef IMAGE_PROJECTION_BASED_FUSION__POINTPAINTING_FUSION__POINTCLOUD_DENSIFICATION_HPP_ +#define IMAGE_PROJECTION_BASED_FUSION__POINTPAINTING_FUSION__POINTCLOUD_DENSIFICATION_HPP_ + +#include +#include +#ifdef ROS_DISTRO_GALACTIC +#include +#else +#include +#endif + +#include + +#include +#include +#include + +namespace image_projection_based_fusion +{ +struct PointCloudWithTransform +{ + sensor_msgs::msg::PointCloud2 pointcloud_msg; + Eigen::Affine3f affine_past2world; +}; + +class PointCloudDensification +{ +public: + explicit PointCloudDensification(const centerpoint::DensificationParam & param); + + bool enqueuePointCloud( + const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const tf2_ros::Buffer & tf_buffer); + + double getCurrentTimestamp() const { return current_timestamp_; } + Eigen::Affine3f getAffineWorldToCurrent() const { return affine_world2current_; } + std::list::iterator getPointCloudCacheIter() + { + return pointcloud_cache_.begin(); + } + bool isCacheEnd(std::list::iterator iter) + { + return iter == pointcloud_cache_.end(); + } + unsigned int pointcloud_cache_size() const { return param_.pointcloud_cache_size(); } + +private: + void enqueue(const sensor_msgs::msg::PointCloud2 & msg, const Eigen::Affine3f & affine); + void dequeue(); + + centerpoint::DensificationParam param_; + double current_timestamp_{0.0}; + Eigen::Affine3f affine_world2current_; + std::list pointcloud_cache_; +}; + +} // namespace image_projection_based_fusion + +#endif // IMAGE_PROJECTION_BASED_FUSION__POINTPAINTING_FUSION__POINTCLOUD_DENSIFICATION_HPP_ diff --git a/perception/image_projection_based_fusion/include/image_projection_based_fusion/pointpainting_fusion/pointpainting_trt.hpp b/perception/image_projection_based_fusion/include/image_projection_based_fusion/pointpainting_fusion/pointpainting_trt.hpp index d5892de6d7edb..c8318f79bbeed 100644 --- a/perception/image_projection_based_fusion/include/image_projection_based_fusion/pointpainting_fusion/pointpainting_trt.hpp +++ b/perception/image_projection_based_fusion/include/image_projection_based_fusion/pointpainting_fusion/pointpainting_trt.hpp @@ -23,7 +23,6 @@ namespace image_projection_based_fusion { -static constexpr size_t CAPACITY_POINT = 2000000; class PointPaintingTRT : public centerpoint::CenterPointTRT { public: diff --git a/perception/image_projection_based_fusion/include/image_projection_based_fusion/pointpainting_fusion/voxel_generator.hpp b/perception/image_projection_based_fusion/include/image_projection_based_fusion/pointpainting_fusion/voxel_generator.hpp old mode 100755 new mode 100644 index d0f44b9d58706..4cdca8e49ac7e --- a/perception/image_projection_based_fusion/include/image_projection_based_fusion/pointpainting_fusion/voxel_generator.hpp +++ b/perception/image_projection_based_fusion/include/image_projection_based_fusion/pointpainting_fusion/voxel_generator.hpp @@ -15,19 +15,35 @@ #ifndef IMAGE_PROJECTION_BASED_FUSION__POINTPAINTING_FUSION__VOXEL_GENERATOR_HPP_ #define IMAGE_PROJECTION_BASED_FUSION__POINTPAINTING_FUSION__VOXEL_GENERATOR_HPP_ +#include +#include #include #include +#include #include namespace image_projection_based_fusion { -class VoxelGenerator : public centerpoint::VoxelGenerator + +class VoxelGenerator { public: - using centerpoint::VoxelGenerator::VoxelGenerator; + explicit VoxelGenerator( + const centerpoint::DensificationParam & param, const centerpoint::CenterPointConfig & config); + + bool enqueuePointCloud( + const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const 
tf2_ros::Buffer & tf_buffer); + + std::size_t generateSweepPoints(std::vector & points); + +protected: + std::unique_ptr pd_ptr_{nullptr}; - std::size_t generateSweepPoints(std::vector & points) override; + centerpoint::CenterPointConfig config_; + std::array range_; + std::array grid_size_; + std::array recip_voxel_size_; }; } // namespace image_projection_based_fusion diff --git a/perception/image_projection_based_fusion/src/pointpainting_fusion/node.cpp b/perception/image_projection_based_fusion/src/pointpainting_fusion/node.cpp index 3fb2aca2171f7..4df660e65a8c1 100644 --- a/perception/image_projection_based_fusion/src/pointpainting_fusion/node.cpp +++ b/perception/image_projection_based_fusion/src/pointpainting_fusion/node.cpp @@ -110,6 +110,7 @@ PointPaintingFusionNode::PointPaintingFusionNode(const rclcpp::NodeOptions & opt this->declare_parameter("densification_params.num_past_frames"); // network param const std::string trt_precision = this->declare_parameter("trt_precision"); + const std::size_t cloud_capacity = this->declare_parameter("cloud_capacity"); const std::string encoder_onnx_path = this->declare_parameter("encoder_onnx_path"); const std::string encoder_engine_path = this->declare_parameter("encoder_engine_path"); @@ -188,9 +189,9 @@ PointPaintingFusionNode::PointPaintingFusionNode(const rclcpp::NodeOptions & opt centerpoint::DensificationParam densification_param( densification_world_frame_id, densification_num_past_frames); centerpoint::CenterPointConfig config( - class_names_.size(), point_feature_size, max_voxel_size, pointcloud_range, voxel_size, - downsample_factor, encoder_in_feature_size, score_threshold, circle_nms_dist_threshold, - yaw_norm_thresholds, has_variance_); + class_names_.size(), point_feature_size, cloud_capacity, max_voxel_size, pointcloud_range, + voxel_size, downsample_factor, encoder_in_feature_size, score_threshold, + circle_nms_dist_threshold, yaw_norm_thresholds, has_variance_); // create detector detector_ptr_ = std::make_unique( diff --git a/perception/image_projection_based_fusion/src/pointpainting_fusion/pointcloud_densification.cpp b/perception/image_projection_based_fusion/src/pointpainting_fusion/pointcloud_densification.cpp new file mode 100644 index 0000000000000..9edf6a73cd59b --- /dev/null +++ b/perception/image_projection_based_fusion/src/pointpainting_fusion/pointcloud_densification.cpp @@ -0,0 +1,103 @@ +// Copyright 2024 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "image_projection_based_fusion/pointpainting_fusion/pointcloud_densification.hpp" + +#include + +#include + +#include +#ifdef ROS_DISTRO_GALACTIC +#include +#else +#include +#endif + +#include +#include + +namespace +{ +boost::optional getTransform( + const tf2_ros::Buffer & tf_buffer, const std::string & target_frame_id, + const std::string & source_frame_id, const rclcpp::Time & time) +{ + try { + geometry_msgs::msg::TransformStamped transform_stamped; + transform_stamped = tf_buffer.lookupTransform( + target_frame_id, source_frame_id, time, rclcpp::Duration::from_seconds(0.5)); + return transform_stamped.transform; + } catch (tf2::TransformException & ex) { + RCLCPP_WARN_STREAM(rclcpp::get_logger("lidar_centerpoint"), ex.what()); + return boost::none; + } +} + +Eigen::Affine3f transformToEigen(const geometry_msgs::msg::Transform & t) +{ + Eigen::Affine3f a; + a.matrix() = tf2::transformToEigen(t).matrix().cast(); + return a; +} + +} // namespace + +namespace image_projection_based_fusion +{ +PointCloudDensification::PointCloudDensification(const centerpoint::DensificationParam & param) +: param_(param) +{ +} + +bool PointCloudDensification::enqueuePointCloud( + const sensor_msgs::msg::PointCloud2 & pointcloud_msg, const tf2_ros::Buffer & tf_buffer) +{ + const auto header = pointcloud_msg.header; + + if (param_.pointcloud_cache_size() > 1) { + auto transform_world2current = + getTransform(tf_buffer, header.frame_id, param_.world_frame_id(), header.stamp); + if (!transform_world2current) { + return false; + } + auto affine_world2current = transformToEigen(transform_world2current.get()); + + enqueue(pointcloud_msg, affine_world2current); + } else { + enqueue(pointcloud_msg, Eigen::Affine3f::Identity()); + } + + dequeue(); + + return true; +} + +void PointCloudDensification::enqueue( + const sensor_msgs::msg::PointCloud2 & msg, const Eigen::Affine3f & affine_world2current) +{ + affine_world2current_ = affine_world2current; + current_timestamp_ = rclcpp::Time(msg.header.stamp).seconds(); + PointCloudWithTransform pointcloud = {msg, affine_world2current.inverse()}; + pointcloud_cache_.push_front(pointcloud); +} + +void PointCloudDensification::dequeue() +{ + if (pointcloud_cache_.size() > param_.pointcloud_cache_size()) { + pointcloud_cache_.pop_back(); + } +} + +} // namespace image_projection_based_fusion diff --git a/perception/image_projection_based_fusion/src/pointpainting_fusion/voxel_generator.cpp b/perception/image_projection_based_fusion/src/pointpainting_fusion/voxel_generator.cpp index ea10cb1237436..cb3fc33d3e022 100755 --- a/perception/image_projection_based_fusion/src/pointpainting_fusion/voxel_generator.cpp +++ b/perception/image_projection_based_fusion/src/pointpainting_fusion/voxel_generator.cpp @@ -18,6 +18,32 @@ namespace image_projection_based_fusion { + +VoxelGenerator::VoxelGenerator( + const centerpoint::DensificationParam & param, const centerpoint::CenterPointConfig & config) +: config_(config) +{ + pd_ptr_ = std::make_unique(param); + range_[0] = config.range_min_x_; + range_[1] = config.range_min_y_; + range_[2] = config.range_min_z_; + range_[3] = config.range_max_x_; + range_[4] = config.range_max_y_; + range_[5] = config.range_max_z_; + grid_size_[0] = config.grid_size_x_; + grid_size_[1] = config.grid_size_y_; + grid_size_[2] = config.grid_size_z_; + recip_voxel_size_[0] = 1 / config.voxel_size_x_; + recip_voxel_size_[1] = 1 / config.voxel_size_y_; + recip_voxel_size_[2] = 1 / config.voxel_size_z_; +} + +bool VoxelGenerator::enqueuePointCloud( + 
const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const tf2_ros::Buffer & tf_buffer) +{ + return pd_ptr_->enqueuePointCloud(input_pointcloud_msg, tf_buffer); +} + size_t VoxelGenerator::generateSweepPoints(std::vector & points) { Eigen::Vector3f point_current, point_past; diff --git a/perception/lidar_centerpoint/README.md b/perception/lidar_centerpoint/README.md index fa42655b6ca1f..d7e81f7add6ea 100644 --- a/perception/lidar_centerpoint/README.md +++ b/perception/lidar_centerpoint/README.md @@ -28,6 +28,22 @@ We trained the models using . ## Parameters +### ML Model Parameters + +Note that these parameters are associated with ONNX file, predefined during the training phase. Be careful to change ONNX file as well when changing this parameter. Also, whenever you update the ONNX file, do NOT forget to check these values. + +| Name | Type | Default Value | Description | +| -------------------------------------- | ------------ | ------------------------------------------------ | --------------------------------------------------------------------- | +| `model_params.class_names` | list[string] | ["CAR", "TRUCK", "BUS", "BICYCLE", "PEDESTRIAN"] | list of class names for model outputs | +| `model_params.point_feature_size` | int | `4` | number of features per point in the point cloud | +| `model_params.max_voxel_size` | int | `40000` | maximum number of voxels | +| `model_params.point_cloud_range` | list[double] | [-76.8, -76.8, -4.0, 76.8, 76.8, 6.0] | detection range [min_x, min_y, min_z, max_x, max_y, max_z] [m] | +| `model_params.voxel_size` | list[double] | [0.32, 0.32, 10.0] | size of each voxel [x, y, z] [m] | +| `model_params.downsample_factor` | int | `1` | downsample factor for coordinates | +| `model_params.encoder_in_feature_size` | int | `9` | number of input features to the encoder | +| `model_params.has_variance` | bool | `false` | true if the model outputs pose variance as well as pose for each bbox | +| `model_params.has_twist` | bool | `false` | true if the model outputs velocity as well as pose for each bbox | + ### Core Parameters | Name | Type | Default Value | Description | diff --git a/perception/lidar_centerpoint/config/centerpoint.param.yaml b/perception/lidar_centerpoint/config/centerpoint.param.yaml index 17ed9a447cdb8..53d77e8d1c42c 100644 --- a/perception/lidar_centerpoint/config/centerpoint.param.yaml +++ b/perception/lidar_centerpoint/config/centerpoint.param.yaml @@ -6,6 +6,7 @@ head_onnx_path: "$(var model_path)/pts_backbone_neck_head_$(var model_name).onnx" head_engine_path: "$(var model_path)/pts_backbone_neck_head_$(var model_name).engine" trt_precision: fp16 + cloud_capacity: 2000000 post_process_params: # post-process params circle_nms_dist_threshold: 0.5 diff --git a/perception/lidar_centerpoint/config/centerpoint_sigma.param.yaml b/perception/lidar_centerpoint/config/centerpoint_sigma.param.yaml new file mode 100644 index 0000000000000..bd5fc5f3567a5 --- /dev/null +++ b/perception/lidar_centerpoint/config/centerpoint_sigma.param.yaml @@ -0,0 +1,21 @@ +/**: + ros__parameters: + + # weight files + encoder_onnx_path: "$(var model_path)/pts_voxel_encoder_$(var model_name).onnx" + encoder_engine_path: "$(var model_path)/pts_voxel_encoder_$(var model_name).engine" + head_onnx_path: "$(var model_path)/pts_backbone_neck_head_$(var model_name).onnx" + head_engine_path: "$(var model_path)/pts_backbone_neck_head_$(var model_name).engine" + trt_precision: fp16 + cloud_capacity: 2000000 + post_process_params: + # post-process params + 
circle_nms_dist_threshold: 0.5 + iou_nms_target_class_names: ["CAR"] + iou_nms_search_distance_2d: 10.0 + iou_nms_threshold: 0.1 + yaw_norm_thresholds: [0.3, 0.3, 0.3, 0.3, 0.0] + score_threshold: 0.35 + densification_params: + world_frame_id: map + num_past_frames: 1 diff --git a/perception/lidar_centerpoint/config/centerpoint_tiny.param.yaml b/perception/lidar_centerpoint/config/centerpoint_tiny.param.yaml index 17ed9a447cdb8..53d77e8d1c42c 100644 --- a/perception/lidar_centerpoint/config/centerpoint_tiny.param.yaml +++ b/perception/lidar_centerpoint/config/centerpoint_tiny.param.yaml @@ -6,6 +6,7 @@ head_onnx_path: "$(var model_path)/pts_backbone_neck_head_$(var model_name).onnx" head_engine_path: "$(var model_path)/pts_backbone_neck_head_$(var model_name).engine" trt_precision: fp16 + cloud_capacity: 2000000 post_process_params: # post-process params circle_nms_dist_threshold: 0.5 diff --git a/perception/lidar_centerpoint/include/lidar_centerpoint/centerpoint_config.hpp b/perception/lidar_centerpoint/include/lidar_centerpoint/centerpoint_config.hpp index 19fdbe7a8b9c2..7b560cf46e2e3 100644 --- a/perception/lidar_centerpoint/include/lidar_centerpoint/centerpoint_config.hpp +++ b/perception/lidar_centerpoint/include/lidar_centerpoint/centerpoint_config.hpp @@ -24,14 +24,16 @@ class CenterPointConfig { public: explicit CenterPointConfig( - const std::size_t class_size, const float point_feature_size, const std::size_t max_voxel_size, - const std::vector & point_cloud_range, const std::vector & voxel_size, - const std::size_t downsample_factor, const std::size_t encoder_in_feature_size, - const float score_threshold, const float circle_nms_dist_threshold, - const std::vector yaw_norm_thresholds, const bool has_variance) + const std::size_t class_size, const float point_feature_size, const std::size_t cloud_capacity, + const std::size_t max_voxel_size, const std::vector & point_cloud_range, + const std::vector & voxel_size, const std::size_t downsample_factor, + const std::size_t encoder_in_feature_size, const float score_threshold, + const float circle_nms_dist_threshold, const std::vector yaw_norm_thresholds, + const bool has_variance) { class_size_ = class_size; point_feature_size_ = point_feature_size; + cloud_capacity_ = cloud_capacity; max_voxel_size_ = max_voxel_size; if (point_cloud_range.size() == 6) { range_min_x_ = static_cast(point_cloud_range[0]); @@ -85,6 +87,7 @@ class CenterPointConfig }; // input params + std::size_t cloud_capacity_{}; std::size_t class_size_{3}; const std::size_t point_dim_size_{3}; // x, y and z std::size_t point_feature_size_{4}; // x, y, z and time-lag diff --git a/perception/lidar_centerpoint/include/lidar_centerpoint/centerpoint_trt.hpp b/perception/lidar_centerpoint/include/lidar_centerpoint/centerpoint_trt.hpp index 383e8ef719d4d..cbbf01c1e6405 100644 --- a/perception/lidar_centerpoint/include/lidar_centerpoint/centerpoint_trt.hpp +++ b/perception/lidar_centerpoint/include/lidar_centerpoint/centerpoint_trt.hpp @@ -15,15 +15,14 @@ #ifndef LIDAR_CENTERPOINT__CENTERPOINT_TRT_HPP_ #define LIDAR_CENTERPOINT__CENTERPOINT_TRT_HPP_ -#include -#include -#include -#include +#include "lidar_centerpoint/cuda_utils.hpp" +#include "lidar_centerpoint/network/network_trt.hpp" +#include "lidar_centerpoint/postprocess/postprocess_kernel.hpp" +#include "lidar_centerpoint/preprocess/voxel_generator.hpp" +#include "pcl/point_cloud.h" +#include "pcl/point_types.h" -#include - -#include -#include +#include "sensor_msgs/msg/point_cloud2.hpp" #include #include @@ -32,8 
+31,6 @@ namespace centerpoint { -static constexpr size_t CAPACITY_POINT = 2000000; - class NetworkParam { public: @@ -105,9 +102,11 @@ class CenterPointTRT cuda::unique_ptr head_out_rot_d_{nullptr}; cuda::unique_ptr head_out_vel_d_{nullptr}; cuda::unique_ptr points_d_{nullptr}; + cuda::unique_ptr points_aux_d_{nullptr}; cuda::unique_ptr voxels_buffer_d_{nullptr}; cuda::unique_ptr mask_d_{nullptr}; cuda::unique_ptr num_voxels_d_{nullptr}; + cuda::unique_ptr shuffle_indices_d_{nullptr}; }; } // namespace centerpoint diff --git a/perception/lidar_centerpoint/include/lidar_centerpoint/cuda_utils.hpp b/perception/lidar_centerpoint/include/lidar_centerpoint/cuda_utils.hpp index 5ab26d75a0a41..494d499dc2924 100644 --- a/perception/lidar_centerpoint/include/lidar_centerpoint/cuda_utils.hpp +++ b/perception/lidar_centerpoint/include/lidar_centerpoint/cuda_utils.hpp @@ -42,7 +42,7 @@ #ifndef LIDAR_CENTERPOINT__CUDA_UTILS_HPP_ #define LIDAR_CENTERPOINT__CUDA_UTILS_HPP_ -#include +#include "cuda_runtime_api.h" #include #include @@ -89,13 +89,13 @@ cuda::unique_ptr make_unique() return cuda::unique_ptr{p}; } -constexpr size_t CUDA_ALIGN = 256; +constexpr std::size_t CUDA_ALIGN = 256; template -inline size_t get_size_aligned(size_t num_elem) +inline std::size_t get_size_aligned(std::size_t num_elem) { - size_t size = num_elem * sizeof(T); - size_t extra_align = 0; + std::size_t size = num_elem * sizeof(T); + std::size_t extra_align = 0; if (size % CUDA_ALIGN != 0) { extra_align = CUDA_ALIGN - size % CUDA_ALIGN; } @@ -103,9 +103,9 @@ inline size_t get_size_aligned(size_t num_elem) } template -inline T * get_next_ptr(size_t num_elem, void *& workspace, size_t & workspace_size) +inline T * get_next_ptr(std::size_t num_elem, void *& workspace, std::size_t & workspace_size) { - size_t size = get_size_aligned(num_elem); + std::size_t size = get_size_aligned(num_elem); if (size > workspace_size) { throw ::std::runtime_error("Workspace is too small!"); } diff --git a/perception/lidar_centerpoint/include/lidar_centerpoint/detection_class_remapper.hpp b/perception/lidar_centerpoint/include/lidar_centerpoint/detection_class_remapper.hpp index cd1a0e58e115e..d961dd998af76 100644 --- a/perception/lidar_centerpoint/include/lidar_centerpoint/detection_class_remapper.hpp +++ b/perception/lidar_centerpoint/include/lidar_centerpoint/detection_class_remapper.hpp @@ -17,10 +17,10 @@ #include -#include -#include -#include -#include +#include "autoware_perception_msgs/msg/detected_object_kinematics.hpp" +#include "autoware_perception_msgs/msg/detected_objects.hpp" +#include "autoware_perception_msgs/msg/object_classification.hpp" +#include "autoware_perception_msgs/msg/shape.hpp" #include diff --git a/perception/lidar_centerpoint/include/lidar_centerpoint/network/network_trt.hpp b/perception/lidar_centerpoint/include/lidar_centerpoint/network/network_trt.hpp index 639aa0ba8bad3..1bf77248efe08 100644 --- a/perception/lidar_centerpoint/include/lidar_centerpoint/network/network_trt.hpp +++ b/perception/lidar_centerpoint/include/lidar_centerpoint/network/network_trt.hpp @@ -15,8 +15,8 @@ #ifndef LIDAR_CENTERPOINT__NETWORK__NETWORK_TRT_HPP_ #define LIDAR_CENTERPOINT__NETWORK__NETWORK_TRT_HPP_ -#include -#include +#include "lidar_centerpoint/centerpoint_config.hpp" +#include "lidar_centerpoint/network/tensorrt_wrapper.hpp" #include diff --git a/perception/lidar_centerpoint/include/lidar_centerpoint/network/scatter_kernel.hpp b/perception/lidar_centerpoint/include/lidar_centerpoint/network/scatter_kernel.hpp index 
3dd00c25fd9e7..ad5e222ce01e8 100644 --- a/perception/lidar_centerpoint/include/lidar_centerpoint/network/scatter_kernel.hpp +++ b/perception/lidar_centerpoint/include/lidar_centerpoint/network/scatter_kernel.hpp @@ -15,8 +15,8 @@ #ifndef LIDAR_CENTERPOINT__NETWORK__SCATTER_KERNEL_HPP_ #define LIDAR_CENTERPOINT__NETWORK__SCATTER_KERNEL_HPP_ -#include -#include +#include "cuda.h" +#include "cuda_runtime_api.h" namespace centerpoint { diff --git a/perception/lidar_centerpoint/include/lidar_centerpoint/network/tensorrt_wrapper.hpp b/perception/lidar_centerpoint/include/lidar_centerpoint/network/tensorrt_wrapper.hpp index 7f55ab6f5e414..7415dc192d7ab 100644 --- a/perception/lidar_centerpoint/include/lidar_centerpoint/network/tensorrt_wrapper.hpp +++ b/perception/lidar_centerpoint/include/lidar_centerpoint/network/tensorrt_wrapper.hpp @@ -15,10 +15,9 @@ #ifndef LIDAR_CENTERPOINT__NETWORK__TENSORRT_WRAPPER_HPP_ #define LIDAR_CENTERPOINT__NETWORK__TENSORRT_WRAPPER_HPP_ -#include -#include - -#include +#include "NvInfer.h" +#include "lidar_centerpoint/centerpoint_config.hpp" +#include "tensorrt_common/tensorrt_common.hpp" #include #include @@ -32,7 +31,7 @@ class TensorRTWrapper public: explicit TensorRTWrapper(const CenterPointConfig & config); - ~TensorRTWrapper(); + virtual ~TensorRTWrapper(); bool init( const std::string & onnx_path, const std::string & engine_path, const std::string & precision); @@ -50,7 +49,7 @@ class TensorRTWrapper private: bool parseONNX( const std::string & onnx_path, const std::string & engine_path, const std::string & precision, - size_t workspace_size = (1ULL << 30)); + std::size_t workspace_size = (1ULL << 30)); bool saveEngine(const std::string & engine_path); diff --git a/perception/lidar_centerpoint/include/lidar_centerpoint/node.hpp b/perception/lidar_centerpoint/include/lidar_centerpoint/node.hpp index 4ccfa908e12d7..ddc1a072b76d0 100644 --- a/perception/lidar_centerpoint/include/lidar_centerpoint/node.hpp +++ b/perception/lidar_centerpoint/include/lidar_centerpoint/node.hpp @@ -15,13 +15,13 @@ #ifndef LIDAR_CENTERPOINT__NODE_HPP_ #define LIDAR_CENTERPOINT__NODE_HPP_ +#include "lidar_centerpoint/centerpoint_trt.hpp" +#include "lidar_centerpoint/detection_class_remapper.hpp" #include "lidar_centerpoint/postprocess/non_maximum_suppression.hpp" #include #include #include -#include -#include #include #include @@ -51,7 +51,6 @@ class LidarCenterPointNode : public rclcpp::Node rclcpp::Subscription::SharedPtr pointcloud_sub_; rclcpp::Publisher::SharedPtr objects_pub_; - float score_threshold_{0.0}; std::vector class_names_; bool has_variance_{false}; bool has_twist_{false}; diff --git a/perception/lidar_centerpoint/include/lidar_centerpoint/postprocess/circle_nms_kernel.hpp b/perception/lidar_centerpoint/include/lidar_centerpoint/postprocess/circle_nms_kernel.hpp index cd2d71d33806d..5b55636a3eec4 100644 --- a/perception/lidar_centerpoint/include/lidar_centerpoint/postprocess/circle_nms_kernel.hpp +++ b/perception/lidar_centerpoint/include/lidar_centerpoint/postprocess/circle_nms_kernel.hpp @@ -15,9 +15,8 @@ #ifndef LIDAR_CENTERPOINT__POSTPROCESS__CIRCLE_NMS_KERNEL_HPP_ #define LIDAR_CENTERPOINT__POSTPROCESS__CIRCLE_NMS_KERNEL_HPP_ -#include - -#include +#include "lidar_centerpoint/utils.hpp" +#include "thrust/device_vector.h" namespace centerpoint { diff --git a/perception/lidar_centerpoint/include/lidar_centerpoint/postprocess/postprocess_kernel.hpp b/perception/lidar_centerpoint/include/lidar_centerpoint/postprocess/postprocess_kernel.hpp index 
d790e896dea9a..58431b48eb917 100644 --- a/perception/lidar_centerpoint/include/lidar_centerpoint/postprocess/postprocess_kernel.hpp +++ b/perception/lidar_centerpoint/include/lidar_centerpoint/postprocess/postprocess_kernel.hpp @@ -15,12 +15,10 @@ #ifndef LIDAR_CENTERPOINT__POSTPROCESS__POSTPROCESS_KERNEL_HPP_ #define LIDAR_CENTERPOINT__POSTPROCESS__POSTPROCESS_KERNEL_HPP_ -#include -#include - -#include -#include -#include +#include "cuda.h" +#include "cuda_runtime_api.h" +#include "lidar_centerpoint/centerpoint_config.hpp" +#include "lidar_centerpoint/utils.hpp" #include @@ -38,8 +36,6 @@ class PostProcessCUDA private: CenterPointConfig config_; - thrust::device_vector boxes3d_d_; - thrust::device_vector yaw_norm_thresholds_d_; }; } // namespace centerpoint diff --git a/perception/lidar_centerpoint/include/lidar_centerpoint/preprocess/pointcloud_densification.hpp b/perception/lidar_centerpoint/include/lidar_centerpoint/preprocess/pointcloud_densification.hpp index 209fc74e6c58f..5b7b3beb343cc 100644 --- a/perception/lidar_centerpoint/include/lidar_centerpoint/preprocess/pointcloud_densification.hpp +++ b/perception/lidar_centerpoint/include/lidar_centerpoint/preprocess/pointcloud_densification.hpp @@ -15,17 +15,19 @@ #ifndef LIDAR_CENTERPOINT__PREPROCESS__POINTCLOUD_DENSIFICATION_HPP_ #define LIDAR_CENTERPOINT__PREPROCESS__POINTCLOUD_DENSIFICATION_HPP_ -#include -#include -#ifdef ROS_DISTRO_GALACTIC -#include -#else -#include -#endif +#include "tf2_ros/buffer.h" +#include "tf2_ros/transform_listener.h" #include #include #include +#ifdef ROS_DISTRO_GALACTIC +#include "tf2_sensor_msgs/tf2_sensor_msgs.h" +#else +#include "tf2_sensor_msgs/tf2_sensor_msgs.hpp" +#endif + +#include "lidar_centerpoint/cuda_utils.hpp" namespace centerpoint { @@ -48,7 +50,10 @@ class DensificationParam struct PointCloudWithTransform { - sensor_msgs::msg::PointCloud2 pointcloud_msg; + cuda::unique_ptr points_d{nullptr}; + std_msgs::msg::Header header; + std::size_t num_points{0}; + std::size_t point_step{0}; Eigen::Affine3f affine_past2world; }; @@ -58,7 +63,8 @@ class PointCloudDensification explicit PointCloudDensification(const DensificationParam & param); bool enqueuePointCloud( - const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const tf2_ros::Buffer & tf_buffer); + const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const tf2_ros::Buffer & tf_buffer, + cudaStream_t stream); double getCurrentTimestamp() const { return current_timestamp_; } Eigen::Affine3f getAffineWorldToCurrent() const { return affine_world2current_; } @@ -73,7 +79,8 @@ class PointCloudDensification unsigned int pointcloud_cache_size() const { return param_.pointcloud_cache_size(); } private: - void enqueue(const sensor_msgs::msg::PointCloud2 & msg, const Eigen::Affine3f & affine); + void enqueue( + const sensor_msgs::msg::PointCloud2 & msg, const Eigen::Affine3f & affine, cudaStream_t stream); void dequeue(); DensificationParam param_; diff --git a/perception/lidar_centerpoint/include/lidar_centerpoint/preprocess/preprocess_kernel.hpp b/perception/lidar_centerpoint/include/lidar_centerpoint/preprocess/preprocess_kernel.hpp index 9488b67475509..62495d655723e 100644 --- a/perception/lidar_centerpoint/include/lidar_centerpoint/preprocess/preprocess_kernel.hpp +++ b/perception/lidar_centerpoint/include/lidar_centerpoint/preprocess/preprocess_kernel.hpp @@ -15,17 +15,26 @@ #ifndef LIDAR_CENTERPOINT__PREPROCESS__PREPROCESS_KERNEL_HPP_ #define LIDAR_CENTERPOINT__PREPROCESS__PREPROCESS_KERNEL_HPP_ -#include -#include +#include 
"cuda.h" +#include "cuda_runtime_api.h" namespace centerpoint { -cudaError_t generateVoxels_random_launch( - const float * points, size_t points_size, float min_x_range, float max_x_range, float min_y_range, - float max_y_range, float min_z_range, float max_z_range, float pillar_x_size, float pillar_y_size, - float pillar_z_size, int grid_y_size, int grid_x_size, unsigned int * mask, float * voxels, +cudaError_t generateSweepPoints_launch( + const float * input_points, std::size_t points_size, int input_point_step, float time_lag, + const float * transform, int num_features, float * output_points, cudaStream_t stream); + +cudaError_t shufflePoints_launch( + const float * points, const unsigned int * indices, float * shuffled_points, + const std::size_t points_size, const std::size_t max_size, const std::size_t offset, cudaStream_t stream); +cudaError_t generateVoxels_random_launch( + const float * points, std::size_t points_size, float min_x_range, float max_x_range, + float min_y_range, float max_y_range, float min_z_range, float max_z_range, float pillar_x_size, + float pillar_y_size, float pillar_z_size, int grid_y_size, int grid_x_size, unsigned int * mask, + float * voxels, cudaStream_t stream); + cudaError_t generateBaseFeatures_launch( unsigned int * mask, float * voxels, int grid_y_size, int grid_x_size, int max_voxel_size, unsigned int * pillar_num, float * voxel_features, float * voxel_num, int * voxel_idxs, diff --git a/perception/lidar_centerpoint/include/lidar_centerpoint/preprocess/voxel_generator.hpp b/perception/lidar_centerpoint/include/lidar_centerpoint/preprocess/voxel_generator.hpp index be15d51e91715..3ade5e677cdbe 100644 --- a/perception/lidar_centerpoint/include/lidar_centerpoint/preprocess/voxel_generator.hpp +++ b/perception/lidar_centerpoint/include/lidar_centerpoint/preprocess/voxel_generator.hpp @@ -15,10 +15,10 @@ #ifndef LIDAR_CENTERPOINT__PREPROCESS__VOXEL_GENERATOR_HPP_ #define LIDAR_CENTERPOINT__PREPROCESS__VOXEL_GENERATOR_HPP_ -#include -#include +#include "lidar_centerpoint/centerpoint_config.hpp" +#include "lidar_centerpoint/preprocess/pointcloud_densification.hpp" -#include +#include "sensor_msgs/msg/point_cloud2.hpp" #include #include @@ -31,10 +31,11 @@ class VoxelGeneratorTemplate explicit VoxelGeneratorTemplate( const DensificationParam & param, const CenterPointConfig & config); - virtual std::size_t generateSweepPoints(std::vector & points) = 0; + virtual std::size_t generateSweepPoints(float * d_points, cudaStream_t stream) = 0; bool enqueuePointCloud( - const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const tf2_ros::Buffer & tf_buffer); + const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const tf2_ros::Buffer & tf_buffer, + cudaStream_t stream); protected: std::unique_ptr pd_ptr_{nullptr}; @@ -50,7 +51,7 @@ class VoxelGenerator : public VoxelGeneratorTemplate public: using VoxelGeneratorTemplate::VoxelGeneratorTemplate; - std::size_t generateSweepPoints(std::vector & points) override; + std::size_t generateSweepPoints(float * d_points, cudaStream_t stream) override; }; } // namespace centerpoint diff --git a/perception/lidar_centerpoint/include/lidar_centerpoint/ros_utils.hpp b/perception/lidar_centerpoint/include/lidar_centerpoint/ros_utils.hpp index e33d86cd0aba7..102359e1a43f4 100644 --- a/perception/lidar_centerpoint/include/lidar_centerpoint/ros_utils.hpp +++ b/perception/lidar_centerpoint/include/lidar_centerpoint/ros_utils.hpp @@ -17,12 +17,9 @@ // ros packages cannot be included from cuda. 
-#include +#include "lidar_centerpoint/utils.hpp" -#include -#include -#include -#include +#include "autoware_perception_msgs/msg/detected_objects.hpp" #include #include @@ -38,7 +35,7 @@ uint8_t getSemanticType(const std::string & class_name); std::array convertPoseCovarianceMatrix(const Box3D & box3d); -std::array convertTwistCovarianceMatrix(const Box3D & box3d); +std::array convertTwistCovarianceMatrix(const Box3D & box3d, const float yaw); bool isCarLikeVehicleLabel(const uint8_t label); diff --git a/perception/lidar_centerpoint/lib/centerpoint_trt.cpp b/perception/lidar_centerpoint/lib/centerpoint_trt.cpp index a4281aa8260d7..a80d00a1f004f 100644 --- a/perception/lidar_centerpoint/lib/centerpoint_trt.cpp +++ b/perception/lidar_centerpoint/lib/centerpoint_trt.cpp @@ -14,13 +14,18 @@ #include "lidar_centerpoint/centerpoint_trt.hpp" +#include "lidar_centerpoint/centerpoint_config.hpp" +#include "lidar_centerpoint/network/scatter_kernel.hpp" +#include "lidar_centerpoint/preprocess/preprocess_kernel.hpp" + #include -#include -#include -#include +#include +#include +#include #include #include +#include #include #include @@ -84,7 +89,7 @@ void CenterPointTRT::initPtr() mask_size_ = config_.grid_size_x_ * config_.grid_size_y_; // host - points_.resize(CAPACITY_POINT * config_.point_feature_size_); + points_.resize(config_.cloud_capacity_ * config_.point_feature_size_); // device voxels_d_ = cuda::make_unique(voxels_size_); @@ -99,10 +104,25 @@ void CenterPointTRT::initPtr() head_out_dim_d_ = cuda::make_unique(grid_xy_size * config_.head_out_dim_size_); head_out_rot_d_ = cuda::make_unique(grid_xy_size * config_.head_out_rot_size_); head_out_vel_d_ = cuda::make_unique(grid_xy_size * config_.head_out_vel_size_); - points_d_ = cuda::make_unique(CAPACITY_POINT * config_.point_feature_size_); + points_d_ = cuda::make_unique(config_.cloud_capacity_ * config_.point_feature_size_); voxels_buffer_d_ = cuda::make_unique(voxels_buffer_size_); mask_d_ = cuda::make_unique(mask_size_); num_voxels_d_ = cuda::make_unique(1); + + points_aux_d_ = cuda::make_unique(config_.cloud_capacity_ * config_.point_feature_size_); + shuffle_indices_d_ = cuda::make_unique(config_.cloud_capacity_); + + std::vector indexes(config_.cloud_capacity_); + std::iota(indexes.begin(), indexes.end(), 0); + + std::default_random_engine e(0); + std::shuffle(indexes.begin(), indexes.end(), e); + + std::srand(std::time(nullptr)); + + CHECK_CUDA_ERROR(cudaMemcpyAsync( + shuffle_indices_d_.get(), indexes.data(), config_.cloud_capacity_ * sizeof(unsigned int), + cudaMemcpyHostToDevice, stream_)); } bool CenterPointTRT::detect( @@ -143,14 +163,17 @@ bool CenterPointTRT::detect( bool CenterPointTRT::preprocess( const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const tf2_ros::Buffer & tf_buffer) { - bool is_success = vg_ptr_->enqueuePointCloud(input_pointcloud_msg, tf_buffer); + bool is_success = vg_ptr_->enqueuePointCloud(input_pointcloud_msg, tf_buffer, stream_); if (!is_success) { return false; } - const auto count = vg_ptr_->generateSweepPoints(points_); - CHECK_CUDA_ERROR(cudaMemcpyAsync( - points_d_.get(), points_.data(), count * config_.point_feature_size_ * sizeof(float), - cudaMemcpyHostToDevice, stream_)); + + const std::size_t count = vg_ptr_->generateSweepPoints(points_aux_d_.get(), stream_); + const std::size_t random_offset = std::rand() % config_.cloud_capacity_; + CHECK_CUDA_ERROR(shufflePoints_launch( + points_aux_d_.get(), shuffle_indices_d_.get(), points_d_.get(), count, config_.cloud_capacity_, + random_offset, 
stream_)); + CHECK_CUDA_ERROR(cudaMemsetAsync(num_voxels_d_.get(), 0, sizeof(unsigned int), stream_)); CHECK_CUDA_ERROR( cudaMemsetAsync(voxels_buffer_d_.get(), 0, voxels_buffer_size_ * sizeof(float), stream_)); @@ -162,10 +185,10 @@ bool CenterPointTRT::preprocess( num_points_per_voxel_d_.get(), 0, config_.max_voxel_size_ * sizeof(float), stream_)); CHECK_CUDA_ERROR(generateVoxels_random_launch( - points_d_.get(), count, config_.range_min_x_, config_.range_max_x_, config_.range_min_y_, - config_.range_max_y_, config_.range_min_z_, config_.range_max_z_, config_.voxel_size_x_, - config_.voxel_size_y_, config_.voxel_size_z_, config_.grid_size_y_, config_.grid_size_x_, - mask_d_.get(), voxels_buffer_d_.get(), stream_)); + points_d_.get(), config_.cloud_capacity_, config_.range_min_x_, config_.range_max_x_, + config_.range_min_y_, config_.range_max_y_, config_.range_min_z_, config_.range_max_z_, + config_.voxel_size_x_, config_.voxel_size_y_, config_.voxel_size_z_, config_.grid_size_y_, + config_.grid_size_x_, mask_d_.get(), voxels_buffer_d_.get(), stream_)); CHECK_CUDA_ERROR(generateBaseFeatures_launch( mask_d_.get(), voxels_buffer_d_.get(), config_.grid_size_y_, config_.grid_size_x_, diff --git a/perception/lidar_centerpoint/lib/detection_class_remapper.cpp b/perception/lidar_centerpoint/lib/detection_class_remapper.cpp index ecfbb4360ecf0..9805fc7a661d1 100644 --- a/perception/lidar_centerpoint/lib/detection_class_remapper.cpp +++ b/perception/lidar_centerpoint/lib/detection_class_remapper.cpp @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include +#include "lidar_centerpoint/detection_class_remapper.hpp" namespace centerpoint { diff --git a/perception/lidar_centerpoint/lib/network/scatter_kernel.cu b/perception/lidar_centerpoint/lib/network/scatter_kernel.cu index 09cc83bf606fe..f139e230426ca 100644 --- a/perception/lidar_centerpoint/lib/network/scatter_kernel.cu +++ b/perception/lidar_centerpoint/lib/network/scatter_kernel.cu @@ -13,8 +13,7 @@ // limitations under the License. #include "lidar_centerpoint/network/scatter_kernel.hpp" - -#include +#include "lidar_centerpoint/utils.hpp" namespace { diff --git a/perception/lidar_centerpoint/lib/network/tensorrt_wrapper.cpp b/perception/lidar_centerpoint/lib/network/tensorrt_wrapper.cpp index 91fcce9a80f78..3fc522eea15c9 100644 --- a/perception/lidar_centerpoint/lib/network/tensorrt_wrapper.cpp +++ b/perception/lidar_centerpoint/lib/network/tensorrt_wrapper.cpp @@ -14,7 +14,7 @@ #include "lidar_centerpoint/network/tensorrt_wrapper.hpp" -#include +#include "NvOnnxParser.h" #include #include @@ -80,7 +80,7 @@ bool TensorRTWrapper::createContext() bool TensorRTWrapper::parseONNX( const std::string & onnx_path, const std::string & engine_path, const std::string & precision, - const size_t workspace_size) + const std::size_t workspace_size) { auto builder = tensorrt_common::TrtUniquePtr(nvinfer1::createInferBuilder(logger_)); diff --git a/perception/lidar_centerpoint/lib/postprocess/circle_nms_kernel.cu b/perception/lidar_centerpoint/lib/postprocess/circle_nms_kernel.cu index 398e75a55c44b..c208eefe135fb 100644 --- a/perception/lidar_centerpoint/lib/postprocess/circle_nms_kernel.cu +++ b/perception/lidar_centerpoint/lib/postprocess/circle_nms_kernel.cu @@ -21,12 +21,10 @@ Written by Shaoshuai Shi All Rights Reserved 2019-2020. 
*/ +#include "lidar_centerpoint/cuda_utils.hpp" #include "lidar_centerpoint/postprocess/circle_nms_kernel.hpp" - -#include -#include - -#include +#include "lidar_centerpoint/utils.hpp" +#include "thrust/host_vector.h" namespace { diff --git a/perception/lidar_centerpoint/lib/postprocess/non_maximum_suppression.cpp b/perception/lidar_centerpoint/lib/postprocess/non_maximum_suppression.cpp index d750e4adab0e1..30a54b08b8ede 100644 --- a/perception/lidar_centerpoint/lib/postprocess/non_maximum_suppression.cpp +++ b/perception/lidar_centerpoint/lib/postprocess/non_maximum_suppression.cpp @@ -14,9 +14,10 @@ #include "lidar_centerpoint/postprocess/non_maximum_suppression.hpp" -#include "autoware/universe_utils/geometry/geometry.hpp" -#include "object_recognition_utils/geometry.hpp" -#include "object_recognition_utils/object_recognition_utils.hpp" +#include +#include +#include + namespace centerpoint { using Label = autoware_perception_msgs::msg::ObjectClassification; diff --git a/perception/lidar_centerpoint/lib/postprocess/postprocess_kernel.cu b/perception/lidar_centerpoint/lib/postprocess/postprocess_kernel.cu index 34bbd2811041c..fb8e364b352fc 100644 --- a/perception/lidar_centerpoint/lib/postprocess/postprocess_kernel.cu +++ b/perception/lidar_centerpoint/lib/postprocess/postprocess_kernel.cu @@ -13,12 +13,10 @@ // limitations under the License. #include "lidar_centerpoint/postprocess/circle_nms_kernel.hpp" - -#include - -#include -#include -#include +#include "lidar_centerpoint/postprocess/postprocess_kernel.hpp" +#include "thrust/count.h" +#include "thrust/device_vector.h" +#include "thrust/sort.h" namespace { @@ -139,10 +137,6 @@ __global__ void generateBoxes3D_kernel( PostProcessCUDA::PostProcessCUDA(const CenterPointConfig & config) : config_(config) { - const auto num_raw_boxes3d = config.down_grid_size_y_ * config.down_grid_size_x_; - boxes3d_d_ = thrust::device_vector(num_raw_boxes3d); - yaw_norm_thresholds_d_ = thrust::device_vector( - config_.yaw_norm_thresholds_.begin(), config_.yaw_norm_thresholds_.end()); } // cspell: ignore divup @@ -155,23 +149,26 @@ cudaError_t PostProcessCUDA::generateDetectedBoxes3D_launch( divup(config_.down_grid_size_y_, THREADS_PER_BLOCK), divup(config_.down_grid_size_x_, THREADS_PER_BLOCK)); dim3 threads(THREADS_PER_BLOCK, THREADS_PER_BLOCK); + auto boxes3d_d = + thrust::device_vector(config_.down_grid_size_y_ * config_.down_grid_size_x_); + auto yaw_norm_thresholds_d = thrust::device_vector( + config_.yaw_norm_thresholds_.begin(), config_.yaw_norm_thresholds_.end()); generateBoxes3D_kernel<<>>( out_heatmap, out_offset, out_z, out_dim, out_rot, out_vel, config_.voxel_size_x_, config_.voxel_size_y_, config_.range_min_x_, config_.range_min_y_, config_.down_grid_size_x_, config_.down_grid_size_y_, config_.downsample_factor_, config_.class_size_, - config_.has_variance_, thrust::raw_pointer_cast(yaw_norm_thresholds_d_.data()), - thrust::raw_pointer_cast(boxes3d_d_.data())); + config_.has_variance_, thrust::raw_pointer_cast(yaw_norm_thresholds_d.data()), + thrust::raw_pointer_cast(boxes3d_d.data())); // suppress by score const auto num_det_boxes3d = thrust::count_if( - thrust::device, boxes3d_d_.begin(), boxes3d_d_.end(), - is_score_greater(config_.score_threshold_)); + thrust::device, boxes3d_d.begin(), boxes3d_d.end(), is_score_greater(config_.score_threshold_)); if (num_det_boxes3d == 0) { return cudaGetLastError(); } thrust::device_vector det_boxes3d_d(num_det_boxes3d); thrust::copy_if( - thrust::device, boxes3d_d_.begin(), boxes3d_d_.end(), 
det_boxes3d_d.begin(), + thrust::device, boxes3d_d.begin(), boxes3d_d.end(), det_boxes3d_d.begin(), is_score_greater(config_.score_threshold_)); // sort by score diff --git a/perception/lidar_centerpoint/lib/preprocess/pointcloud_densification.cpp b/perception/lidar_centerpoint/lib/preprocess/pointcloud_densification.cpp index 66c348fda50b4..2859d58d8f669 100644 --- a/perception/lidar_centerpoint/lib/preprocess/pointcloud_densification.cpp +++ b/perception/lidar_centerpoint/lib/preprocess/pointcloud_densification.cpp @@ -14,20 +14,19 @@ #include "lidar_centerpoint/preprocess/pointcloud_densification.hpp" -#include +#include "pcl_conversions/pcl_conversions.h" +#include "pcl_ros/transforms.hpp" -#include +#include "boost/optional.hpp" -#include +#include +#include #ifdef ROS_DISTRO_GALACTIC -#include +#include "tf2_eigen/tf2_eigen.h" #else -#include +#include "tf2_eigen/tf2_eigen.hpp" #endif -#include -#include - namespace { boost::optional getTransform( @@ -61,7 +60,8 @@ PointCloudDensification::PointCloudDensification(const DensificationParam & para } bool PointCloudDensification::enqueuePointCloud( - const sensor_msgs::msg::PointCloud2 & pointcloud_msg, const tf2_ros::Buffer & tf_buffer) + const sensor_msgs::msg::PointCloud2 & pointcloud_msg, const tf2_ros::Buffer & tf_buffer, + cudaStream_t stream) { const auto header = pointcloud_msg.header; @@ -73,9 +73,9 @@ bool PointCloudDensification::enqueuePointCloud( } auto affine_world2current = transformToEigen(transform_world2current.get()); - enqueue(pointcloud_msg, affine_world2current); + enqueue(pointcloud_msg, affine_world2current, stream); } else { - enqueue(pointcloud_msg, Eigen::Affine3f::Identity()); + enqueue(pointcloud_msg, Eigen::Affine3f::Identity(), stream); } dequeue(); @@ -84,12 +84,24 @@ bool PointCloudDensification::enqueuePointCloud( } void PointCloudDensification::enqueue( - const sensor_msgs::msg::PointCloud2 & msg, const Eigen::Affine3f & affine_world2current) + const sensor_msgs::msg::PointCloud2 & msg, const Eigen::Affine3f & affine_world2current, + cudaStream_t stream) { affine_world2current_ = affine_world2current; current_timestamp_ = rclcpp::Time(msg.header.stamp).seconds(); - PointCloudWithTransform pointcloud = {msg, affine_world2current.inverse()}; - pointcloud_cache_.push_front(pointcloud); + + assert(sizeof(uint8_t) * msg.width * msg.height * msg.point_step % sizeof(float) == 0); + auto points_d = cuda::make_unique( + sizeof(uint8_t) * msg.width * msg.height * msg.point_step / sizeof(float)); + CHECK_CUDA_ERROR(cudaMemcpyAsync( + points_d.get(), msg.data.data(), sizeof(uint8_t) * msg.width * msg.height * msg.point_step, + cudaMemcpyHostToDevice, stream)); + + PointCloudWithTransform pointcloud = { + std::move(points_d), msg.header, msg.width * msg.height, msg.point_step, + affine_world2current.inverse()}; + + pointcloud_cache_.push_front(std::move(pointcloud)); } void PointCloudDensification::dequeue() diff --git a/perception/lidar_centerpoint/lib/preprocess/preprocess_kernel.cu b/perception/lidar_centerpoint/lib/preprocess/preprocess_kernel.cu index 218aaee125c02..01f62f22e892b 100644 --- a/perception/lidar_centerpoint/lib/preprocess/preprocess_kernel.cu +++ b/perception/lidar_centerpoint/lib/preprocess/preprocess_kernel.cu @@ -28,9 +28,12 @@ * limitations under the License. 
*/ +#include "lidar_centerpoint/cuda_utils.hpp" #include "lidar_centerpoint/preprocess/preprocess_kernel.hpp" +#include "lidar_centerpoint/utils.hpp" -#include +#include +#include namespace { @@ -41,10 +44,97 @@ const std::size_t ENCODER_IN_FEATURE_SIZE = 9; // the same as encoder_in_featur namespace centerpoint { + +__global__ void generateSweepPoints_kernel( + const float * input_points, std::size_t points_size, int input_point_step, float time_lag, + const float * transform_array, int num_features, float * output_points) +{ + int point_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (point_idx >= points_size) return; + + const float input_x = input_points[point_idx * input_point_step + 0]; + const float input_y = input_points[point_idx * input_point_step + 1]; + const float input_z = input_points[point_idx * input_point_step + 2]; + + // transform_array is expected to be column-major + output_points[point_idx * num_features + 0] = transform_array[0] * input_x + + transform_array[4] * input_y + + transform_array[8] * input_z + transform_array[12]; + output_points[point_idx * num_features + 1] = transform_array[1] * input_x + + transform_array[5] * input_y + + transform_array[9] * input_z + transform_array[13]; + output_points[point_idx * num_features + 2] = transform_array[2] * input_x + + transform_array[6] * input_y + + transform_array[10] * input_z + transform_array[14]; + output_points[point_idx * num_features + 3] = time_lag; +} + +cudaError_t generateSweepPoints_launch( + const float * input_points, std::size_t points_size, int input_point_step, float time_lag, + const float * transform_array, int num_features, float * output_points, cudaStream_t stream) +{ + auto transform_d = cuda::make_unique(16); + CHECK_CUDA_ERROR(cudaMemcpyAsync( + transform_d.get(), transform_array, 16 * sizeof(float), cudaMemcpyHostToDevice, stream)); + + dim3 blocks((points_size + 256 - 1) / 256); + dim3 threads(256); + assert(num_features == 4); + + generateSweepPoints_kernel<<>>( + input_points, points_size, input_point_step, time_lag, transform_d.get(), num_features, + output_points); + + cudaError_t err = cudaGetLastError(); + return err; +} + +__global__ void shufflePoints_kernel( + const float * points, const unsigned int * indices, float * shuffled_points, + const std::size_t points_size, const std::size_t max_size, const std::size_t offset) +{ + int point_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (point_idx >= max_size) return; + + int src_idx = indices[(point_idx + offset) % max_size]; + int dst_idx = point_idx; + + if (dst_idx >= points_size) { + shuffled_points[4 * dst_idx + 0] = INFINITY; + shuffled_points[4 * dst_idx + 1] = INFINITY; + shuffled_points[4 * dst_idx + 2] = INFINITY; + shuffled_points[4 * dst_idx + 3] = INFINITY; + } else { + shuffled_points[4 * dst_idx + 0] = points[4 * src_idx + 0]; + shuffled_points[4 * dst_idx + 1] = points[4 * src_idx + 1]; + shuffled_points[4 * dst_idx + 2] = points[4 * src_idx + 2]; + shuffled_points[4 * dst_idx + 3] = points[4 * src_idx + 3]; + } +} + +cudaError_t shufflePoints_launch( + const float * points, const unsigned int * indices, float * shuffled_points, + const std::size_t points_size, const std::size_t max_size, const std::size_t offset, + cudaStream_t stream) +{ + dim3 blocks((max_size + 256 - 1) / 256); + dim3 threads(256); + + if (blocks.x == 0) { + return cudaGetLastError(); + } + + shufflePoints_kernel<<>>( + points, indices, shuffled_points, points_size, max_size, offset); + cudaError_t err = cudaGetLastError(); + return err; +} 
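+// Note on shufflePoints_launch: the kernel gathers points through a precomputed random
+// permutation (indices), rotated by a caller-supplied offset, so each call sees a
+// different pseudo-random ordering of the capacity-sized buffer. Output slots with an
+// index of at least points_size are filled with INFINITY, so the range check in the
+// voxelization kernel below discards them.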
+ __global__ void generateVoxels_random_kernel( - const float * points, size_t points_size, float min_x_range, float max_x_range, float min_y_range, - float max_y_range, float min_z_range, float max_z_range, float pillar_x_size, float pillar_y_size, - float pillar_z_size, int grid_y_size, int grid_x_size, unsigned int * mask, float * voxels) + const float * points, std::size_t points_size, float min_x_range, float max_x_range, + float min_y_range, float max_y_range, float min_z_range, float max_z_range, float pillar_x_size, + float pillar_y_size, float pillar_z_size, int grid_y_size, int grid_x_size, unsigned int * mask, + float * voxels) { int point_idx = blockIdx.x * blockDim.x + threadIdx.x; if (point_idx >= points_size) return; @@ -71,10 +161,10 @@ __global__ void generateVoxels_random_kernel( } cudaError_t generateVoxels_random_launch( - const float * points, size_t points_size, float min_x_range, float max_x_range, float min_y_range, - float max_y_range, float min_z_range, float max_z_range, float pillar_x_size, float pillar_y_size, - float pillar_z_size, int grid_y_size, int grid_x_size, unsigned int * mask, float * voxels, - cudaStream_t stream) + const float * points, std::size_t points_size, float min_x_range, float max_x_range, + float min_y_range, float max_y_range, float min_z_range, float max_z_range, float pillar_x_size, + float pillar_y_size, float pillar_z_size, int grid_y_size, int grid_x_size, unsigned int * mask, + float * voxels, cudaStream_t stream) { dim3 blocks((points_size + 256 - 1) / 256); dim3 threads(256); diff --git a/perception/lidar_centerpoint/lib/preprocess/voxel_generator.cpp b/perception/lidar_centerpoint/lib/preprocess/voxel_generator.cpp index 07a1a2de725f5..9148ed993c9c3 100644 --- a/perception/lidar_centerpoint/lib/preprocess/voxel_generator.cpp +++ b/perception/lidar_centerpoint/lib/preprocess/voxel_generator.cpp @@ -14,7 +14,12 @@ #include "lidar_centerpoint/preprocess/voxel_generator.hpp" -#include +#include "lidar_centerpoint/centerpoint_trt.hpp" +#include "lidar_centerpoint/preprocess/preprocess_kernel.hpp" + +#include "sensor_msgs/point_cloud2_iterator.hpp" + +#include namespace centerpoint { @@ -38,35 +43,40 @@ VoxelGeneratorTemplate::VoxelGeneratorTemplate( } bool VoxelGeneratorTemplate::enqueuePointCloud( - const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const tf2_ros::Buffer & tf_buffer) + const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const tf2_ros::Buffer & tf_buffer, + cudaStream_t stream) { - return pd_ptr_->enqueuePointCloud(input_pointcloud_msg, tf_buffer); + return pd_ptr_->enqueuePointCloud(input_pointcloud_msg, tf_buffer, stream); } -std::size_t VoxelGenerator::generateSweepPoints(std::vector & points) +std::size_t VoxelGenerator::generateSweepPoints(float * points_d, cudaStream_t stream) { - Eigen::Vector3f point_current, point_past; - size_t point_counter{}; + std::size_t point_counter = 0; for (auto pc_cache_iter = pd_ptr_->getPointCloudCacheIter(); !pd_ptr_->isCacheEnd(pc_cache_iter); pc_cache_iter++) { - auto pc_msg = pc_cache_iter->pointcloud_msg; + auto sweep_num_points = pc_cache_iter->num_points; + auto point_step = pc_cache_iter->point_step; auto affine_past2current = pd_ptr_->getAffineWorldToCurrent() * pc_cache_iter->affine_past2world; float time_lag = static_cast( - pd_ptr_->getCurrentTimestamp() - rclcpp::Time(pc_msg.header.stamp).seconds()); - - for (sensor_msgs::PointCloud2ConstIterator x_iter(pc_msg, "x"), y_iter(pc_msg, "y"), - z_iter(pc_msg, "z"); - x_iter != x_iter.end(); ++x_iter, 
++y_iter, ++z_iter) { - point_past << *x_iter, *y_iter, *z_iter; - point_current = affine_past2current * point_past; + pd_ptr_->getCurrentTimestamp() - rclcpp::Time(pc_cache_iter->header.stamp).seconds()); - points.at(point_counter * config_.point_feature_size_) = point_current.x(); - points.at(point_counter * config_.point_feature_size_ + 1) = point_current.y(); - points.at(point_counter * config_.point_feature_size_ + 2) = point_current.z(); - points.at(point_counter * config_.point_feature_size_ + 3) = time_lag; - ++point_counter; + if (point_counter + sweep_num_points > config_.cloud_capacity_) { + RCLCPP_WARN_STREAM( + rclcpp::get_logger("lidar_centerpoint"), + "Requested number of points exceeds the maximum capacity. Current points = " + << point_counter); + break; } + + static_assert(std::is_same::value); + static_assert(!Eigen::Matrix4f::IsRowMajor, "matrices should be col-major."); + generateSweepPoints_launch( + pc_cache_iter->points_d.get(), sweep_num_points, point_step / sizeof(float), time_lag, + affine_past2current.matrix().data(), config_.point_feature_size_, + points_d + config_.point_feature_size_ * point_counter, stream); + + point_counter += sweep_num_points; } return point_counter; } diff --git a/perception/lidar_centerpoint/lib/ros_utils.cpp b/perception/lidar_centerpoint/lib/ros_utils.cpp index 039d340cc2dc4..f9dfbee73a88c 100644 --- a/perception/lidar_centerpoint/lib/ros_utils.cpp +++ b/perception/lidar_centerpoint/lib/ros_utils.cpp @@ -50,7 +50,7 @@ void box3DToDetectedObject( // pose and shape // mmdet3d yaw format to ros yaw format - float yaw = -box3d.yaw - autoware::universe_utils::pi / 2; + const float yaw = -box3d.yaw - autoware::universe_utils::pi / 2; obj.kinematics.pose_with_covariance.pose.position = autoware::universe_utils::createPoint(box3d.x, box3d.y, box3d.z); obj.kinematics.pose_with_covariance.pose.orientation = @@ -67,14 +67,18 @@ void box3DToDetectedObject( if (has_twist) { float vel_x = box3d.vel_x; float vel_y = box3d.vel_y; + + // twist of the object is based on the object coordinate system geometry_msgs::msg::Twist twist; - twist.linear.x = std::sqrt(std::pow(vel_x, 2) + std::pow(vel_y, 2)); - twist.angular.z = 2 * (std::atan2(vel_y, vel_x) - yaw); + twist.linear.x = std::cos(yaw) * vel_x + std::sin(yaw) * vel_y; + twist.linear.y = -std::sin(yaw) * vel_x + std::cos(yaw) * vel_y; + twist.angular.z = 0; // angular velocity is not supported + obj.kinematics.twist_with_covariance.twist = twist; obj.kinematics.has_twist = has_twist; if (has_variance) { obj.kinematics.has_twist_covariance = has_variance; - obj.kinematics.twist_with_covariance.covariance = convertTwistCovarianceMatrix(box3d); + obj.kinematics.twist_with_covariance.covariance = convertTwistCovarianceMatrix(box3d, yaw); } } } @@ -111,12 +115,21 @@ std::array convertPoseCovarianceMatrix(const Box3D & box3d) return pose_covariance; } -std::array convertTwistCovarianceMatrix(const Box3D & box3d) +std::array convertTwistCovarianceMatrix(const Box3D & box3d, const float yaw) { using POSE_IDX = autoware::universe_utils::xyzrpy_covariance_index::XYZRPY_COV_IDX; + + // twist covariance matrix is based on the object coordinate system std::array twist_covariance{}; - twist_covariance[POSE_IDX::X_X] = box3d.vel_x_variance; - twist_covariance[POSE_IDX::Y_Y] = box3d.vel_y_variance; + const float cos_yaw = std::cos(yaw); + const float sin_yaw = std::sin(yaw); + twist_covariance[POSE_IDX::X_X] = + box3d.vel_x_variance * cos_yaw * cos_yaw + box3d.vel_y_variance * sin_yaw * sin_yaw; + 
twist_covariance[POSE_IDX::X_Y] = + (box3d.vel_y_variance - box3d.vel_x_variance) * sin_yaw * cos_yaw; + twist_covariance[POSE_IDX::Y_X] = twist_covariance[POSE_IDX::X_Y]; + twist_covariance[POSE_IDX::Y_Y] = + box3d.vel_x_variance * sin_yaw * sin_yaw + box3d.vel_y_variance * cos_yaw * cos_yaw; return twist_covariance; } diff --git a/perception/lidar_centerpoint/lib/utils.cpp b/perception/lidar_centerpoint/lib/utils.cpp index b6e0a54ab6de9..ade0bce8a4ff3 100644 --- a/perception/lidar_centerpoint/lib/utils.cpp +++ b/perception/lidar_centerpoint/lib/utils.cpp @@ -19,7 +19,7 @@ namespace centerpoint { // cspell: ignore divup -std::size_t divup(const std::size_t a, const std::size_t b) +std::size_t divup(const std::size_t a, const std::size_t b) // cppcheck-suppress unusedFunction { if (a == 0) { throw std::runtime_error("A dividend of divup isn't positive."); diff --git a/perception/lidar_centerpoint/package.xml b/perception/lidar_centerpoint/package.xml index 280e2219fb1f1..4c6fa471c297a 100644 --- a/perception/lidar_centerpoint/package.xml +++ b/perception/lidar_centerpoint/package.xml @@ -16,7 +16,6 @@ autoware_universe_utils object_recognition_utils pcl_ros - python3-open3d rclcpp rclcpp_components tensorrt_common diff --git a/perception/lidar_centerpoint/schema/centerpoint.schemal.json b/perception/lidar_centerpoint/schema/centerpoint.schemal.json index b11c115cc2466..c3268f0e90d9b 100644 --- a/perception/lidar_centerpoint/schema/centerpoint.schemal.json +++ b/perception/lidar_centerpoint/schema/centerpoint.schemal.json @@ -12,6 +12,12 @@ "default": "fp16", "enum": ["fp32", "fp16"] }, + "cloud_capacity": { + "type": "integer", + "description": "Capacity of the point cloud buffer (should be set to at least the maximum theoretical number of points).", + "default": 2000000, + "minimum": 1 + }, "post_process_params": { "type": "object", "properties": { diff --git a/perception/lidar_centerpoint/src/node.cpp b/perception/lidar_centerpoint/src/node.cpp index f9687a6342c9c..c42ef785aed51 100644 --- a/perception/lidar_centerpoint/src/node.cpp +++ b/perception/lidar_centerpoint/src/node.cpp @@ -14,17 +14,7 @@ #include "lidar_centerpoint/node.hpp" -#include -#include -#include -#include -#include - -#ifdef ROS_DISTRO_GALACTIC -#include -#else -#include -#endif +#include "pcl_ros/transforms.hpp" #include #include @@ -33,6 +23,17 @@ #include #include +#ifdef ROS_DISTRO_GALACTIC +#include "tf2_geometry_msgs/tf2_geometry_msgs.h" +#else +#include "tf2_geometry_msgs/tf2_geometry_msgs.hpp" +#endif + +#include "lidar_centerpoint/centerpoint_config.hpp" +#include "lidar_centerpoint/preprocess/pointcloud_densification.hpp" +#include "lidar_centerpoint/ros_utils.hpp" +#include "lidar_centerpoint/utils.hpp" + namespace centerpoint { LidarCenterPointNode::LidarCenterPointNode(const rclcpp::NodeOptions & node_options) @@ -49,6 +50,9 @@ LidarCenterPointNode::LidarCenterPointNode(const rclcpp::NodeOptions & node_opti const int densification_num_past_frames = this->declare_parameter("densification_params.num_past_frames"); const std::string trt_precision = this->declare_parameter("trt_precision"); + const std::size_t cloud_capacity = this->declare_parameter( + "cloud_capacity", 2000000); // NOTE(knzo25): this is a temporal hack since the launcher for + // this release does not have the corresponding parameter yet const std::string encoder_onnx_path = this->declare_parameter("encoder_onnx_path"); const std::string encoder_engine_path = this->declare_parameter("encoder_engine_path"); @@ -103,9 +107,9 @@ 
LidarCenterPointNode::LidarCenterPointNode(const rclcpp::NodeOptions & node_opti "The size of voxel_size != 3: use the default parameters."); } CenterPointConfig config( - class_names_.size(), point_feature_size, max_voxel_size, point_cloud_range, voxel_size, - downsample_factor, encoder_in_feature_size, score_threshold, circle_nms_dist_threshold, - yaw_norm_thresholds, has_variance_); + class_names_.size(), point_feature_size, cloud_capacity, max_voxel_size, point_cloud_range, + voxel_size, downsample_factor, encoder_in_feature_size, score_threshold, + circle_nms_dist_threshold, yaw_norm_thresholds, has_variance_); detector_ptr_ = std::make_unique(encoder_param, head_param, densification_param, config); diff --git a/perception/lidar_centerpoint/test/test_postprocess_kernel.cpp b/perception/lidar_centerpoint/test/test_postprocess_kernel.cpp index fde4fcbee3d6c..cf09df79a96d3 100644 --- a/perception/lidar_centerpoint/test/test_postprocess_kernel.cpp +++ b/perception/lidar_centerpoint/test/test_postprocess_kernel.cpp @@ -29,6 +29,7 @@ void PostprocessKernelTest::SetUp() constexpr std::size_t class_size{5}; constexpr std::size_t point_feature_size{4}; + const std::size_t cloud_capacity{2000000}; constexpr std::size_t max_voxel_size{100000000}; const std::vector point_cloud_range{-76.8, -76.8, -4.0, 76.8, 76.8, 6.0}; const std::vector voxel_size{0.32, 0.32, 10.0}; @@ -40,7 +41,7 @@ void PostprocessKernelTest::SetUp() constexpr bool has_variance{false}; config_ptr_ = std::make_unique( - class_size, point_feature_size, max_voxel_size, point_cloud_range, voxel_size, + class_size, point_feature_size, cloud_capacity, max_voxel_size, point_cloud_range, voxel_size, downsample_factor, encoder_in_feature_size, score_threshold, circle_nms_dist_threshold, yaw_norm_thresholds, has_variance); @@ -94,12 +95,12 @@ TEST_F(PostprocessKernelTest, SingleDetectionTest) constexpr float detection_x = 70.f; constexpr float detection_y = -38.4f; constexpr float detection_z = 1.0; - constexpr float detection_log_w = std::log(7.0); - constexpr float detection_log_l = std::log(1.0); - constexpr float detection_log_h = std::log(2.0); + const float detection_log_w = std::log(7.0); + const float detection_log_l = std::log(1.0); + const float detection_log_h = std::log(2.0); constexpr float detection_yaw = M_PI_4; - constexpr float detection_yaw_sin = std::sin(detection_yaw); - constexpr float detection_yaw_cos = std::sin(detection_yaw); + const float detection_yaw_sin = std::sin(detection_yaw); + const float detection_yaw_cos = std::sin(detection_yaw); constexpr float detection_vel_x = 5.0; constexpr float detection_vel_y = -5.0; @@ -240,9 +241,9 @@ TEST_F(PostprocessKernelTest, CircleNMSTest) constexpr float detection_x = 70.f; constexpr float detection_y = -38.4f; constexpr float detection_z = 1.0; - constexpr float detection_log_w = std::log(7.0); - constexpr float detection_log_l = std::log(1.0); - constexpr float detection_log_h = std::log(2.0); + const float detection_log_w = std::log(7.0); + const float detection_log_l = std::log(1.0); + const float detection_log_h = std::log(2.0); constexpr float detection_yaw1_sin = 0.0; constexpr float detection_yaw1_cos = 1.0; constexpr float detection_yaw2_sin = 1.0; diff --git a/perception/lidar_centerpoint/test/test_postprocess_kernel.hpp b/perception/lidar_centerpoint/test/test_postprocess_kernel.hpp index d0c9123da9e77..8f35b98119afe 100644 --- a/perception/lidar_centerpoint/test/test_postprocess_kernel.hpp +++ b/perception/lidar_centerpoint/test/test_postprocess_kernel.hpp 
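Aside: cloud_capacity now flows from the parameter file (schema default 2000000, minimum 1) through the node into CenterPointConfig, with a temporary in-code default as noted in node.cpp above. The helper below is only a hedged sketch of declaring and validating such a parameter; it is illustrative and not the code added in this patch.

#include <cstddef>
#include <cstdint>
#include <stdexcept>

#include "rclcpp/rclcpp.hpp"

// Illustrative helper (not part of this patch): read cloud_capacity with a
// fallback default matching the schema, and reject non-positive values.
inline std::size_t declareCloudCapacity(rclcpp::Node & node)
{
  const std::int64_t capacity = node.declare_parameter<std::int64_t>("cloud_capacity", 2000000);
  if (capacity < 1) {
    throw std::invalid_argument("cloud_capacity must be at least 1 point");
  }
  return static_cast<std::size_t>(capacity);
}

The same default and minimum appear in the centerpoint and transfusion schema entries added by this change.
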
@@ -31,17 +31,17 @@ class PostprocessKernelTest : public testing::Test void SetUp() override; void TearDown() override; - cudaStream_t stream_{nullptr}; + cudaStream_t stream_{}; - std::unique_ptr postprocess_cuda_ptr_; - std::unique_ptr config_ptr_; + std::unique_ptr postprocess_cuda_ptr_{}; + std::unique_ptr config_ptr_{}; - cuda::unique_ptr head_out_heatmap_d_; - cuda::unique_ptr head_out_offset_d_; - cuda::unique_ptr head_out_z_d_; - cuda::unique_ptr head_out_dim_d_; - cuda::unique_ptr head_out_rot_d_; - cuda::unique_ptr head_out_vel_d_; + cuda::unique_ptr head_out_heatmap_d_{}; + cuda::unique_ptr head_out_offset_d_{}; + cuda::unique_ptr head_out_z_d_{}; + cuda::unique_ptr head_out_dim_d_{}; + cuda::unique_ptr head_out_rot_d_{}; + cuda::unique_ptr head_out_vel_d_{}; }; } // namespace centerpoint diff --git a/perception/lidar_centerpoint/test/test_preprocess_kernel.hpp b/perception/lidar_centerpoint/test/test_preprocess_kernel.hpp index 57ff51966a417..1a3e850026886 100644 --- a/perception/lidar_centerpoint/test/test_preprocess_kernel.hpp +++ b/perception/lidar_centerpoint/test/test_preprocess_kernel.hpp @@ -28,7 +28,7 @@ class PreprocessKernelTest : public testing::Test void SetUp() override; void TearDown() override; - cudaStream_t stream_{nullptr}; + cudaStream_t stream_{}; std::size_t max_voxel_size_{}; std::size_t max_point_in_voxel_size_{}; diff --git a/perception/lidar_centerpoint/test/test_ros_utils.cpp b/perception/lidar_centerpoint/test/test_ros_utils.cpp index 6044a3b6e77c9..2c900da4e2e0f 100644 --- a/perception/lidar_centerpoint/test/test_ros_utils.cpp +++ b/perception/lidar_centerpoint/test/test_ros_utils.cpp @@ -122,12 +122,13 @@ TEST(TestSuite, convertPoseCovarianceMatrix) TEST(TestSuite, convertTwistCovarianceMatrix) { centerpoint::Box3D box3d; - box3d.vel_x_variance = 0.1; + box3d.vel_x_variance = 0.5; box3d.vel_y_variance = 0.2; + float yaw = 0; - std::array twist_covariance = centerpoint::convertTwistCovarianceMatrix(box3d); + std::array twist_covariance = centerpoint::convertTwistCovarianceMatrix(box3d, yaw); - EXPECT_FLOAT_EQ(twist_covariance[0], 0.1); + EXPECT_FLOAT_EQ(twist_covariance[0], 0.5); EXPECT_FLOAT_EQ(twist_covariance[7], 0.2); } diff --git a/perception/lidar_centerpoint/test/test_voxel_generator.cpp b/perception/lidar_centerpoint/test/test_voxel_generator.cpp index 5b0b3cd112e6c..ea4565aba5a5c 100644 --- a/perception/lidar_centerpoint/test/test_voxel_generator.cpp +++ b/perception/lidar_centerpoint/test/test_voxel_generator.cpp @@ -14,9 +14,9 @@ #include "test_voxel_generator.hpp" -#include +#include "gtest/gtest.h" -#include +#include "sensor_msgs/point_cloud2_iterator.hpp" void VoxelGeneratorTest::SetUp() { @@ -31,6 +31,7 @@ void VoxelGeneratorTest::SetUp() class_size_ = 5; point_feature_size_ = 4; + cloud_capacity_ = 2000000; max_voxel_size_ = 100000000; point_cloud_range_ = std::vector{-76.8, -76.8, -4.0, 76.8, 76.8, 6.0}; voxel_size_ = std::vector{0.32, 0.32, 10.0}; @@ -95,6 +96,8 @@ void VoxelGeneratorTest::SetUp() transform2_ = transform1_; transform2_.header.stamp = cloud2_->header.stamp; transform2_.transform.translation.x = world_origin_x + delta_pointcloud_x_; + + cudaStreamCreate(&stream_); } void VoxelGeneratorTest::TearDown() @@ -108,17 +111,26 @@ TEST_F(VoxelGeneratorTest, SingleFrame) centerpoint::DensificationParam param(world_frame_, num_past_frames); centerpoint::CenterPointConfig config( - class_size_, point_feature_size_, max_voxel_size_, point_cloud_range_, voxel_size_, - downsample_factor_, encoder_in_feature_size_, 
score_threshold_, circle_nms_dist_threshold_, - yaw_norm_thresholds_, has_variance_); + class_size_, point_feature_size_, cloud_capacity_, max_voxel_size_, point_cloud_range_, + voxel_size_, downsample_factor_, encoder_in_feature_size_, score_threshold_, + circle_nms_dist_threshold_, yaw_norm_thresholds_, has_variance_); centerpoint::VoxelGenerator voxel_generator(param, config); std::vector points; points.resize(capacity_ * config.point_feature_size_); std::fill(points.begin(), points.end(), std::nan("")); - bool status1 = voxel_generator.enqueuePointCloud(*cloud1_, *tf2_buffer_); - std::size_t generated_points_num = voxel_generator.generateSweepPoints(points); + auto points_d = cuda::make_unique(capacity_ * config.point_feature_size_); + cudaMemcpy( + points_d.get(), points.data(), capacity_ * config.point_feature_size_ * sizeof(float), + cudaMemcpyHostToDevice); + + bool status1 = voxel_generator.enqueuePointCloud(*cloud1_, *tf2_buffer_, stream_); + std::size_t generated_points_num = voxel_generator.generateSweepPoints(points_d.get(), stream_); + + cudaMemcpy( + points.data(), points_d.get(), capacity_ * config.point_feature_size_ * sizeof(float), + cudaMemcpyDeviceToHost); EXPECT_TRUE(status1); EXPECT_EQ(points_per_pointcloud_, generated_points_num); @@ -146,18 +158,27 @@ TEST_F(VoxelGeneratorTest, TwoFramesNoTf) centerpoint::DensificationParam param(world_frame_, num_past_frames); centerpoint::CenterPointConfig config( - class_size_, point_feature_size_, max_voxel_size_, point_cloud_range_, voxel_size_, - downsample_factor_, encoder_in_feature_size_, score_threshold_, circle_nms_dist_threshold_, - yaw_norm_thresholds_, has_variance_); + class_size_, point_feature_size_, cloud_capacity_, max_voxel_size_, point_cloud_range_, + voxel_size_, downsample_factor_, encoder_in_feature_size_, score_threshold_, + circle_nms_dist_threshold_, yaw_norm_thresholds_, has_variance_); centerpoint::VoxelGenerator voxel_generator(param, config); std::vector points; points.resize(capacity_ * config.point_feature_size_); std::fill(points.begin(), points.end(), std::nan("")); - bool status1 = voxel_generator.enqueuePointCloud(*cloud1_, *tf2_buffer_); - bool status2 = voxel_generator.enqueuePointCloud(*cloud2_, *tf2_buffer_); - std::size_t generated_points_num = voxel_generator.generateSweepPoints(points); + auto points_d = cuda::make_unique(capacity_ * config.point_feature_size_); + cudaMemcpy( + points_d.get(), points.data(), capacity_ * config.point_feature_size_ * sizeof(float), + cudaMemcpyHostToDevice); + + bool status1 = voxel_generator.enqueuePointCloud(*cloud1_, *tf2_buffer_, stream_); + bool status2 = voxel_generator.enqueuePointCloud(*cloud2_, *tf2_buffer_, stream_); + std::size_t generated_points_num = voxel_generator.generateSweepPoints(points_d.get(), stream_); + + cudaMemcpy( + points.data(), points_d.get(), capacity_ * config.point_feature_size_ * sizeof(float), + cudaMemcpyDeviceToHost); EXPECT_FALSE(status1); EXPECT_FALSE(status2); @@ -171,21 +192,30 @@ TEST_F(VoxelGeneratorTest, TwoFrames) centerpoint::DensificationParam param(world_frame_, num_past_frames); centerpoint::CenterPointConfig config( - class_size_, point_feature_size_, max_voxel_size_, point_cloud_range_, voxel_size_, - downsample_factor_, encoder_in_feature_size_, score_threshold_, circle_nms_dist_threshold_, - yaw_norm_thresholds_, has_variance_); + class_size_, point_feature_size_, cloud_capacity_, max_voxel_size_, point_cloud_range_, + voxel_size_, downsample_factor_, encoder_in_feature_size_, score_threshold_, + 
circle_nms_dist_threshold_, yaw_norm_thresholds_, has_variance_); centerpoint::VoxelGenerator voxel_generator(param, config); std::vector points; points.resize(capacity_ * config.point_feature_size_); std::fill(points.begin(), points.end(), std::nan("")); + auto points_d = cuda::make_unique(capacity_ * config.point_feature_size_); + cudaMemcpy( + points_d.get(), points.data(), capacity_ * config.point_feature_size_ * sizeof(float), + cudaMemcpyHostToDevice); + tf2_buffer_->setTransform(transform1_, "authority1"); tf2_buffer_->setTransform(transform2_, "authority1"); - bool status1 = voxel_generator.enqueuePointCloud(*cloud1_, *tf2_buffer_); - bool status2 = voxel_generator.enqueuePointCloud(*cloud2_, *tf2_buffer_); - std::size_t generated_points_num = voxel_generator.generateSweepPoints(points); + bool status1 = voxel_generator.enqueuePointCloud(*cloud1_, *tf2_buffer_, stream_); + bool status2 = voxel_generator.enqueuePointCloud(*cloud2_, *tf2_buffer_, stream_); + std::size_t generated_points_num = voxel_generator.generateSweepPoints(points_d.get(), stream_); + + cudaMemcpy( + points.data(), points_d.get(), capacity_ * config.point_feature_size_ * sizeof(float), + cudaMemcpyDeviceToHost); EXPECT_TRUE(status1); EXPECT_TRUE(status2); diff --git a/perception/lidar_centerpoint/test/test_voxel_generator.hpp b/perception/lidar_centerpoint/test/test_voxel_generator.hpp index 8fb7d8dffa44d..5ca97fedc368c 100644 --- a/perception/lidar_centerpoint/test/test_voxel_generator.hpp +++ b/perception/lidar_centerpoint/test/test_voxel_generator.hpp @@ -33,34 +33,37 @@ class VoxelGeneratorTest : public testing::Test void TearDown() override; // These need to be public so that they can be accessed in the test cases - rclcpp::Node::SharedPtr node_; + rclcpp::Node::SharedPtr node_{}; - std::unique_ptr cloud1_, cloud2_; - geometry_msgs::msg::TransformStamped transform1_, transform2_; + std::unique_ptr cloud1_{}, cloud2_{}; + geometry_msgs::msg::TransformStamped transform1_{}, transform2_{}; - std::unique_ptr densification_param_ptr_; - std::unique_ptr config_ptr_; + std::unique_ptr densification_param_ptr_{}; + std::unique_ptr config_ptr_{}; - std::unique_ptr tf2_buffer_; + std::unique_ptr tf2_buffer_{}; - std::string world_frame_; - std::string lidar_frame_; - std::size_t points_per_pointcloud_; - std::size_t capacity_; - double delta_pointcloud_x_; + std::string world_frame_{}; + std::string lidar_frame_{}; + std::size_t points_per_pointcloud_{}; + std::size_t capacity_{}; + double delta_pointcloud_x_{}; - std::size_t class_size_; - float point_feature_size_; - std::size_t max_voxel_size_; + std::size_t class_size_{}; + float point_feature_size_{}; + std::size_t cloud_capacity_{}; + std::size_t max_voxel_size_{}; - std::vector point_cloud_range_; - std::vector voxel_size_; - std::size_t downsample_factor_; - std::size_t encoder_in_feature_size_; - float score_threshold_; - float circle_nms_dist_threshold_; - std::vector yaw_norm_thresholds_; - bool has_variance_; + std::vector point_cloud_range_{}; + std::vector voxel_size_{}; + std::size_t downsample_factor_{}; + std::size_t encoder_in_feature_size_{}; + float score_threshold_{}; + float circle_nms_dist_threshold_{}; + std::vector yaw_norm_thresholds_{}; + bool has_variance_{}; + + cudaStream_t stream_{}; }; #endif // TEST_VOXEL_GENERATOR_HPP_ diff --git a/perception/lidar_transfusion/config/transfusion.param.yaml b/perception/lidar_transfusion/config/transfusion.param.yaml index feabe359caf1f..2f1d56872dad3 100644 --- 
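Aside: the reworked voxel generator tests above move the sweep-point buffer to the GPU: the host vector is copied to a device buffer, enqueuePointCloud/generateSweepPoints run on a CUDA stream, and the result is copied back before the EXPECT_* checks. Below is a condensed sketch of that round trip using only the CUDA runtime API; the tests themselves use cuda::make_unique and synchronous cudaMemcpy, while this sketch uses raw cudaMalloc and asynchronous copies on the same stream, which serves the same purpose here.

#include <cuda_runtime_api.h>

#include <cstddef>
#include <vector>

// Condensed host<->device round trip (sketch only; detector calls are elided).
inline void copyRoundTrip(std::vector<float> & points, cudaStream_t stream)
{
  float * points_d = nullptr;
  const std::size_t bytes = points.size() * sizeof(float);

  cudaMalloc(reinterpret_cast<void **>(&points_d), bytes);
  cudaMemcpyAsync(points_d, points.data(), bytes, cudaMemcpyHostToDevice, stream);

  // ... enqueuePointCloud(cloud, tf_buffer, stream) and
  //     generateSweepPoints(points_d, stream) would be issued here ...

  cudaMemcpyAsync(points.data(), points_d, bytes, cudaMemcpyDeviceToHost, stream);
  cudaStreamSynchronize(stream);  // results must be visible on the host before assertions
  cudaFree(points_d);
}
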
a/perception/lidar_transfusion/config/transfusion.param.yaml +++ b/perception/lidar_transfusion/config/transfusion.param.yaml @@ -3,6 +3,7 @@ # network class_names: ["CAR", "TRUCK", "BUS", "BICYCLE", "PEDESTRIAN"] trt_precision: fp16 + cloud_capacity: 2000000 voxels_num: [5000, 30000, 60000] # [min, opt, max] point_cloud_range: [-76.8, -76.8, -3.0, 76.8, 76.8, 5.0] # [x_min, y_min, z_min, x_max, y_max, z_max] voxel_size: [0.3, 0.3, 8.0] # [x, y, z] diff --git a/perception/lidar_transfusion/include/lidar_transfusion/cuda_utils.hpp b/perception/lidar_transfusion/include/lidar_transfusion/cuda_utils.hpp index 5c25a936d5392..0f7c72d065193 100644 --- a/perception/lidar_transfusion/include/lidar_transfusion/cuda_utils.hpp +++ b/perception/lidar_transfusion/include/lidar_transfusion/cuda_utils.hpp @@ -89,13 +89,13 @@ cuda::unique_ptr make_unique() return cuda::unique_ptr{p}; } -constexpr size_t CUDA_ALIGN = 256; +constexpr std::size_t CUDA_ALIGN = 256; template -inline size_t get_size_aligned(size_t num_elem) +inline std::size_t get_size_aligned(size_t num_elem) { - size_t size = num_elem * sizeof(T); - size_t extra_align = 0; + std::size_t size = num_elem * sizeof(T); + std::size_t extra_align = 0; if (size % CUDA_ALIGN != 0) { extra_align = CUDA_ALIGN - size % CUDA_ALIGN; } @@ -103,9 +103,9 @@ inline size_t get_size_aligned(size_t num_elem) } template -inline T * get_next_ptr(size_t num_elem, void *& workspace, size_t & workspace_size) +inline T * get_next_ptr(size_t num_elem, void *& workspace, std::size_t & workspace_size) { - size_t size = get_size_aligned(num_elem); + std::size_t size = get_size_aligned(num_elem); if (size > workspace_size) { throw ::std::runtime_error("Workspace is too small!"); } diff --git a/perception/lidar_transfusion/include/lidar_transfusion/transfusion_config.hpp b/perception/lidar_transfusion/include/lidar_transfusion/transfusion_config.hpp index 31976de56a9da..4b1262c545533 100644 --- a/perception/lidar_transfusion/include/lidar_transfusion/transfusion_config.hpp +++ b/perception/lidar_transfusion/include/lidar_transfusion/transfusion_config.hpp @@ -25,10 +25,13 @@ class TransfusionConfig { public: TransfusionConfig( - const std::vector & voxels_num, const std::vector & point_cloud_range, - const std::vector & voxel_size, const float circle_nms_dist_threshold, + const std::size_t cloud_capacity, const std::vector & voxels_num, + const std::vector & point_cloud_range, const std::vector & voxel_size, + const std::size_t num_proposals, const float circle_nms_dist_threshold, const std::vector & yaw_norm_thresholds, const float score_threshold) { + cloud_capacity_ = cloud_capacity; + if (voxels_num.size() == 3) { max_voxels_ = voxels_num[2]; @@ -61,6 +64,9 @@ class TransfusionConfig voxel_y_size_ = static_cast(voxel_size[1]); voxel_z_size_ = static_cast(voxel_size[2]); } + if (num_proposals > 0) { + num_proposals_ = num_proposals; + } if (score_threshold > 0.0) { score_threshold_ = score_threshold; } @@ -79,7 +85,7 @@ class TransfusionConfig } ///// INPUT PARAMETERS ///// - const std::size_t cloud_capacity_{1000000}; + std::size_t cloud_capacity_{}; ///// KERNEL PARAMETERS ///// const std::size_t threads_for_voxel_{256}; // threads number for a block const std::size_t points_per_voxel_{20}; @@ -107,7 +113,7 @@ class TransfusionConfig const std::size_t out_size_factor_{4}; const std::size_t max_num_points_per_pillar_{points_per_voxel_}; const std::size_t num_point_values_{4}; - const std::size_t num_proposals_{200}; + std::size_t num_proposals_{200}; // the number of 
feature maps for pillar scatter const std::size_t num_feature_scatter_{pillar_feature_size_}; // the score threshold for classification diff --git a/perception/lidar_transfusion/schema/transfusion.schema.json b/perception/lidar_transfusion/schema/transfusion.schema.json index 41d8d887236a8..0bbbdc888a261 100644 --- a/perception/lidar_transfusion/schema/transfusion.schema.json +++ b/perception/lidar_transfusion/schema/transfusion.schema.json @@ -21,6 +21,12 @@ "default": "fp16", "enum": ["fp16", "fp32"] }, + "cloud_capacity": { + "type": "integer", + "description": "Capacity of the point cloud buffer (should be set to at least the maximum theoretical number of points).", + "default": 2000000, + "minimum": 1 + }, "voxels_num": { "type": "array", "items": { diff --git a/perception/lidar_transfusion/src/lidar_transfusion_node.cpp b/perception/lidar_transfusion/src/lidar_transfusion_node.cpp index e3ea6b3780de8..a07e385208748 100644 --- a/perception/lidar_transfusion/src/lidar_transfusion_node.cpp +++ b/perception/lidar_transfusion/src/lidar_transfusion_node.cpp @@ -27,10 +27,14 @@ LidarTransfusionNode::LidarTransfusionNode(const rclcpp::NodeOptions & options) class_names_ = this->declare_parameter>("class_names", descriptor); const std::string trt_precision = this->declare_parameter("trt_precision", descriptor); + const std::size_t cloud_capacity = + this->declare_parameter("cloud_capacity", descriptor); const auto voxels_num = this->declare_parameter>("voxels_num", descriptor); const auto point_cloud_range = this->declare_parameter>("point_cloud_range", descriptor); const auto voxel_size = this->declare_parameter>("voxel_size", descriptor); + const std::size_t num_proposals = + this->declare_parameter("num_proposals", descriptor); const std::string onnx_path = this->declare_parameter("onnx_path", descriptor); const std::string engine_path = this->declare_parameter("engine_path", descriptor); @@ -73,8 +77,8 @@ LidarTransfusionNode::LidarTransfusionNode(const rclcpp::NodeOptions & options) DensificationParam densification_param( densification_world_frame_id, densification_num_past_frames); TransfusionConfig config( - voxels_num, point_cloud_range, voxel_size, circle_nms_dist_threshold, yaw_norm_thresholds, - score_threshold); + cloud_capacity, voxels_num, point_cloud_range, voxel_size, num_proposals, + circle_nms_dist_threshold, yaw_norm_thresholds, score_threshold); const auto allow_remapping_by_area_matrix = this->declare_parameter>("allow_remapping_by_area_matrix", descriptor);
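Aside: with these changes, cloud_capacity and num_proposals in lidar_transfusion are runtime parameters rather than compile-time constants: cloud_capacity always comes from the parameter file (replacing the previous hard-coded 1000000), while num_proposals keeps its built-in default of 200 unless a positive value is supplied. A minimal sketch of that guarded-override pattern, using a hypothetical struct name:

#include <cstddef>

// Hypothetical illustration of the override pattern used by TransfusionConfig above.
struct DetectorCapacityConfig
{
  DetectorCapacityConfig(const std::size_t cloud_capacity, const std::size_t num_proposals)
  {
    cloud_capacity_ = cloud_capacity;  // always taken from the parameter file
    if (num_proposals > 0) {
      num_proposals_ = num_proposals;  // zero (or an omitted parameter) keeps the default
    }
  }

  std::size_t cloud_capacity_{};
  std::size_t num_proposals_{200};
};

Usage: DetectorCapacityConfig{2000000, 500} overrides num_proposals_ to 500, while DetectorCapacityConfig{2000000, 0} keeps the default of 200.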