From a7a4e37089fa139144766ac655fb6c76eb84aee6 Mon Sep 17 00:00:00 2001
From: sunjiahao1999 <578431509@qq.com>
Date: Tue, 5 Dec 2023 17:53:18 +0800
Subject: [PATCH] refactor init

---
 .gitignore                                    |   1 +
 configs/_base_/datasets/waymoD5-3d-3class.py  |  17 +-
 configs/_base_/datasets/waymoD5-3d-car.py     |  15 +-
 mmdet3d/datasets/det3d_dataset.py             |  17 +-
 mmdet3d/datasets/waymo_dataset.py             | 117 ++-
 mmdet3d/engine/hooks/visualization_hook.py    |   4 +-
 .../waymo_utils/prediction_to_waymo.py        | 367 ++-------
 mmdet3d/evaluation/metrics/waymo_metric.py    | 634 ++++------------
 ...-attn_4xb4-cyclic-20e_waymoD5-3d-3class.py |  13 +-
 tools/create_data.py                          | 134 ++--
 tools/create_data.sh                          |   7 +-
 .../dataset_converters/create_gt_database.py  |  28 +-
 tools/dataset_converters/waymo_converter.py   | 696 ++++++++++--------
 13 files changed, 802 insertions(+), 1248 deletions(-)

diff --git a/.gitignore b/.gitignore
index 27cb9c7cb4..2fefc6a904 100644
--- a/.gitignore
+++ b/.gitignore
@@ -134,3 +134,4 @@ data/sunrgbd/OFFICIAL_SUNRGBD/
 # Waymo evaluation
 mmdet3d/evaluation/functional/waymo_utils/compute_detection_metrics_main
 mmdet3d/evaluation/functional/waymo_utils/compute_detection_let_metrics_main
+mmdet3d/evaluation/functional/waymo_utils/compute_segmentation_metrics_main
diff --git a/configs/_base_/datasets/waymoD5-3d-3class.py b/configs/_base_/datasets/waymoD5-3d-3class.py
index e5240b629e..f8f14998d2 100644
--- a/configs/_base_/datasets/waymoD5-3d-3class.py
+++ b/configs/_base_/datasets/waymoD5-3d-3class.py
@@ -89,7 +89,10 @@
             dict(
                 type='PointsRangeFilter', point_cloud_range=point_cloud_range)
         ]),
-    dict(type='Pack3DDetInputs', keys=['points'])
+    dict(
+        type='Pack3DDetInputs',
+        keys=['points'],
+        meta_keys=['box_type_3d', 'sample_idx', 'context_name', 'timestamp'])
 ]
 # construct a pipeline for data and gt loading in show function
 # please keep its loading function consistent with test_pipeline (e.g. client)
@@ -100,7 +103,10 @@
         load_dim=6,
         use_dim=5,
         backend_args=backend_args),
-    dict(type='Pack3DDetInputs', keys=['points']),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['points'],
+        meta_keys=['box_type_3d', 'sample_idx', 'context_name', 'timestamp'])
 ]

 train_dataloader = dict(
@@ -164,12 +170,7 @@
             backend_args=backend_args))

 val_evaluator = dict(
-    type='WaymoMetric',
-    ann_file='./data/waymo/kitti_format/waymo_infos_val.pkl',
-    waymo_bin_file='./data/waymo/waymo_format/gt.bin',
-    data_root='./data/waymo/waymo_format',
-    backend_args=backend_args,
-    convert_kitti_format=False)
+    type='WaymoMetric', waymo_bin_file='./data/waymo/waymo_format/gt.bin')
 test_evaluator = val_evaluator

 vis_backends = [dict(type='LocalVisBackend')]
diff --git a/configs/_base_/datasets/waymoD5-3d-car.py b/configs/_base_/datasets/waymoD5-3d-car.py
index f95ac1d817..972e9289be 100644
--- a/configs/_base_/datasets/waymoD5-3d-car.py
+++ b/configs/_base_/datasets/waymoD5-3d-car.py
@@ -62,7 +62,8 @@
     dict(type='PointShuffle'),
     dict(
         type='Pack3DDetInputs',
-        keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+        keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'],
+        meta_keys=['box_type_3d', 'sample_idx', 'context_name', 'timestamp'])
 ]
 test_pipeline = [
     dict(
@@ -86,7 +87,10 @@
             dict(
                 type='PointsRangeFilter', point_cloud_range=point_cloud_range)
         ]),
-    dict(type='Pack3DDetInputs', keys=['points'])
+    dict(
+        type='Pack3DDetInputs',
+        keys=['points'],
+        meta_keys=['box_type_3d', 'sample_idx', 'context_name', 'timestamp'])
 ]
 # construct a pipeline for data and gt loading in show function
 # please keep its loading function consistent with test_pipeline (e.g. client)
@@ -161,12 +165,7 @@
             backend_args=backend_args))

 val_evaluator = dict(
-    type='WaymoMetric',
-    ann_file='./data/waymo/kitti_format/waymo_infos_val.pkl',
-    waymo_bin_file='./data/waymo/waymo_format/gt.bin',
-    data_root='./data/waymo/waymo_format',
-    convert_kitti_format=False,
-    backend_args=backend_args)
+    type='WaymoMetric', waymo_bin_file='./data/waymo/waymo_format/gt.bin')
 test_evaluator = val_evaluator

 vis_backends = [dict(type='LocalVisBackend')]
diff --git a/mmdet3d/datasets/det3d_dataset.py b/mmdet3d/datasets/det3d_dataset.py
index 11caae4729..c701a893fd 100644
--- a/mmdet3d/datasets/det3d_dataset.py
+++ b/mmdet3d/datasets/det3d_dataset.py
@@ -113,7 +113,7 @@ def __init__(self,
                     ori_label = self.METAINFO['classes'].index(name)
                     self.label_mapping[ori_label] = label_idx

-            self.num_ins_per_cat = {name: 0 for name in metainfo['classes']}
+            self.num_ins_per_cat = [0] * len(metainfo['classes'])
         else:
             self.label_mapping = {
                 i: i
@@ -121,10 +121,7 @@ def __init__(self,
             }
             self.label_mapping[-1] = -1

-            self.num_ins_per_cat = {
-                name: 0
-                for name in self.METAINFO['classes']
-            }
+            self.num_ins_per_cat = [0] * len(self.METAINFO['classes'])

         super().__init__(
             ann_file=ann_file,
@@ -146,9 +143,12 @@ def __init__(self,

         # show statistics of this dataset
         print_log('-' * 30, 'current')
-        print_log(f'The length of the dataset: {len(self)}', 'current')
+        print_log(
+            f'The length of {"test" if self.test_mode else "training"} dataset: {len(self)}',  # noqa: E501
+            'current')
         content_show = [['category', 'number']]
-        for cat_name, num in self.num_ins_per_cat.items():
+        for label, num in enumerate(self.num_ins_per_cat):
+            cat_name = self.metainfo['classes'][label]
             content_show.append([cat_name, num])
         table = AsciiTable(content_show)
         print_log(
@@ -256,8 +256,7 @@ def parse_ann_info(self, info: dict) -> Union[dict, None]:

         for label in ann_info['gt_labels_3d']:
             if label != -1:
-                cat_name = self.metainfo['classes'][label]
-                self.num_ins_per_cat[cat_name] += 1
+                self.num_ins_per_cat[label] += 1

         return ann_info

diff --git a/mmdet3d/datasets/waymo_dataset.py b/mmdet3d/datasets/waymo_dataset.py
index 5b3a83824e..cda27e42e5 100644
--- a/mmdet3d/datasets/waymo_dataset.py
+++ b/mmdet3d/datasets/waymo_dataset.py
@@ -3,9 +3,11 @@
 from typing import Callable, List, Union

 import numpy as np
+from mmengine import print_log
+from mmengine.fileio import load

 from mmdet3d.registry import DATASETS
-from mmdet3d.structures import CameraInstance3DBoxes
+from mmdet3d.structures import CameraInstance3DBoxes, LiDARInstance3DBoxes
 from .det3d_dataset import Det3DDataset
 from .kitti_dataset import KittiDataset

@@ -163,13 +165,10 @@ def parse_ann_info(self, info: dict) -> dict:
             centers_2d = np.zeros((0, 2), dtype=np.float32)
             depths = np.zeros((0), dtype=np.float32)

-        # in waymo, lidar2cam = R0_rect @ Tr_velo_to_cam
-        # convert gt_bboxes_3d to velodyne coordinates with `lidar2cam`
-        lidar2cam = np.array(info['images'][self.default_cam_key]['lidar2cam'])
-        gt_bboxes_3d = CameraInstance3DBoxes(
-            ann_info['gt_bboxes_3d']).convert_to(self.box_mode_3d,
-                                                 np.linalg.inv(lidar2cam))
-        ann_info['gt_bboxes_3d'] = gt_bboxes_3d
+        if self.load_type == 'frame_based':
+            gt_bboxes_3d = LiDARInstance3DBoxes(ann_info['gt_bboxes_3d'])
+        else:
+            gt_bboxes_3d = CameraInstance3DBoxes(ann_info['gt_bboxes_3d'])

         anns_results = dict(
             gt_bboxes_3d=gt_bboxes_3d,
@@ -182,9 +181,58 @@ def parse_ann_info(self, info: dict) -> dict:
         return anns_results

     def load_data_list(self) -> List[dict]:
-        """Add the load interval."""
-        data_list = super().load_data_list()
-        data_list = data_list[::self.load_interval]
+        """Add the load interval.
+
+        Returns:
+            list[dict]: A list of annotations.
+        """  # noqa: E501
+        # `self.ann_file` denotes the absolute annotation file path if
+        # `self.root=None` or relative path if `self.root=/path/to/data/`.
+        annotations = load(self.ann_file)
+        if not isinstance(annotations, dict):
+            raise TypeError(f'The annotations loaded from annotation file '
+                            f'should be a dict, but got {type(annotations)}!')
+        if 'data_list' not in annotations or 'metainfo' not in annotations:
+            raise ValueError('Annotation must have data_list and metainfo '
+                             'keys')
+        metainfo = annotations['metainfo']
+        raw_data_list = annotations['data_list']
+        raw_data_list = raw_data_list[::self.load_interval]
+        if self.load_interval > 1:
+            print_log(
+                f'Sample size will be reduced to 1/{self.load_interval} of'
+                ' the original data sample',
+                logger='current')
+
+        # Meta information loaded from the annotation file will not influence
+        # the existing meta information loaded from `BaseDataset.METAINFO`
+        # and the `metainfo` argument defined in the constructor.
+        for k, v in metainfo.items():
+            self._metainfo.setdefault(k, v)
+
+        # load and parse data_infos.
+        data_list = []
+        for raw_data_info in raw_data_list:
+            # parse raw data information to target format
+            data_info = self.parse_data_info(raw_data_info)
+            if isinstance(data_info, dict):
+                # For image tasks, `data_info` should contain the information
+                # of a single image, such as dict(img_path='xxx', width=360, ...)
+                data_list.append(data_info)
+            elif isinstance(data_info, list):
+                # For video tasks, `data_info` could contain image
+                # information of multiple frames, such as
+                # [dict(video_path='xxx', timestamps=...),
+                #  dict(video_path='xxx', timestamps=...)]
+                for item in data_info:
+                    if not isinstance(item, dict):
+                        raise TypeError('data_info must be list of dict, but '
+                                        f'got {type(item)}')
+                data_list.extend(data_info)
+            else:
+                raise TypeError('data_info should be a dict or list of dict, '
+                                f'but got {type(data_info)}')
+
+        return data_list

     def parse_data_info(self, info: dict) -> Union[dict, List[dict]]:
@@ -203,44 +251,39 @@ def parse_data_info(self, info: dict) -> Union[dict, List[dict]]:
                 info['images'][self.default_cam_key]
             info['images'] = new_image_info
             info['instances'] = info['cam_instances'][self.default_cam_key]
-            return super().parse_data_info(info)
+            return Det3DDataset.parse_data_info(self, info)
         else:
             # in the mono3d, the instances is from cam sync.
+ # Convert frame-based infos to multi-view image-based data_list = [] - if self.modality['use_lidar']: - info['lidar_points']['lidar_path'] = \ - osp.join( - self.data_prefix.get('pts', ''), - info['lidar_points']['lidar_path']) - - if self.modality['use_camera']: - for cam_key, img_info in info['images'].items(): - if 'img_path' in img_info: - cam_prefix = self.data_prefix.get(cam_key, '') - img_info['img_path'] = osp.join( - cam_prefix, img_info['img_path']) - for (cam_key, img_info) in info['images'].items(): camera_info = dict() + camera_info['sample_idx'] = info['sample_idx'] + camera_info['timestamp'] = info['timestamp'] + camera_info['context_name'] = info['context_name'] camera_info['images'] = dict() camera_info['images'][cam_key] = img_info - if 'cam_instances' in info \ - and cam_key in info['cam_instances']: - camera_info['instances'] = info['cam_instances'][cam_key] + if 'img_path' in img_info: + cam_prefix = self.data_prefix.get(cam_key, '') + camera_info['images'][cam_key]['img_path'] = osp.join( + cam_prefix, img_info['img_path']) + if 'lidar2cam' in img_info: + camera_info['lidar2cam'] = np.array(img_info['lidar2cam']) + if 'cam2img' in img_info: + camera_info['cam2img'] = np.array(img_info['cam2img']) + if 'lidar2img' in img_info: + camera_info['lidar2img'] = np.array(img_info['lidar2img']) else: - camera_info['instances'] = [] - camera_info['ego2global'] = info['ego2global'] - if 'image_sweeps' in info: - camera_info['image_sweeps'] = info['image_sweeps'] - - # TODO check if need to modify the sample id - # TODO check when will use it except for evaluation. - camera_info['sample_idx'] = info['sample_idx'] + camera_info['lidar2img'] = camera_info[ + 'cam2img'] @ camera_info['lidar2cam'] if not self.test_mode: # used in training + camera_info['instances'] = info['cam_instances'][cam_key] camera_info['ann_info'] = self.parse_ann_info(camera_info) if self.test_mode and self.load_eval_anns: - info['eval_ann_info'] = self.parse_ann_info(info) + camera_info['instances'] = info['cam_instances'][cam_key] + camera_info['eval_ann_info'] = self.parse_ann_info( + camera_info) data_list.append(camera_info) return data_list diff --git a/mmdet3d/engine/hooks/visualization_hook.py b/mmdet3d/engine/hooks/visualization_hook.py index ffec1addc3..9de46d9692 100644 --- a/mmdet3d/engine/hooks/visualization_hook.py +++ b/mmdet3d/engine/hooks/visualization_hook.py @@ -78,11 +78,11 @@ def __init__(self, 'needs to be excluded.') self.vis_task = vis_task - if wait_time == -1: + if show and wait_time == -1: print_log( 'Manual control mode, press [Right] to next sample.', logger='current') - else: + elif show: print_log( 'Autoplay mode, press [SPACE] to pause.', logger='current') self.wait_time = wait_time diff --git a/mmdet3d/evaluation/functional/waymo_utils/prediction_to_waymo.py b/mmdet3d/evaluation/functional/waymo_utils/prediction_to_waymo.py index b9da8043d2..c1729e7b89 100644 --- a/mmdet3d/evaluation/functional/waymo_utils/prediction_to_waymo.py +++ b/mmdet3d/evaluation/functional/waymo_utils/prediction_to_waymo.py @@ -4,7 +4,6 @@ """ try: - from waymo_open_dataset import dataset_pb2 as open_dataset from waymo_open_dataset import label_pb2 from waymo_open_dataset.protos import metrics_pb2 from waymo_open_dataset.protos.metrics_pb2 import Objects @@ -14,13 +13,10 @@ 'Please run "pip install waymo-open-dataset-tf-2-1-0==1.2.0" ' 'to install the official devkit first.') -from glob import glob -from os.path import join -from typing import List, Optional +from typing import List import mmengine 
-import numpy as np
-import tensorflow as tf
+from mmengine import print_log


 class Prediction2Waymo(object):
@@ -32,54 +28,22 @@ class Prediction2Waymo(object):

     Args:
         results (list[dict]): Prediction results.
-        waymo_tfrecords_dir (str): Directory to load waymo raw data.
         waymo_results_save_dir (str): Directory to save converted predictions
             in waymo format (.bin files).
         waymo_results_final_path (str): Path to save combined
             predictions in waymo format (.bin file), like 'a/b/c.bin'.
-        prefix (str): Prefix of filename. In general, 0 for training, 1 for
-            validation and 2 for testing.
-        classes (dict): A list of class name.
-        workers (str): Number of parallel processes. Defaults to 2.
-        backend_args (dict, optional): Arguments to instantiate the
-            corresponding backend. Defaults to None.
-        from_kitti_format (bool, optional): Whether the reuslts are kitti
-            format. Defaults to False.
-        idx2metainfo (Optional[dict], optional): The mapping from sample_idx to
-            metainfo. The metainfo must contain the keys: 'idx2contextname' and
-            'idx2timestamp'. Defaults to None.
+        num_workers (int): Number of parallel processes. Defaults to 4.
     """

     def __init__(self,
                  results: List[dict],
-                 waymo_tfrecords_dir: str,
-                 waymo_results_save_dir: str,
                  waymo_results_final_path: str,
-                 prefix: str,
                  classes: dict,
-                 workers: int = 2,
-                 backend_args: Optional[dict] = None,
-                 from_kitti_format: bool = False,
-                 idx2metainfo: Optional[dict] = None):
-
+                 num_workers: int = 4):
         self.results = results
-        self.waymo_tfrecords_dir = waymo_tfrecords_dir
-        self.waymo_results_save_dir = waymo_results_save_dir
         self.waymo_results_final_path = waymo_results_final_path
-        self.prefix = prefix
         self.classes = classes
-        self.workers = int(workers)
-        self.backend_args = backend_args
-        self.from_kitti_format = from_kitti_format
-        if idx2metainfo is not None:
-            self.idx2metainfo = idx2metainfo
-            # If ``fast_eval``, the metainfo does not need to be read from
-            # original data online. It's preprocessed offline.
-            self.fast_eval = True
-        else:
-            self.fast_eval = False
-
-        self.name2idx = {}
+        self.num_workers = num_workers

         self.k2w_cls_map = {
             'Car': label_pb2.Label.TYPE_VEHICLE,
@@ -88,193 +52,7 @@ def __init__(self,
             'Cyclist': label_pb2.Label.TYPE_CYCLIST,
         }

-        if self.from_kitti_format:
-            self.T_ref_to_front_cam = np.array([[0.0, 0.0, 1.0, 0.0],
-                                                [-1.0, 0.0, 0.0, 0.0],
-                                                [0.0, -1.0, 0.0, 0.0],
-                                                [0.0, 0.0, 0.0, 1.0]])
-            # ``sample_idx`` of the sample in kitti-format is an array
-            for idx, result in enumerate(results):
-                if len(result['sample_idx']) > 0:
-                    self.name2idx[str(result['sample_idx'][0])] = idx
-        else:
-            # ``sample_idx`` of the sample in the original prediction
-            # is an int value.
- for idx, result in enumerate(results): - self.name2idx[str(result['sample_idx'])] = idx - - if not self.fast_eval: - # need to read original '.tfrecord' file - self.get_file_names() - # turn on eager execution for older tensorflow versions - if int(tf.__version__.split('.')[0]) < 2: - tf.enable_eager_execution() - - self.create_folder() - - def get_file_names(self): - """Get file names of waymo raw data.""" - if 'path_mapping' in self.backend_args: - for path in self.backend_args['path_mapping'].keys(): - if path in self.waymo_tfrecords_dir: - self.waymo_tfrecords_dir = \ - self.waymo_tfrecords_dir.replace( - path, self.backend_args['path_mapping'][path]) - from petrel_client.client import Client - client = Client() - contents = client.list(self.waymo_tfrecords_dir) - self.waymo_tfrecord_pathnames = list() - for content in sorted(list(contents)): - if content.endswith('tfrecord'): - self.waymo_tfrecord_pathnames.append( - join(self.waymo_tfrecords_dir, content)) - else: - self.waymo_tfrecord_pathnames = sorted( - glob(join(self.waymo_tfrecords_dir, '*.tfrecord'))) - print(len(self.waymo_tfrecord_pathnames), 'tfrecords found.') - - def create_folder(self): - """Create folder for data conversion.""" - mmengine.mkdir_or_exist(self.waymo_results_save_dir) - - def parse_objects(self, kitti_result, T_k2w, context_name, - frame_timestamp_micros): - """Parse one prediction with several instances in kitti format and - convert them to `Object` proto. - - Args: - kitti_result (dict): Predictions in kitti format. - - - name (np.ndarray): Class labels of predictions. - - dimensions (np.ndarray): Height, width, length of boxes. - - location (np.ndarray): Bottom center of boxes (x, y, z). - - rotation_y (np.ndarray): Orientation of boxes. - - score (np.ndarray): Scores of predictions. - T_k2w (np.ndarray): Transformation matrix from kitti to waymo. - context_name (str): Context name of the frame. - frame_timestamp_micros (int): Frame timestamp. - - Returns: - :obj:`Object`: Predictions in waymo dataset Object proto. - """ - - def parse_one_object(instance_idx): - """Parse one instance in kitti format and convert them to `Object` - proto. - - Args: - instance_idx (int): Index of the instance to be converted. - - Returns: - :obj:`Object`: Predicted instance in waymo dataset - Object proto. 
- """ - cls = kitti_result['name'][instance_idx] - length = round(kitti_result['dimensions'][instance_idx, 0], 4) - height = round(kitti_result['dimensions'][instance_idx, 1], 4) - width = round(kitti_result['dimensions'][instance_idx, 2], 4) - x = round(kitti_result['location'][instance_idx, 0], 4) - y = round(kitti_result['location'][instance_idx, 1], 4) - z = round(kitti_result['location'][instance_idx, 2], 4) - rotation_y = round(kitti_result['rotation_y'][instance_idx], 4) - score = round(kitti_result['score'][instance_idx], 4) - - # y: downwards; move box origin from bottom center (kitti) to - # true center (waymo) - y -= height / 2 - # frame transformation: kitti -> waymo - x, y, z = self.transform(T_k2w, x, y, z) - - # different conventions - heading = -(rotation_y + np.pi / 2) - while heading < -np.pi: - heading += 2 * np.pi - while heading > np.pi: - heading -= 2 * np.pi - - box = label_pb2.Label.Box() - box.center_x = x - box.center_y = y - box.center_z = z - box.length = length - box.width = width - box.height = height - box.heading = heading - - o = metrics_pb2.Object() - o.object.box.CopyFrom(box) - o.object.type = self.k2w_cls_map[cls] - o.score = score - - o.context_name = context_name - o.frame_timestamp_micros = frame_timestamp_micros - - return o - - objects = metrics_pb2.Objects() - - for instance_idx in range(len(kitti_result['name'])): - o = parse_one_object(instance_idx) - objects.objects.append(o) - - return objects - - def convert_one(self, file_idx): - """Convert action for single file. - - Args: - file_idx (int): Index of the file to be converted. - """ - file_pathname = self.waymo_tfrecord_pathnames[file_idx] - if 's3://' in file_pathname and tf.__version__ >= '2.6.0': - try: - import tensorflow_io as tfio # noqa: F401 - except ImportError: - raise ImportError( - "Please run 'pip install tensorflow-io' to install tensorflow_io first." # noqa: E501 - ) - file_data = tf.data.TFRecordDataset(file_pathname, compression_type='') - - for frame_num, frame_data in enumerate(file_data): - frame = open_dataset.Frame() - frame.ParseFromString(bytearray(frame_data.numpy())) - - filename = f'{self.prefix}{file_idx:03d}{frame_num:03d}' - - context_name = frame.context.name - frame_timestamp_micros = frame.timestamp_micros - - if filename in self.name2idx: - if self.from_kitti_format: - for camera in frame.context.camera_calibrations: - # FRONT = 1, see dataset.proto for details - if camera.name == 1: - T_front_cam_to_vehicle = np.array( - camera.extrinsic.transform).reshape(4, 4) - - T_k2w = T_front_cam_to_vehicle @ self.T_ref_to_front_cam - - kitti_result = \ - self.results[self.name2idx[filename]] - objects = self.parse_objects(kitti_result, T_k2w, - context_name, - frame_timestamp_micros) - else: - index = self.name2idx[filename] - objects = self.parse_objects_from_origin( - self.results[index], context_name, - frame_timestamp_micros) - - else: - print(filename, 'not found.') - objects = metrics_pb2.Objects() - - with open( - join(self.waymo_results_save_dir, f'{filename}.bin'), - 'wb') as f: - f.write(objects.SerializeToString()) - - def convert_one_fast(self, res_index: int): + def convert_one(self, res_index: int): """Convert action for single file. It read the metainfo from the preprocessed file offline and will be faster. @@ -282,19 +60,16 @@ def convert_one_fast(self, res_index: int): res_index (int): The indices of the results. 
""" sample_idx = self.results[res_index]['sample_idx'] - if len(self.results[res_index]['pred_instances_3d']) > 0: + if len(self.results[res_index]['labels_3d']) > 0: objects = self.parse_objects_from_origin( self.results[res_index], - self.idx2metainfo[str(sample_idx)]['contextname'], - self.idx2metainfo[str(sample_idx)]['timestamp']) + self.results[res_index]['context_name'], + self.results[res_index]['timestamp']) else: print(sample_idx, 'not found.') objects = metrics_pb2.Objects() - with open( - join(self.waymo_results_save_dir, f'{sample_idx}.bin'), - 'wb') as f: - f.write(objects.SerializeToString()) + return objects def parse_objects_from_origin(self, result: dict, contextname: str, timestamp: str) -> Objects: @@ -308,112 +83,56 @@ def parse_objects_from_origin(self, result: dict, contextname: str, Returns: metrics_pb2.Objects: The parsed object. """ - lidar_boxes = result['pred_instances_3d']['bboxes_3d'].tensor - scores = result['pred_instances_3d']['scores_3d'] - labels = result['pred_instances_3d']['labels_3d'] - - def parse_one_object(index): - class_name = self.classes[labels[index].item()] + lidar_boxes = result['bboxes_3d'] + scores = result['scores_3d'] + labels = result['labels_3d'] + objects = metrics_pb2.Objects() + for lidar_box, score, label in zip(lidar_boxes, scores, labels): + # Parse one object box = label_pb2.Label.Box() - height = lidar_boxes[index][5].item() - heading = lidar_boxes[index][6].item() - - while heading < -np.pi: - heading += 2 * np.pi - while heading > np.pi: - heading -= 2 * np.pi - - box.center_x = lidar_boxes[index][0].item() - box.center_y = lidar_boxes[index][1].item() - box.center_z = lidar_boxes[index][2].item() + height / 2 - box.length = lidar_boxes[index][3].item() - box.width = lidar_boxes[index][4].item() + height = lidar_box[5] + heading = lidar_box[6] + + box.center_x = lidar_box[0] + box.center_y = lidar_box[1] + box.center_z = lidar_box[2] + height / 2 + box.length = lidar_box[3] + box.width = lidar_box[4] box.height = height box.heading = heading - o = metrics_pb2.Object() - o.object.box.CopyFrom(box) - o.object.type = self.k2w_cls_map[class_name] - o.score = scores[index].item() - o.context_name = contextname - o.frame_timestamp_micros = timestamp + object = metrics_pb2.Object() + object.object.box.CopyFrom(box) - return o - - objects = metrics_pb2.Objects() - for i in range(len(lidar_boxes)): - objects.objects.append(parse_one_object(i)) + class_name = self.classes[label] + object.object.type = self.k2w_cls_map[class_name] + object.score = score + object.context_name = contextname + object.frame_timestamp_micros = timestamp + objects.objects.append(object) return objects def convert(self): """Convert action.""" - print('Start converting ...') - convert_func = self.convert_one_fast if self.fast_eval else \ - self.convert_one + print_log('Start converting ...', logger='current') - # from torch.multiprocessing import set_sharing_strategy - # # Force using "file_system" sharing strategy for stability - # set_sharing_strategy("file_system") + # TODO: use parallel processes. + # objects_list = mmengine.track_parallel_progress( + # self.convert_one, range(len(self)), self.num_workers) - # mmengine.track_parallel_progress(convert_func, range(len(self)), - # self.workers) + objects_list = mmengine.track_progress(self.convert_one, + range(len(self))) - # TODO: Support multiprocessing. Now, multiprocessing evaluation will - # cause shared memory error in torch-1.10 and torch-1.11. 
Details can - # be seen in https://github.com/pytorch/pytorch/issues/67864. - prog_bar = mmengine.ProgressBar(len(self)) - for i in range(len(self)): - convert_func(i) - prog_bar.update() - - print('\nFinished ...') - - # combine all files into one .bin - pathnames = sorted(glob(join(self.waymo_results_save_dir, '*.bin'))) - combined = self.combine(pathnames) + combined = metrics_pb2.Objects() + for objects in objects_list: + for o in objects.objects: + combined.objects.append(o) with open(self.waymo_results_final_path, 'wb') as f: f.write(combined.SerializeToString()) def __len__(self): """Length of the filename list.""" - return len(self.results) if self.fast_eval else len( - self.waymo_tfrecord_pathnames) - - def transform(self, T, x, y, z): - """Transform the coordinates with matrix T. - - Args: - T (np.ndarray): Transformation matrix. - x(float): Coordinate in x axis. - y(float): Coordinate in y axis. - z(float): Coordinate in z axis. - - Returns: - list: Coordinates after transformation. - """ - pt_bef = np.array([x, y, z, 1.0]).reshape(4, 1) - pt_aft = np.matmul(T, pt_bef) - return pt_aft[:3].flatten().tolist() - - def combine(self, pathnames): - """Combine predictions in waymo format for each sample together. - - Args: - pathnames (str): Paths to save predictions. - - Returns: - :obj:`Objects`: Combined predictions in Objects proto. - """ - combined = metrics_pb2.Objects() - - for pathname in pathnames: - objects = metrics_pb2.Objects() - with open(pathname, 'rb') as f: - objects.ParseFromString(f.read()) - for o in objects.objects: - combined.objects.append(o) - - return combined + return len(self.results) diff --git a/mmdet3d/evaluation/metrics/waymo_metric.py b/mmdet3d/evaluation/metrics/waymo_metric.py index 0dd69a5c24..41fe429ba8 100644 --- a/mmdet3d/evaluation/metrics/waymo_metric.py +++ b/mmdet3d/evaluation/metrics/waymo_metric.py @@ -1,54 +1,29 @@ # Copyright (c) OpenMMLab. All rights reserved. import tempfile from os import path as osp -from typing import Dict, List, Optional, Tuple, Union +from typing import Dict, List, Optional, Sequence, Tuple, Union -import mmengine import numpy as np import torch -from mmengine import Config, load +from mmengine import Config +from mmengine.evaluator import BaseMetric from mmengine.logging import MMLogger, print_log from mmdet3d.models.layers import box3d_multiclass_nms from mmdet3d.registry import METRICS from mmdet3d.structures import (Box3DMode, CameraInstance3DBoxes, - LiDARInstance3DBoxes, bbox3d2result, - points_cam2img, xywhr2xyxyr) -from .kitti_metric import KittiMetric + LiDARInstance3DBoxes, points_cam2img, + xywhr2xyxyr) @METRICS.register_module() -class WaymoMetric(KittiMetric): +class WaymoMetric(BaseMetric): """Waymo evaluation metric. Args: - ann_file (str): The path of the annotation file in kitti format. waymo_bin_file (str): The path of the annotation file in waymo format. - data_root (str): Path of dataset root. Used for storing waymo - evaluation programs. - split (str): The split of the evaluation set. Defaults to 'training'. metric (str or List[str]): Metrics to be evaluated. Defaults to 'mAP'. - pcd_limit_range (List[float]): The range of point cloud used to filter - invalid predicted boxes. Defaults to [-85, -85, -5, 85, 85, 5]. - convert_kitti_format (bool): Whether to convert the results to kitti - format. Now, in order to be compatible with camera-based methods, - defaults to True. 
-        prefix (str, optional): The prefix that will be added in the metric
-            names to disambiguate homonymous metrics of different evaluators.
-            If prefix is not provided in the argument, self.default_prefix will
-            be used instead. Defaults to None.
-        format_only (bool): Format the output results without perform
-            evaluation. It is useful when you want to format the result to a
-            specific format and submit it to the test server.
-            Defaults to False.
-        pklfile_prefix (str, optional): The prefix of pkl files, including the
-            file path and the prefix of filename, e.g., "a/b/prefix". If not
-            specified, a temp file will be created. Defaults to None.
-        submission_prefix (str, optional): The prefix of submission data. If
-            not specified, the submission data will not be generated.
-            Defaults to None.
         load_type (str): Type of loading mode during training.
-
             - 'frame_based': Load all of the instances in the frame.
             - 'mv_image_based': Load all of the instances in the frame and need
                 to convert to the FOV-based data type to support image-based
                 detector.
             - 'fov_image_based': Only load the instances inside the default cam
                 and need to convert to the FOV-based data type to support image-
                 based detector.
-        default_cam_key (str): The default camera for lidar to camera
-            conversion. By default, KITTI: 'CAM2', Waymo: 'CAM_FRONT'.
-            Defaults to 'CAM_FRONT'.
-        use_pred_sample_idx (bool): In formating results, use the sample index
-            from the prediction or from the load annotations. By default,
-            KITTI: True, Waymo: False, Waymo has a conversion process, which
-            needs to use the sample idx from load annotation.
-            Defaults to False.
-        collect_device (str): Device name used for collecting results from
-            different ranks during distributed training. Must be 'cpu' or
-            'gpu'. Defaults to 'cpu'.
-        backend_args (dict, optional): Arguments to instantiate the
-            corresponding backend. Defaults to None.
-        idx2metainfo (str, optional): The file path of the metainfo in waymo.
-            It stores the mapping from sample_idx to metainfo. The metainfo
-            must contain the keys: 'idx2contextname' and 'idx2timestamp'.
+        result_prefix (str, optional): The prefix of result '*.bin' file,
+            including the file path and the prefix of filename, e.g.,
+            "a/b/prefix". If not specified, a temp file will be created.
             Defaults to None.
+        format_only (bool): Format the output results without performing
+            evaluation. It is useful when you want to format the result to a
+            specific format and submit it to the test server.
+            Defaults to False.
""" num_cams = 5 + default_prefix = 'Waymo metric' def __init__(self, - ann_file: str, waymo_bin_file: str, - data_root: str, - split: str = 'training', metric: Union[str, List[str]] = 'mAP', - pcd_limit_range: List[float] = [-85, -85, -5, 85, 85, 5], - convert_kitti_format: bool = True, - prefix: Optional[str] = None, - format_only: bool = False, - pklfile_prefix: Optional[str] = None, - submission_prefix: Optional[str] = None, load_type: str = 'frame_based', - default_cam_key: str = 'CAM_FRONT', - use_pred_sample_idx: bool = False, - collect_device: str = 'cpu', - backend_args: Optional[dict] = None, - idx2metainfo: Optional[str] = None) -> None: + result_prefix: Optional[str] = None, + format_only: bool = False, + **kwargs) -> None: + super().__init__(**kwargs) self.waymo_bin_file = waymo_bin_file - self.data_root = data_root - self.split = split + self.metrics = metric if isinstance(metric, list) else [metric] self.load_type = load_type - self.use_pred_sample_idx = use_pred_sample_idx - self.convert_kitti_format = convert_kitti_format - - if idx2metainfo is not None: - self.idx2metainfo = mmengine.load(idx2metainfo) - else: - self.idx2metainfo = None - - super(WaymoMetric, self).__init__( - ann_file=ann_file, - metric=metric, - pcd_limit_range=pcd_limit_range, - prefix=prefix, - pklfile_prefix=pklfile_prefix, - submission_prefix=submission_prefix, - default_cam_key=default_cam_key, - collect_device=collect_device, - backend_args=backend_args) self.format_only = format_only + self.result_prefix = result_prefix if self.format_only: - assert pklfile_prefix is not None, 'pklfile_prefix must be not ' + assert result_prefix is not None, 'result_prefix must be not ' 'None when format_only is True, otherwise the result files will ' 'be saved to a temp directory which will be cleaned up at the end.' - self.default_prefix = 'Waymo metric' + def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None: + """Process one batch of data samples and predictions. + + The processed results should be stored in ``self.results``, which will + be used to compute the metrics when all batches have been processed. + + Args: + data_batch (dict): A batch of data from the dataloader. + data_samples (Sequence[dict]): A batch of outputs from the model. + """ + + for data_sample in data_samples: + result = dict() + bboxes_3d = data_sample['pred_instances_3d']['bboxes_3d'] + bboxes_3d.limit_yaw(offset=0.5, period=np.pi * 2) + scores_3d = data_sample['pred_instances_3d']['scores_3d'] + labels_3d = data_sample['pred_instances_3d']['labels_3d'] + # TODO: check lidar post-processing + if isinstance(bboxes_3d, CameraInstance3DBoxes): + box_corners = bboxes_3d.corners + cam2img = box_corners.new_tensor( + np.array(data_sample['cam2img'])) + box_corners_in_image = points_cam2img(box_corners, cam2img) + # box_corners_in_image: [N, 8, 2] + minxy = torch.min(box_corners_in_image, dim=1)[0] + maxxy = torch.max(box_corners_in_image, dim=1)[0] + # check minxy & maxxy + # if the projected 2d bbox has intersection + # with the image, we keep it, otherwise, we omit it. 
+                img_shape = data_sample['img_shape']
+                valid_inds = ((minxy[:, 0] < img_shape[1]) &
+                              (minxy[:, 1] < img_shape[0]) & (maxxy[:, 0] > 0)
+                              & (maxxy[:, 1] > 0))
+
+                if valid_inds.sum() > 0:
+                    lidar2cam = data_sample['lidar2cam']
+                    bboxes_3d = bboxes_3d.convert_to(
+                        Box3DMode.LIDAR,
+                        np.linalg.inv(lidar2cam),
+                        correct_yaw=True)
+                    bboxes_3d = bboxes_3d[valid_inds]
+                    scores_3d = scores_3d[valid_inds]
+                    labels_3d = labels_3d[valid_inds]
+                else:
+                    bboxes_3d = LiDARInstance3DBoxes(torch.zeros([0, 7]))
+                    scores_3d = torch.zeros([0])
+                    labels_3d = torch.zeros([0])
+            result['bboxes_3d'] = bboxes_3d.tensor.cpu().numpy()
+            result['scores_3d'] = scores_3d.cpu().numpy()
+            result['labels_3d'] = labels_3d.cpu().numpy()
+            result['sample_idx'] = data_sample['sample_idx']
+            result['context_name'] = data_sample['context_name']
+            result['timestamp'] = data_sample['timestamp']
+            self.results.append(result)

     def compute_metrics(self, results: List[dict]) -> Dict[str, float]:
         """Compute the metrics from processed results.
@@ -137,80 +129,49 @@ def compute_metrics(self, results: List[dict]) -> Dict[str, float]:
         logger: MMLogger = MMLogger.get_current_instance()
         self.classes = self.dataset_meta['classes']

-        # load annotations
-        self.data_infos = load(self.ann_file)['data_list']
-        assert len(results) == len(self.data_infos), \
-            'invalid list length of network outputs'
         # different from kitti, waymo do not need to convert the ann file
         # handle the mv_image_based load_mode
         if self.load_type == 'mv_image_based':
-            new_data_infos = []
-            for info in self.data_infos:
-                height = info['images'][self.default_cam_key]['height']
-                width = info['images'][self.default_cam_key]['width']
-                for (cam_key, img_info) in info['images'].items():
-                    camera_info = dict()
-                    camera_info['images'] = dict()
-                    camera_info['images'][cam_key] = img_info
-                    # TODO remove the check by updating the data info;
-                    if 'height' not in img_info:
-                        img_info['height'] = height
-                        img_info['width'] = width
-                    if 'cam_instances' in info \
-                            and cam_key in info['cam_instances']:
-                        camera_info['instances'] = info['cam_instances'][
-                            cam_key]
-                    else:
-                        camera_info['instances'] = []
-                    camera_info['ego2global'] = info['ego2global']
-                    if 'image_sweeps' in info:
-                        camera_info['image_sweeps'] = info['image_sweeps']
-
-                    # TODO check if need to modify the sample idx
-                    # TODO check when will use it except for evaluation.
-                    camera_info['sample_idx'] = info['sample_idx']
-                    new_data_infos.append(camera_info)
-            self.data_infos = new_data_infos
-
-        if self.pklfile_prefix is None:
+            assert len(results) % 5 == 0, (
+                'The multi-view image-based results must be 5x frame-based.')
+            frame_results = [
+                results[i:i + 5] for i in range(0, len(results), 5)
+            ]
+            results = self.merge_multi_view_boxes(frame_results)
+
+        if self.result_prefix is None:
             eval_tmp_dir = tempfile.TemporaryDirectory()
-            pklfile_prefix = osp.join(eval_tmp_dir.name, 'results')
+            result_prefix = osp.join(eval_tmp_dir.name, 'results')
         else:
             eval_tmp_dir = None
-            pklfile_prefix = self.pklfile_prefix
+            result_prefix = self.result_prefix

-        result_dict, tmp_dir = self.format_results(
-            results,
-            pklfile_prefix=pklfile_prefix,
-            submission_prefix=self.submission_prefix,
-            classes=self.classes)
+        self.format_results(results, result_prefix=result_prefix)

         metric_dict = {}

         if self.format_only:
             logger.info('results are saved in '
-                        f'{osp.dirname(self.pklfile_prefix)}')
+                        f'{osp.dirname(self.result_prefix)}')
             return metric_dict

         for metric in self.metrics:
             ap_dict = self.waymo_evaluate(
-                pklfile_prefix, metric=metric, logger=logger)
+                result_prefix, metric=metric, logger=logger)
             metric_dict.update(ap_dict)
         if eval_tmp_dir is not None:
             eval_tmp_dir.cleanup()

-        if tmp_dir is not None:
-            tmp_dir.cleanup()
-
         return metric_dict

     def waymo_evaluate(self,
-                       pklfile_prefix: str,
+                       result_prefix: str,
                        metric: Optional[str] = None,
                        logger: Optional[MMLogger] = None) -> Dict[str, float]:
         """Evaluation in Waymo protocol.

         Args:
-            pklfile_prefix (str): The location that stored the prediction
+            result_prefix (str): The location that stored the prediction
                 results.
             metric (str, optional): Metric to be evaluated. Defaults to None.
             logger (MMLogger, optional): Logger used for printing related
@@ -224,7 +185,7 @@ def waymo_evaluate,
         if metric == 'mAP':
             eval_str = 'mmdet3d/evaluation/functional/waymo_utils/' + \
-                f'compute_detection_metrics_main {pklfile_prefix}.bin ' + \
+                f'compute_detection_metrics_main {result_prefix}.bin ' + \
                 f'{self.waymo_bin_file}'
             print(eval_str)
             ret_bytes = subprocess.check_output(eval_str, shell=True)
@@ -275,7 +236,7 @@ def waymo_evaluate,
                             ap_dict['Cyclist/L2 mAPH']) / 3
         elif metric == 'LET_mAP':
             eval_str = 'mmdet3d/evaluation/functional/waymo_utils/' + \
-                f'compute_detection_let_metrics_main {pklfile_prefix}.bin ' + \
+                f'compute_detection_let_metrics_main {result_prefix}.bin ' + \
                 f'{self.waymo_bin_file}'

             print(eval_str)
@@ -325,76 +286,26 @@ def waymo_evaluate,
     def format_results(
         self,
         results: List[dict],
-        pklfile_prefix: Optional[str] = None,
-        submission_prefix: Optional[str] = None,
-        classes: Optional[List[str]] = None
+        result_prefix: Optional[str] = None
     ) -> Tuple[dict, Union[tempfile.TemporaryDirectory, None]]:
         """Format the results to bin file.

         Args:
             results (List[dict]): Testing results of the dataset.
-            pklfile_prefix (str, optional): The prefix of pkl files. It
+            result_prefix (str, optional): The prefix of result file. It
                 includes the file path and the prefix of filename, e.g.,
                 "a/b/prefix". If not specified, a temp file will be created.
                 Defaults to None.
-            submission_prefix (str, optional): The prefix of submitted files.
-                It includes the file path and the prefix of filename, e.g.,
-                "a/b/prefix". If not specified, a temp file will be created.
-                Defaults to None.
-            classes (List[str], optional): A list of class name.
-                Defaults to None.
-
-        Returns:
-            tuple: (result_dict, tmp_dir), result_dict is a dict containing the
-                formatted result, tmp_dir is the temporal directory created for
-                saving json files when jsonfile_prefix is not specified.
""" - waymo_save_tmp_dir = tempfile.TemporaryDirectory() - waymo_results_save_dir = waymo_save_tmp_dir.name - waymo_results_final_path = f'{pklfile_prefix}.bin' - - if self.convert_kitti_format: - results_kitti_format, tmp_dir = super().format_results( - results, pklfile_prefix, submission_prefix, classes) - final_results = results_kitti_format['pred_instances_3d'] - else: - final_results = results - for i, res in enumerate(final_results): - # Actually, `sample_idx` here is the filename without suffix. - # It's for identitying the sample in formating. - res['sample_idx'] = self.data_infos[i]['sample_idx'] - res['pred_instances_3d']['bboxes_3d'].limit_yaw( - offset=0.5, period=np.pi * 2) - - waymo_root = self.data_root - if self.split == 'training': - waymo_tfrecords_dir = osp.join(waymo_root, 'validation') - prefix = '1' - elif self.split == 'testing': - waymo_tfrecords_dir = osp.join(waymo_root, 'testing') - prefix = '2' - else: - raise ValueError('Not supported split value.') + waymo_results_final_path = f'{result_prefix}.bin' from ..functional.waymo_utils.prediction_to_waymo import \ Prediction2Waymo - converter = Prediction2Waymo( - final_results, - waymo_tfrecords_dir, - waymo_results_save_dir, - waymo_results_final_path, - prefix, - classes, - backend_args=self.backend_args, - from_kitti_format=self.convert_kitti_format, - idx2metainfo=self.idx2metainfo) + converter = Prediction2Waymo(results, waymo_results_final_path, + self.classes) converter.convert() - waymo_save_tmp_dir.cleanup() - - return final_results, waymo_save_tmp_dir - def merge_multi_view_boxes(self, box_dict_per_frame: List[dict], - cam0_info: dict) -> dict: + def merge_multi_view_boxes(self, frame_results) -> dict: """Merge bounding boxes predicted from multi-view images. Args: @@ -403,308 +314,49 @@ def merge_multi_view_boxes(self, box_dict_per_frame: List[dict], cam0_info (dict): Store the sample idx for the given frame. Returns: - dict: Merged results. 
- """ - box_dict = dict() - # convert list[dict] to dict[list] - for key in box_dict_per_frame[0].keys(): - box_dict[key] = list() - for cam_idx in range(self.num_cams): - box_dict[key].append(box_dict_per_frame[cam_idx][key]) - # merge each elements - box_dict['sample_idx'] = cam0_info['image_id'] - for key in ['bbox', 'box3d_lidar', 'scores', 'label_preds']: - box_dict[key] = np.concatenate(box_dict[key]) - - # apply nms to box3d_lidar (box3d_camera are in different systems) - # TODO: move this global setting into config - nms_cfg = dict( - use_rotate_nms=True, - nms_across_levels=False, - nms_pre=500, - nms_thr=0.05, - score_thr=0.001, - min_bbox_size=0, - max_per_frame=100) - nms_cfg = Config(nms_cfg) - lidar_boxes3d = LiDARInstance3DBoxes( - torch.from_numpy(box_dict['box3d_lidar']).cuda()) - scores = torch.from_numpy(box_dict['scores']).cuda() - labels = torch.from_numpy(box_dict['label_preds']).long().cuda() - nms_scores = scores.new_zeros(scores.shape[0], len(self.classes) + 1) - indices = labels.new_tensor(list(range(scores.shape[0]))) - nms_scores[indices, labels] = scores - lidar_boxes3d_for_nms = xywhr2xyxyr(lidar_boxes3d.bev) - boxes3d = lidar_boxes3d.tensor - # generate attr scores from attr labels - boxes3d, scores, labels = box3d_multiclass_nms( - boxes3d, lidar_boxes3d_for_nms, nms_scores, nms_cfg.score_thr, - nms_cfg.max_per_frame, nms_cfg) - lidar_boxes3d = LiDARInstance3DBoxes(boxes3d) - det = bbox3d2result(lidar_boxes3d, scores, labels) - box_preds_lidar = det['bboxes_3d'] - scores = det['scores_3d'] - labels = det['labels_3d'] - # box_preds_camera is in the cam0 system - lidar2cam = cam0_info['images'][self.default_cam_key]['lidar2img'] - lidar2cam = np.array(lidar2cam).astype(np.float32) - box_preds_camera = box_preds_lidar.convert_to( - Box3DMode.CAM, lidar2cam, correct_yaw=True) - # Note: bbox is meaningless in final evaluation, set to 0 - merged_box_dict = dict( - bbox=np.zeros([box_preds_lidar.tensor.shape[0], 4]), - box3d_camera=box_preds_camera.numpy(), - box3d_lidar=box_preds_lidar.numpy(), - scores=scores.numpy(), - label_preds=labels.numpy(), - sample_idx=box_dict['sample_idx'], - ) - return merged_box_dict - - def bbox2result_kitti( - self, - net_outputs: List[dict], - sample_idx_list: List[int], - class_names: List[str], - pklfile_prefix: Optional[str] = None, - submission_prefix: Optional[str] = None) -> List[dict]: - """Convert 3D detection results to kitti format for evaluation and test - submission. - - Args: - net_outputs (List[dict]): List of dict storing the inferenced - bounding boxes and scores. - sample_idx_list (List[int]): List of input sample idx. - class_names (List[str]): A list of class names. - pklfile_prefix (str, optional): The prefix of pkl file. - Defaults to None. - submission_prefix (str, optional): The prefix of submission file. - Defaults to None. - - Returns: - List[dict]: A list of dictionaries with the kitti format. + Dict: Merged results. """ - if submission_prefix is not None: - mmengine.mkdir_or_exist(submission_prefix) - - det_annos = [] - print('\nConverting prediction to KITTI format') - for idx, pred_dicts in enumerate( - mmengine.track_iter_progress(net_outputs)): - sample_idx = sample_idx_list[idx] - info = self.data_infos[sample_idx] - - if self.load_type == 'mv_image_based': - if idx % self.num_cams == 0: - box_dict_per_frame = [] - cam0_key = list(info['images'].keys())[0] - cam0_info = info - # Here in mono3d, we use the 'CAM_FRONT' "the first - # index in the camera" as the default image shape. 
- # If you want to another camera, please modify it. - image_shape = (info['images'][cam0_key]['height'], - info['images'][cam0_key]['width']) - box_dict = self.convert_valid_bboxes(pred_dicts, info) - else: - box_dict = self.convert_valid_bboxes(pred_dicts, info) - # Here default used 'CAM_FRONT' to compute metric. - # If you want to use another camera, please modify it. - image_shape = (info['images'][self.default_cam_key]['height'], - info['images'][self.default_cam_key]['width']) - if self.load_type == 'mv_image_based': - box_dict_per_frame.append(box_dict) - if (idx + 1) % self.num_cams != 0: - continue - box_dict = self.merge_multi_view_boxes(box_dict_per_frame, - cam0_info) - - anno = { - 'name': [], - 'truncated': [], - 'occluded': [], - 'alpha': [], - 'bbox': [], - 'dimensions': [], - 'location': [], - 'rotation_y': [], - 'score': [] - } - if len(box_dict['bbox']) > 0: - box_2d_preds = box_dict['bbox'] - box_preds = box_dict['box3d_camera'] - scores = box_dict['scores'] - box_preds_lidar = box_dict['box3d_lidar'] - label_preds = box_dict['label_preds'] - - for box, box_lidar, bbox, score, label in zip( - box_preds, box_preds_lidar, box_2d_preds, scores, - label_preds): - bbox[2:] = np.minimum(bbox[2:], image_shape[::-1]) - bbox[:2] = np.maximum(bbox[:2], [0, 0]) - anno['name'].append(class_names[int(label)]) - anno['truncated'].append(0.0) - anno['occluded'].append(0) - anno['alpha'].append( - -np.arctan2(-box_lidar[1], box_lidar[0]) + box[6]) - anno['bbox'].append(bbox) - anno['dimensions'].append(box[3:6]) - anno['location'].append(box[:3]) - anno['rotation_y'].append(box[6]) - anno['score'].append(score) - - anno = {k: np.stack(v) for k, v in anno.items()} - else: - anno = { - 'name': np.array([]), - 'truncated': np.array([]), - 'occluded': np.array([]), - 'alpha': np.array([]), - 'bbox': np.zeros([0, 4]), - 'dimensions': np.zeros([0, 3]), - 'location': np.zeros([0, 3]), - 'rotation_y': np.array([]), - 'score': np.array([]), - } - - if submission_prefix is not None: - curr_file = f'{submission_prefix}/{sample_idx:06d}.txt' - with open(curr_file, 'w') as f: - bbox = anno['bbox'] - loc = anno['location'] - dims = anno['dimensions'] # lhw -> hwl - - for idx in range(len(bbox)): - print( - '{} -1 -1 {:.4f} {:.4f} {:.4f} {:.4f} ' - '{:.4f} {:.4f} {:.4f} ' - '{:.4f} {:.4f} {:.4f} {:.4f} {:.4f} {:.4f}'.format( - anno['name'][idx], anno['alpha'][idx], - bbox[idx][0], bbox[idx][1], bbox[idx][2], - bbox[idx][3], dims[idx][1], dims[idx][2], - dims[idx][0], loc[idx][0], loc[idx][1], - loc[idx][2], anno['rotation_y'][idx], - anno['score'][idx]), - file=f) - if self.use_pred_sample_idx: - save_sample_idx = sample_idx - else: - # use the sample idx in the info file - # In waymo validation sample_idx in prediction is 000xxx - # but in info file it is 1000xxx - save_sample_idx = box_dict['sample_idx'] - anno['sample_idx'] = np.array( - [save_sample_idx] * len(anno['score']), dtype=np.int64) - - det_annos.append(anno) - - if pklfile_prefix is not None: - if not pklfile_prefix.endswith(('.pkl', '.pickle')): - out = f'{pklfile_prefix}.pkl' - else: - out = pklfile_prefix - mmengine.dump(det_annos, out) - print(f'Result is saved to {out}.') - - return det_annos - - def convert_valid_bboxes(self, box_dict: dict, info: dict) -> dict: - """Convert the predicted boxes into valid ones. Should handle the - load_model (frame_based, mv_image_based, fov_image_based), separately. - - Args: - box_dict (dict): Box dictionaries to be converted. - - - bboxes_3d (:obj:`BaseInstance3DBoxes`): 3D bounding boxes. 
- - scores_3d (Tensor): Scores of boxes. - - labels_3d (Tensor): Class labels of boxes. - info (dict): Data info. - - Returns: - dict: Valid predicted boxes. - - - bbox (np.ndarray): 2D bounding boxes. - - box3d_camera (np.ndarray): 3D bounding boxes in camera - coordinate. - - box3d_lidar (np.ndarray): 3D bounding boxes in LiDAR coordinate. - - scores (np.ndarray): Scores of boxes. - - label_preds (np.ndarray): Class label predictions. - - sample_idx (int): Sample index. - """ - # TODO: refactor this function - box_preds = box_dict['bboxes_3d'] - scores = box_dict['scores_3d'] - labels = box_dict['labels_3d'] - sample_idx = info['sample_idx'] - box_preds.limit_yaw(offset=0.5, period=np.pi * 2) - - if len(box_preds) == 0: - return dict( - bbox=np.zeros([0, 4]), - box3d_camera=np.zeros([0, 7]), - box3d_lidar=np.zeros([0, 7]), - scores=np.zeros([0]), - label_preds=np.zeros([0, 4]), - sample_idx=sample_idx) - # Here default used 'CAM_FRONT' to compute metric. If you want to - # use another camera, please modify it. - if self.load_type in ['frame_based', 'fov_image_based']: - cam_key = self.default_cam_key - elif self.load_type == 'mv_image_based': - cam_key = list(info['images'].keys())[0] - else: - raise NotImplementedError - - lidar2cam = np.array(info['images'][cam_key]['lidar2cam']).astype( - np.float32) - P2 = np.array(info['images'][cam_key]['cam2img']).astype(np.float32) - img_shape = (info['images'][cam_key]['height'], - info['images'][cam_key]['width']) - P2 = box_preds.tensor.new_tensor(P2) - - if isinstance(box_preds, LiDARInstance3DBoxes): - box_preds_camera = box_preds.convert_to(Box3DMode.CAM, lidar2cam) - box_preds_lidar = box_preds - elif isinstance(box_preds, CameraInstance3DBoxes): - box_preds_camera = box_preds - box_preds_lidar = box_preds.convert_to(Box3DMode.LIDAR, - np.linalg.inv(lidar2cam)) - - box_corners = box_preds_camera.corners - box_corners_in_image = points_cam2img(box_corners, P2) - # box_corners_in_image: [N, 8, 2] - minxy = torch.min(box_corners_in_image, dim=1)[0] - maxxy = torch.max(box_corners_in_image, dim=1)[0] - box_2d_preds = torch.cat([minxy, maxxy], dim=1) - # Post-processing - # check box_preds_camera - image_shape = box_preds.tensor.new_tensor(img_shape) - valid_cam_inds = ((box_2d_preds[:, 0] < image_shape[1]) & - (box_2d_preds[:, 1] < image_shape[0]) & - (box_2d_preds[:, 2] > 0) & (box_2d_preds[:, 3] > 0)) - # check box_preds_lidar - if self.load_type in ['frame_based']: - limit_range = box_preds.tensor.new_tensor(self.pcd_limit_range) - valid_pcd_inds = ((box_preds_lidar.center > limit_range[:3]) & - (box_preds_lidar.center < limit_range[3:])) - valid_inds = valid_pcd_inds.all(-1) - elif self.load_type in ['mv_image_based', 'fov_image_based']: - valid_inds = valid_cam_inds - - if valid_inds.sum() > 0: - return dict( - bbox=box_2d_preds[valid_inds, :].numpy(), - pred_box_type_3d=type(box_preds), - box3d_camera=box_preds_camera[valid_inds].numpy(), - box3d_lidar=box_preds_lidar[valid_inds].numpy(), - scores=scores[valid_inds].numpy(), - label_preds=labels[valid_inds].numpy(), - sample_idx=sample_idx) - else: - return dict( - bbox=np.zeros([0, 4]), - pred_box_type_3d=type(box_preds), - box3d_camera=np.zeros([0, 7]), - box3d_lidar=np.zeros([0, 7]), - scores=np.zeros([0]), - label_preds=np.zeros([0]), - sample_idx=sample_idx) + merged_results = [] + for frame_result in frame_results: + merged_result = dict() + merged_result['sample_idx'] = frame_result[0]['sample_idx'] // 5 + merged_result['context_name'] = frame_result[0]['context_name'] + 
merged_result['timestamp'] = frame_result[0]['timestamp'] + bboxes_3d, scores_3d, labels_3d = [], [], [] + for result in frame_result: + assert result['timestamp'] == merged_result['timestamp'] + bboxes_3d.append(result['bboxes_3d']) + scores_3d.append(result['scores_3d']) + labels_3d.append(result['labels_3d']) + + bboxes_3d = np.concatenate(bboxes_3d) + scores_3d = np.concatenate(scores_3d) + labels_3d = np.concatenate(labels_3d) + nms_cfg = dict( + use_rotate_nms=True, + nms_across_levels=False, + nms_pre=500, + nms_thr=0.05, + score_thr=0.001, + min_bbox_size=0, + max_per_frame=100) + nms_cfg = Config(nms_cfg) + lidar_boxes3d = LiDARInstance3DBoxes( + torch.from_numpy(bboxes_3d).cuda()) + scores = torch.from_numpy(scores_3d).cuda() + labels = torch.from_numpy(labels_3d).long().cuda() + nms_scores = scores.new_zeros(scores.shape[0], + len(self.classes) + 1) + indices = labels.new_tensor(list(range(scores.shape[0]))) + nms_scores[indices, labels] = scores + lidar_boxes3d_for_nms = xywhr2xyxyr(lidar_boxes3d.bev) + boxes3d = lidar_boxes3d.tensor + bboxes_3d, scores_3d, labels_3d = box3d_multiclass_nms( + boxes3d, lidar_boxes3d_for_nms, nms_scores, nms_cfg.score_thr, + nms_cfg.max_per_frame, nms_cfg) + + merged_result['bboxes_3d'] = bboxes_3d.cpu().numpy() + merged_result['scores_3d'] = scores_3d.cpu().numpy() + merged_result['labels_3d'] = labels_3d.cpu().numpy() + merged_results.append(merged_result) + return merged_results diff --git a/projects/CenterFormer/configs/centerformer_voxel01_second-attn_secfpn-attn_4xb4-cyclic-20e_waymoD5-3d-3class.py b/projects/CenterFormer/configs/centerformer_voxel01_second-attn_secfpn-attn_4xb4-cyclic-20e_waymoD5-3d-3class.py index 14bcbb9296..5b207c7992 100644 --- a/projects/CenterFormer/configs/centerformer_voxel01_second-attn_secfpn-attn_4xb4-cyclic-20e_waymoD5-3d-3class.py +++ b/projects/CenterFormer/configs/centerformer_voxel01_second-attn_secfpn-attn_4xb4-cyclic-20e_waymoD5-3d-3class.py @@ -179,7 +179,10 @@ dict( type='PointsRangeFilter', point_cloud_range=point_cloud_range) ]), - dict(type='Pack3DDetInputs', keys=['points']) + dict( + type='Pack3DDetInputs', + keys=['points'], + meta_keys=['box_type_3d', 'sample_idx', 'context_name', 'timestamp']) ] dataset_type = 'WaymoDataset' @@ -223,13 +226,7 @@ test_dataloader = val_dataloader val_evaluator = dict( - type='WaymoMetric', - ann_file='./data/waymo/kitti_format/waymo_infos_val.pkl', - waymo_bin_file='./data/waymo/waymo_format/gt.bin', - data_root='./data/waymo/waymo_format', - backend_args=backend_args, - convert_kitti_format=False, - idx2metainfo='./data/waymo/waymo_format/idx2metainfo.pkl') + type='WaymoMetric', waymo_bin_file='./data/waymo/waymo_format/gt.bin') test_evaluator = val_evaluator vis_backends = [dict(type='LocalVisBackend')] diff --git a/tools/create_data.py b/tools/create_data.py index 34356c2a8f..384fa87127 100644 --- a/tools/create_data.py +++ b/tools/create_data.py @@ -2,6 +2,8 @@ import argparse from os import path as osp +from mmengine import print_log + from tools.dataset_converters import indoor_converter as indoor from tools.dataset_converters import kitti_converter as kitti from tools.dataset_converters import lyft_converter as lyft_converter @@ -171,8 +173,19 @@ def waymo_data_prep(root_path, version, out_dir, workers, - max_sweeps=5): - """Prepare the info file for waymo dataset. + max_sweeps=10, + only_gt_database=False, + skip_image_and_lidar=False, + skip_cam_instances_infos=False): + """Prepare waymo dataset. There are 3 steps as follows: + + Step 1. 
Extract camera images and lidar point clouds from waymo raw
+        data in '*.tfrecord' and save as kitti format.
+    Step 2. Generate waymo train/val/test infos and save as pickle file.
+    Step 3. Generate waymo ground truth database (point clouds within
+        each 3D bounding box) for data augmentation in training.
+    Steps 1 and 2 will be done in Waymo2KITTI, and step 3 will be done in
+    GTDatabaseCreater.

     Args:
         root_path (str): Path of dataset root.
@@ -180,44 +193,55 @@ def waymo_data_prep(root_path,
         out_dir (str): Output directory of the generated info file.
         workers (int): Number of threads to be used.
         max_sweeps (int, optional): Number of input consecutive frames.
-            Default: 5. Here we store pose information of these frames
-            for later use.
+            Defaults to 10. Here we store ego2global information of these
+            frames for later use.
+        only_gt_database (bool, optional): Whether to only generate ground
+            truth database. Defaults to False.
+        skip_image_and_lidar (bool, optional): Whether to skip saving
+            image and lidar. Defaults to False.
+        skip_cam_instances_infos (bool, optional): Whether to skip
+            gathering cam_instances infos in Step 2. Defaults to False.
     """
     from tools.dataset_converters import waymo_converter as waymo

-    splits = [
-        'training', 'validation', 'testing', 'testing_3d_camera_only_detection'
-    ]
-    for i, split in enumerate(splits):
-        load_dir = osp.join(root_path, 'waymo_format', split)
-        if split == 'validation':
-            save_dir = osp.join(out_dir, 'kitti_format', 'training')
-        else:
-            save_dir = osp.join(out_dir, 'kitti_format', split)
-        converter = waymo.Waymo2KITTI(
-            load_dir,
-            save_dir,
-            prefix=str(i),
-            workers=workers,
-            test_mode=(split
-                       in ['testing', 'testing_3d_camera_only_detection']))
-        converter.convert()
-
-    from tools.dataset_converters.waymo_converter import \
-        create_ImageSets_img_ids
-    create_ImageSets_img_ids(osp.join(out_dir, 'kitti_format'), splits)
-    # Generate waymo infos
+    if version == 'v1.4':
+        splits = [
+            'training', 'validation', 'testing',
+            'testing_3d_camera_only_detection'
+        ]
+    elif version == 'v1.4-mini':
+        splits = ['training', 'validation']
+    else:
+        raise NotImplementedError(f'Unsupported Waymo version {version}!')
     out_dir = osp.join(out_dir, 'kitti_format')
-    kitti.create_waymo_info_file(
-        out_dir, info_prefix, max_sweeps=max_sweeps, workers=workers)
-    info_train_path = osp.join(out_dir, f'{info_prefix}_infos_train.pkl')
-    info_val_path = osp.join(out_dir, f'{info_prefix}_infos_val.pkl')
-    info_trainval_path = osp.join(out_dir, f'{info_prefix}_infos_trainval.pkl')
-    info_test_path = osp.join(out_dir, f'{info_prefix}_infos_test.pkl')
-    update_pkl_infos('waymo', out_dir=out_dir, pkl_path=info_train_path)
-    update_pkl_infos('waymo', out_dir=out_dir, pkl_path=info_val_path)
-    update_pkl_infos('waymo', out_dir=out_dir, pkl_path=info_trainval_path)
-    update_pkl_infos('waymo', out_dir=out_dir, pkl_path=info_test_path)
+
+    if not only_gt_database:
+        for i, split in enumerate(splits):
+            load_dir = osp.join(root_path, 'waymo_format', split)
+            if split == 'validation':
+                save_dir = osp.join(out_dir, 'training')
+            else:
+                save_dir = osp.join(out_dir, split)
+            converter = waymo.Waymo2KITTI(
+                load_dir,
+                save_dir,
+                prefix=str(i),
+                workers=workers,
+                test_mode=(split
+                           in ['testing', 'testing_3d_camera_only_detection']),
+                info_prefix=info_prefix,
+                max_sweeps=max_sweeps,
+                split=split,
+                save_image_and_lidar=not skip_image_and_lidar,
+                save_cam_instances=not skip_cam_instances_infos)
+            converter.convert()
+            if split == 'validation':
 
 def semantickitti_data_prep(info_prefix, out_dir):
     """Prepare the info file for SemanticKITTI dataset.
@@ -274,12 +300,23 @@ def semantickitti_data_prep(info_prefix, out_dir):
 parser.add_argument(
     '--only-gt-database',
     action='store_true',
-    help='Whether to only generate ground truth database.')
+    help='''Whether to only generate ground truth database.
+        Only used when dataset is NuScenes or Waymo!''')
+parser.add_argument(
+    '--skip-cam_instances-infos',
+    action='store_true',
+    help='''Whether to skip gathering cam_instances infos.
+        Only used when dataset is Waymo!''')
+parser.add_argument(
+    '--skip-image-and-lidar',
+    action='store_true',
+    help='''Whether to skip saving image and lidar.
+        Only used when dataset is Waymo!''')
 args = parser.parse_args()
 
 if __name__ == '__main__':
-    from mmdet3d.utils import register_all_modules
-    register_all_modules()
+    from mmengine.registry import init_default_scope
+    init_default_scope('mmdet3d')
 
     if args.dataset == 'kitti':
         if args.only_gt_database:
@@ -334,6 +371,17 @@ def semantickitti_data_prep(info_prefix, out_dir):
             dataset_name='NuScenesDataset',
             out_dir=args.out_dir,
             max_sweeps=args.max_sweeps)
+    elif args.dataset == 'waymo':
+        waymo_data_prep(
+            root_path=args.root_path,
+            info_prefix=args.extra_tag,
+            version=args.version,
+            out_dir=args.out_dir,
+            workers=args.workers,
+            max_sweeps=args.max_sweeps,
+            only_gt_database=args.only_gt_database,
+            skip_image_and_lidar=args.skip_image_and_lidar,
+            skip_cam_instances_infos=args.skip_cam_instances_infos)
     elif args.dataset == 'lyft':
         train_version = f'{args.version}-train'
         lyft_data_prep(
@@ -347,14 +395,6 @@ def semantickitti_data_prep(info_prefix, out_dir):
             info_prefix=args.extra_tag,
             version=test_version,
             max_sweeps=args.max_sweeps)
-    elif args.dataset == 'waymo':
-        waymo_data_prep(
-            root_path=args.root_path,
-            info_prefix=args.extra_tag,
-            version=args.version,
-            out_dir=args.out_dir,
-            workers=args.workers,
-            max_sweeps=args.max_sweeps)
     elif args.dataset == 'scannet':
         scannet_data_prep(
             root_path=args.root_path,
diff --git a/tools/create_data.sh b/tools/create_data.sh
index 9a57852f71..0a1946585d 100755
--- a/tools/create_data.sh
+++ b/tools/create_data.sh
@@ -6,10 +6,11 @@ export PYTHONPATH=`pwd`:$PYTHONPATH
 PARTITION=$1
 JOB_NAME=$2
 DATASET=$3
+WORKERS=$4
 GPUS=${GPUS:-1}
 GPUS_PER_NODE=${GPUS_PER_NODE:-1}
 SRUN_ARGS=${SRUN_ARGS:-""}
-JOB_NAME=create_data
+PY_ARGS=${@:5}
 
 srun -p ${PARTITION} \
     --job-name=${JOB_NAME} \
@@ -21,4 +22,6 @@ srun -p ${PARTITION} \
     python -u tools/create_data.py ${DATASET} \
         --root-path ./data/${DATASET} \
         --out-dir ./data/${DATASET} \
-        --extra-tag ${DATASET}
+        --workers ${WORKERS} \
+        --extra-tag ${DATASET} \
+        ${PY_ARGS}
diff --git a/tools/dataset_converters/create_gt_database.py b/tools/dataset_converters/create_gt_database.py
index ae452eb543..fb84256fd8 100644
--- a/tools/dataset_converters/create_gt_database.py
+++ b/tools/dataset_converters/create_gt_database.py
@@ -7,7 +7,7 @@
 import numpy as np
 from mmcv.ops import roi_align
 from mmdet.evaluation import bbox_overlaps
-from mmengine import track_iter_progress
+from mmengine import print_log, track_iter_progress
 from pycocotools import mask as maskUtils
 from pycocotools.coco import COCO
 
@@ -504,7 +504,9 @@ def create_single(self, input_dict):
         return single_db_infos
 
     def create(self):
-        print(f'Create GT Database of {self.dataset_class_name}')
+        print_log(
+            f'Create GT Database of {self.dataset_class_name}',
+            logger='current')
         dataset_cfg = dict(
             type=self.dataset_class_name,
             data_root=self.data_path,
@@ -610,12 +612,19 @@ def loop_dataset(i):
             input_dict['box_mode_3d'] = self.dataset.box_mode_3d
             return input_dict
 
-        multi_db_infos = mmengine.track_parallel_progress(
-            self.create_single,
-            ((loop_dataset(i)
-              for i in range(len(self.dataset))), len(self.dataset)),
-            self.num_worker)
-        print('Make global unique group id')
+        if self.num_worker == 0:
+            multi_db_infos = mmengine.track_progress(
+                self.create_single,
+                ((loop_dataset(i)
+                  for i in range(len(self.dataset))), len(self.dataset)))
+        else:
+            multi_db_infos = mmengine.track_parallel_progress(
+                self.create_single,
+                ((loop_dataset(i)
+                  for i in range(len(self.dataset))), len(self.dataset)),
+                self.num_worker,
+                chunksize=1000)
+        print_log('Make global unique group id', logger='current')
         group_counter_offset = 0
         all_db_infos = dict()
         for single_db_infos in track_iter_progress(multi_db_infos):
@@ -630,7 +639,8 @@ def loop_dataset(i):
             group_counter_offset += (group_id + 1)
 
         for k, v in all_db_infos.items():
-            print(f'load {len(v)} {k} database infos')
+            print_log(f'load {len(v)} {k} database infos', logger='current')
+        print_log(f'Saving GT database infos into {self.db_info_save_path}')
 
         with open(self.db_info_save_path, 'wb') as f:
             pickle.dump(all_db_infos, f)
diff --git a/tools/dataset_converters/waymo_converter.py b/tools/dataset_converters/waymo_converter.py
index 87f9c54b54..00eba35daa 100644
--- a/tools/dataset_converters/waymo_converter.py
+++ b/tools/dataset_converters/waymo_converter.py
@@ -9,23 +9,33 @@
     raise ImportError('Please run "pip install waymo-open-dataset-tf-2-6-0" '
                       '>1.4.5 to install the official devkit first.')
 
+import copy
 import os
+import os.path as osp
 from glob import glob
+from io import BytesIO
 from os.path import exists, join
 
 import mmengine
 import numpy as np
 import tensorflow as tf
+from mmengine import print_log
+from nuscenes.utils.geometry_utils import view_points
+from PIL import Image
 from waymo_open_dataset.utils import range_image_utils, transform_utils
 from waymo_open_dataset.utils.frame_utils import \
     parse_range_image_and_camera_projection
 
+from mmdet3d.datasets.convert_utils import post_process_coords
+from mmdet3d.structures import Box3DMode, LiDARInstance3DBoxes, points_cam2img
+
 
 class Waymo2KITTI(object):
-    """Waymo to KITTI converter.
+    """Waymo to KITTI converter. There are 2 steps as follows:
 
-    This class serves as the converter to change the waymo raw data to KITTI
-    format.
+    Step 1. Extract camera images and lidar point clouds from waymo raw data
+        in '*.tfrecord' and save them in KITTI format.
+    Step 2. Generate waymo train/val/test infos and save them as pickle
+        files.
 
     Args:
         load_dir (str): Directory to load waymo raw data.
@@ -36,8 +46,16 @@ class Waymo2KITTI(object):
             Defaults to 64.
         test_mode (bool, optional): Whether in the test_mode.
             Defaults to False.
-        save_cam_sync_labels (bool, optional): Whether to save cam sync labels.
-            Defaults to True.
+        save_image_and_lidar (bool, optional): Whether to save image and lidar
+            data. Defaults to True.
+        save_cam_sync_instances (bool, optional): Whether to save cam sync
+            instances. Defaults to True.
+        save_cam_instances (bool, optional): Whether to save cam instances.
+            Defaults to True.
+ info_prefix (str, optional): Prefix of info filename. + Defaults to 'waymo'. + max_sweeps (int, optional): Max length of sweeps. Defaults to 10. + split (str, optional): Split of the data. Defaults to 'training'. """ def __init__(self, @@ -46,18 +64,12 @@ def __init__(self, prefix, workers=64, test_mode=False, - save_cam_sync_labels=True): - self.filter_empty_3dboxes = True - self.filter_no_label_zone_points = True - - self.selected_waymo_classes = ['VEHICLE', 'PEDESTRIAN', 'CYCLIST'] - - # Only data collected in specific locations will be converted - # If set None, this filter is disabled - # Available options: location_sf (main dataset) - self.selected_waymo_locations = None - self.save_track_id = False - + save_image_and_lidar=True, + save_cam_sync_instances=True, + save_cam_instances=True, + info_prefix='waymo', + max_sweeps=10, + split='training'): # turn on eager execution for older tensorflow versions if int(tf.__version__.split('.')[0]) < 2: tf.enable_eager_execution() @@ -74,12 +86,21 @@ def __init__(self, self.type_list = [ 'UNKNOWN', 'VEHICLE', 'PEDESTRIAN', 'SIGN', 'CYCLIST' ] - self.waymo_to_kitti_class_map = { - 'UNKNOWN': 'DontCare', - 'PEDESTRIAN': 'Pedestrian', - 'VEHICLE': 'Car', - 'CYCLIST': 'Cyclist', - 'SIGN': 'Sign' # not in kitti + + # MMDetection3D unified camera keys & class names + self.camera_types = [ + 'CAM_FRONT', + 'CAM_FRONT_LEFT', + 'CAM_FRONT_RIGHT', + 'CAM_SIDE_LEFT', + 'CAM_SIDE_RIGHT', + ] + self.selected_waymo_classes = ['VEHICLE', 'PEDESTRIAN', 'CYCLIST'] + self.info_map = { + 'training': '_infos_train.pkl', + 'validation': '_infos_val.pkl', + 'testing': '_infos_test.pkl', + 'testing_3d_camera_only_detection': '_infos_test_cam_only.pkl' } self.load_dir = load_dir @@ -87,61 +108,86 @@ def __init__(self, self.prefix = prefix self.workers = int(workers) self.test_mode = test_mode - self.save_cam_sync_labels = save_cam_sync_labels + self.save_image_and_lidar = save_image_and_lidar + self.save_cam_sync_instances = save_cam_sync_instances + self.save_cam_instances = save_cam_instances + self.info_prefix = info_prefix + self.max_sweeps = max_sweeps + self.split = split + + # TODO: Discuss filter_empty_3dboxes and filter_no_label_zone_points + self.filter_empty_3dboxes = True + self.filter_no_label_zone_points = True + self.save_track_id = False self.tfrecord_pathnames = sorted( glob(join(self.load_dir, '*.tfrecord'))) - self.label_save_dir = f'{self.save_dir}/label_' - self.label_all_save_dir = f'{self.save_dir}/label_all' self.image_save_dir = f'{self.save_dir}/image_' - self.calib_save_dir = f'{self.save_dir}/calib' self.point_cloud_save_dir = f'{self.save_dir}/velodyne' - self.pose_save_dir = f'{self.save_dir}/pose' - self.timestamp_save_dir = f'{self.save_dir}/timestamp' - if self.save_cam_sync_labels: - self.cam_sync_label_save_dir = f'{self.save_dir}/cam_sync_label_' - self.cam_sync_label_all_save_dir = \ - f'{self.save_dir}/cam_sync_label_all' - self.create_folder() + # Create folder for saving KITTI format camera images and + # lidar point clouds. 
+        if 'testing_3d_camera_only_detection' not in self.load_dir:
+            mmengine.mkdir_or_exist(self.point_cloud_save_dir)
+        for i in range(5):
+            mmengine.mkdir_or_exist(f'{self.image_save_dir}{str(i)}')
 
     def convert(self):
         """Convert action."""
-        print('Start converting ...')
-        mmengine.track_parallel_progress(self.convert_one, range(len(self)),
-                                         self.workers)
-        print('\nFinished ...')
+        print_log(f'Start converting {self.split} dataset', logger='current')
+        if self.workers == 0:
+            data_infos = mmengine.track_progress(self.convert_one,
+                                                 range(len(self)))
+        else:
+            data_infos = mmengine.track_parallel_progress(
+                self.convert_one, range(len(self)), self.workers)
+        data_list = []
+        for data_info in data_infos:
+            data_list.extend(data_info)
+        metainfo = dict()
+        metainfo['dataset'] = 'waymo'
+        metainfo['version'] = '1.4'
+        metainfo['info_version'] = '1.1'
+        waymo_infos = dict(data_list=data_list, metainfo=metainfo)
+        filenames = osp.join(
+            osp.dirname(self.save_dir),
+            f'{self.info_prefix + self.info_map[self.split]}')
+        print_log(f'Saving {self.split} dataset infos into {filenames}')
+        mmengine.dump(waymo_infos, filenames)
 
     def convert_one(self, file_idx):
-        """Convert action for single file.
+        """Convert one '*.tfrecord' file to KITTI format. Each file stores
+        all the frames (about 200 frames) in the current scene. We treat
+        each frame as a sample, save its images and point cloud in KITTI
+        format, and then create infos for all frames.
 
         Args:
             file_idx (int): Index of the file to be converted.
+
+        Returns:
+            file_infos (list): Waymo infos for all frames in the current
+                file.
         """
         pathname = self.tfrecord_pathnames[file_idx]
         dataset = tf.data.TFRecordDataset(pathname, compression_type='')
 
+        # NOTE: file_infos is not shared between processes, it only stores
+        # frame infos within the current file.
+        file_infos = []
         for frame_idx, data in enumerate(dataset):
             frame = dataset_pb2.Frame()
             frame.ParseFromString(bytearray(data.numpy()))
-            if (self.selected_waymo_locations is not None
-                    and frame.context.stats.location
-                    not in self.selected_waymo_locations):
-                continue
 
-            self.save_image(frame, file_idx, frame_idx)
-            self.save_calib(frame, file_idx, frame_idx)
-            self.save_lidar(frame, file_idx, frame_idx)
-            self.save_pose(frame, file_idx, frame_idx)
-            self.save_timestamp(frame, file_idx, frame_idx)
+            # Step 1.
+            if self.save_image_and_lidar:
+                self.save_image(frame, file_idx, frame_idx)
+                self.save_lidar(frame, file_idx, frame_idx)
 
-            if not self.test_mode:
-                # TODO save the depth image for waymo challenge solution.
-                self.save_label(frame, file_idx, frame_idx)
-                if self.save_cam_sync_labels:
-                    self.save_label(frame, file_idx, frame_idx, cam_sync=True)
+            # Step 2.
+            # TODO save the depth image for waymo challenge solution.
+            self.create_waymo_info_file(frame, file_idx, frame_idx,
+                                        file_infos)
+        return file_infos
 
     def __len__(self):
         """Length of the filename list."""
@@ -162,62 +208,6 @@ def save_image(self, frame, file_idx, frame_idx):
         with open(img_path, 'wb') as fp:
             fp.write(img.image)
 
-    def save_calib(self, frame, file_idx, frame_idx):
-        """Parse and save the calibration data.
-
-        Args:
-            frame (:obj:`Frame`): Open dataset frame proto.
-            file_idx (int): Current file index.
-            frame_idx (int): Current frame index.
- """ - # waymo front camera to kitti reference camera - T_front_cam_to_ref = np.array([[0.0, -1.0, 0.0], [0.0, 0.0, -1.0], - [1.0, 0.0, 0.0]]) - camera_calibs = [] - R0_rect = [f'{i:e}' for i in np.eye(3).flatten()] - Tr_velo_to_cams = [] - calib_context = '' - - for camera in frame.context.camera_calibrations: - # extrinsic parameters - T_cam_to_vehicle = np.array(camera.extrinsic.transform).reshape( - 4, 4) - T_vehicle_to_cam = np.linalg.inv(T_cam_to_vehicle) - Tr_velo_to_cam = \ - self.cart_to_homo(T_front_cam_to_ref) @ T_vehicle_to_cam - if camera.name == 1: # FRONT = 1, see dataset.proto for details - self.T_velo_to_front_cam = Tr_velo_to_cam.copy() - Tr_velo_to_cam = Tr_velo_to_cam[:3, :].reshape((12, )) - Tr_velo_to_cams.append([f'{i:e}' for i in Tr_velo_to_cam]) - - # intrinsic parameters - camera_calib = np.zeros((3, 4)) - camera_calib[0, 0] = camera.intrinsic[0] - camera_calib[1, 1] = camera.intrinsic[1] - camera_calib[0, 2] = camera.intrinsic[2] - camera_calib[1, 2] = camera.intrinsic[3] - camera_calib[2, 2] = 1 - camera_calib = list(camera_calib.reshape(12)) - camera_calib = [f'{i:e}' for i in camera_calib] - camera_calibs.append(camera_calib) - - # all camera ids are saved as id-1 in the result because - # camera 0 is unknown in the proto - for i in range(5): - calib_context += 'P' + str(i) + ': ' + \ - ' '.join(camera_calibs[i]) + '\n' - calib_context += 'R0_rect' + ': ' + ' '.join(R0_rect) + '\n' - for i in range(5): - calib_context += 'Tr_velo_to_cam_' + str(i) + ': ' + \ - ' '.join(Tr_velo_to_cams[i]) + '\n' - - with open( - f'{self.calib_save_dir}/{self.prefix}' + - f'{str(file_idx).zfill(3)}{str(frame_idx).zfill(3)}.txt', - 'w+') as fp_calib: - fp_calib.write(calib_context) - fp_calib.close() - def save_lidar(self, frame, file_idx, frame_idx): """Parse and save the lidar data in psd format. @@ -275,194 +265,6 @@ def save_lidar(self, frame, file_idx, frame_idx): f'{str(file_idx).zfill(3)}{str(frame_idx).zfill(3)}.bin' point_cloud.astype(np.float32).tofile(pc_path) - def save_label(self, frame, file_idx, frame_idx, cam_sync=False): - """Parse and save the label data in txt format. - The relation between waymo and kitti coordinates is noteworthy: - 1. x, y, z correspond to l, w, h (waymo) -> l, h, w (kitti) - 2. x-y-z: front-left-up (waymo) -> right-down-front(kitti) - 3. bbox origin at volumetric center (waymo) -> bottom center (kitti) - 4. rotation: +x around y-axis (kitti) -> +x around z-axis (waymo) - - Args: - frame (:obj:`Frame`): Open dataset frame proto. - file_idx (int): Current file index. - frame_idx (int): Current frame index. - cam_sync (bool, optional): Whether to save the cam sync labels. - Defaults to False. 
- """ - label_all_path = f'{self.label_all_save_dir}/{self.prefix}' + \ - f'{str(file_idx).zfill(3)}{str(frame_idx).zfill(3)}.txt' - if cam_sync: - label_all_path = label_all_path.replace('label_', - 'cam_sync_label_') - fp_label_all = open(label_all_path, 'w+') - id_to_bbox = dict() - id_to_name = dict() - for labels in frame.projected_lidar_labels: - name = labels.name - for label in labels.labels: - # TODO: need a workaround as bbox may not belong to front cam - bbox = [ - label.box.center_x - label.box.length / 2, - label.box.center_y - label.box.width / 2, - label.box.center_x + label.box.length / 2, - label.box.center_y + label.box.width / 2 - ] - id_to_bbox[label.id] = bbox - id_to_name[label.id] = name - 1 - - for obj in frame.laser_labels: - bounding_box = None - name = None - id = obj.id - for proj_cam in self.cam_list: - if id + proj_cam in id_to_bbox: - bounding_box = id_to_bbox.get(id + proj_cam) - name = str(id_to_name.get(id + proj_cam)) - break - - # NOTE: the 2D labels do not have strict correspondence with - # the projected 2D lidar labels - # e.g.: the projected 2D labels can be in camera 2 - # while the most_visible_camera can have id 4 - if cam_sync: - if obj.most_visible_camera_name: - name = str( - self.cam_list.index( - f'_{obj.most_visible_camera_name}')) - box3d = obj.camera_synced_box - else: - continue - else: - box3d = obj.box - - if bounding_box is None or name is None: - name = '0' - bounding_box = (0, 0, 0, 0) - - my_type = self.type_list[obj.type] - - if my_type not in self.selected_waymo_classes: - continue - - if self.filter_empty_3dboxes and obj.num_lidar_points_in_box < 1: - continue - - my_type = self.waymo_to_kitti_class_map[my_type] - - height = box3d.height - width = box3d.width - length = box3d.length - - x = box3d.center_x - y = box3d.center_y - z = box3d.center_z - height / 2 - - # project bounding box to the virtual reference frame - pt_ref = self.T_velo_to_front_cam @ \ - np.array([x, y, z, 1]).reshape((4, 1)) - x, y, z, _ = pt_ref.flatten().tolist() - - rotation_y = -box3d.heading - np.pi / 2 - track_id = obj.id - - # not available - truncated = 0 - occluded = 0 - alpha = -10 - - line = my_type + \ - ' {} {} {} {} {} {} {} {} {} {} {} {} {} {}\n'.format( - round(truncated, 2), occluded, round(alpha, 2), - round(bounding_box[0], 2), round(bounding_box[1], 2), - round(bounding_box[2], 2), round(bounding_box[3], 2), - round(height, 2), round(width, 2), round(length, 2), - round(x, 2), round(y, 2), round(z, 2), - round(rotation_y, 2)) - - if self.save_track_id: - line_all = line[:-1] + ' ' + name + ' ' + track_id + '\n' - else: - line_all = line[:-1] + ' ' + name + '\n' - - label_path = f'{self.label_save_dir}{name}/{self.prefix}' + \ - f'{str(file_idx).zfill(3)}{str(frame_idx).zfill(3)}.txt' - if cam_sync: - label_path = label_path.replace('label_', 'cam_sync_label_') - fp_label = open(label_path, 'a') - fp_label.write(line) - fp_label.close() - - fp_label_all.write(line_all) - - fp_label_all.close() - - def save_pose(self, frame, file_idx, frame_idx): - """Parse and save the pose data. - - Note that SDC's own pose is not included in the regular training - of KITTI dataset. KITTI raw dataset contains ego motion files - but are not often used. Pose is important for algorithms that - take advantage of the temporal information. - - Args: - frame (:obj:`Frame`): Open dataset frame proto. - file_idx (int): Current file index. - frame_idx (int): Current frame index. 
- """ - pose = np.array(frame.pose.transform).reshape(4, 4) - np.savetxt( - join(f'{self.pose_save_dir}/{self.prefix}' + - f'{str(file_idx).zfill(3)}{str(frame_idx).zfill(3)}.txt'), - pose) - - def save_timestamp(self, frame, file_idx, frame_idx): - """Save the timestamp data in a separate file instead of the - pointcloud. - - Note that SDC's own pose is not included in the regular training - of KITTI dataset. KITTI raw dataset contains ego motion files - but are not often used. Pose is important for algorithms that - take advantage of the temporal information. - - Args: - frame (:obj:`Frame`): Open dataset frame proto. - file_idx (int): Current file index. - frame_idx (int): Current frame index. - """ - with open( - join(f'{self.timestamp_save_dir}/{self.prefix}' + - f'{str(file_idx).zfill(3)}{str(frame_idx).zfill(3)}.txt'), - 'w') as f: - f.write(str(frame.timestamp_micros)) - - def create_folder(self): - """Create folder for data preprocessing.""" - if not self.test_mode: - dir_list1 = [ - self.label_all_save_dir, - self.calib_save_dir, - self.pose_save_dir, - self.timestamp_save_dir, - ] - dir_list2 = [self.label_save_dir, self.image_save_dir] - if self.save_cam_sync_labels: - dir_list1.append(self.cam_sync_label_all_save_dir) - dir_list2.append(self.cam_sync_label_save_dir) - else: - dir_list1 = [ - self.calib_save_dir, self.pose_save_dir, - self.timestamp_save_dir - ] - dir_list2 = [self.image_save_dir] - if 'testing_3d_camera_only_detection' not in self.load_dir: - dir_list1.append(self.point_cloud_save_dir) - for d in dir_list1: - mmengine.mkdir_or_exist(d) - for d in dir_list2: - for i in range(5): - mmengine.mkdir_or_exist(f'{d}{str(i)}') - def convert_range_image_to_point_cloud(self, frame, range_images, @@ -604,29 +406,317 @@ def cart_to_homo(self, mat): raise ValueError(mat.shape) return ret + def create_waymo_info_file(self, frame, file_idx, frame_idx, file_infos): + """Generate waymo train/val/test infos. 
+    def create_waymo_info_file(self, frame, file_idx, frame_idx, file_infos):
+        """Generate waymo train/val/test infos.
+
+        For more details about infos, please refer to:
+        https://mmdetection3d.readthedocs.io/en/latest/advanced_guides/datasets/waymo.html
+        """  # noqa: E501
+        frame_infos = dict()
+
+        # Gather frame infos
+        sample_idx = \
+            f'{self.prefix}{str(file_idx).zfill(3)}{str(frame_idx).zfill(3)}'
+        frame_infos['sample_idx'] = int(sample_idx)
+        frame_infos['timestamp'] = frame.timestamp_micros
+        frame_infos['ego2global'] = np.array(frame.pose.transform).reshape(
+            4, 4).astype(np.float32).tolist()
+        frame_infos['context_name'] = frame.context.name
+
+        # Gather camera infos
+        frame_infos['images'] = dict()
+        # waymo front camera to kitti reference camera
+        T_front_cam_to_ref = np.array([[0.0, -1.0, 0.0], [0.0, 0.0, -1.0],
+                                       [1.0, 0.0, 0.0]])
+        camera_calibs = []
+        Tr_velo_to_cams = []
+        for camera in frame.context.camera_calibrations:
+            # extrinsic parameters
+            T_cam_to_vehicle = np.array(camera.extrinsic.transform).reshape(
+                4, 4)
+            T_vehicle_to_cam = np.linalg.inv(T_cam_to_vehicle)
+            Tr_velo_to_cam = \
+                self.cart_to_homo(T_front_cam_to_ref) @ T_vehicle_to_cam
+            Tr_velo_to_cams.append(Tr_velo_to_cam)
+
+            # intrinsic parameters
+            camera_calib = np.zeros((3, 4))
+            camera_calib[0, 0] = camera.intrinsic[0]
+            camera_calib[1, 1] = camera.intrinsic[1]
+            camera_calib[0, 2] = camera.intrinsic[2]
+            camera_calib[1, 2] = camera.intrinsic[3]
+            camera_calib[2, 2] = 1
+            camera_calibs.append(camera_calib)
+
+        for i, (cam_key, camera_calib, Tr_velo_to_cam) in enumerate(
+                zip(self.camera_types, camera_calibs, Tr_velo_to_cams)):
+            cam_infos = dict()
+            cam_infos['img_path'] = str(sample_idx) + '.jpg'
+            # NOTE: frame.images is not guaranteed to follow the camera
+            # index order, so match on img.name explicitly.
+            for img in frame.images:
+                if img.name == i + 1:
+                    width, height = Image.open(BytesIO(img.image)).size
+                    cam_infos['height'] = height
+                    cam_infos['width'] = width
+            cam_infos['lidar2cam'] = Tr_velo_to_cam.astype(np.float32).tolist()
+            cam_infos['cam2img'] = camera_calib.astype(np.float32).tolist()
+            cam_infos['lidar2img'] = (camera_calib @ Tr_velo_to_cam).astype(
+                np.float32).tolist()
+            frame_infos['images'][cam_key] = cam_infos
+
+        # Gather lidar infos
+        lidar_infos = dict()
+        lidar_infos['lidar_path'] = str(sample_idx) + '.bin'
+        lidar_infos['num_pts_feats'] = 6
+        frame_infos['lidar_points'] = lidar_infos
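A sketch of the projection chain assembled above, with made-up numbers: cam2img is the 3x4 intrinsic matrix, lidar2cam the 4x4 extrinsic one, and lidar2img = cam2img @ lidar2cam maps homogeneous lidar points to pixels.

import numpy as np

cam2img = np.array([[2000.0, 0.0, 960.0, 0.0],
                    [0.0, 2000.0, 640.0, 0.0],
                    [0.0, 0.0, 1.0, 0.0]])
lidar2cam = np.eye(4)  # identity purely for illustration
lidar2img = cam2img @ lidar2cam  # shape (3, 4)

pt_lidar = np.array([2.0, 1.0, 10.0, 1.0])  # homogeneous lidar point
u, v, w = lidar2img @ pt_lidar
print(u / w, v / w)  # pixel coordinates after perspective division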
+
+        # Gather lidar sweeps and camera sweeps infos
+        # TODO: Add lidar2img in image sweeps infos when we need it.
+        # TODO: Consider merging lidar sweeps infos and image sweeps infos.
+        lidar_sweeps_infos, image_sweeps_infos = [], []
+        for prev_offset in range(-1, -self.max_sweeps - 1, -1):
+            prev_lidar_infos = dict()
+            prev_image_infos = dict()
+            if frame_idx + prev_offset >= 0:
+                prev_frame_infos = file_infos[prev_offset]
+                prev_lidar_infos['timestamp'] = prev_frame_infos['timestamp']
+                prev_lidar_infos['ego2global'] = prev_frame_infos['ego2global']
+                prev_lidar_infos['lidar_points'] = dict()
+                lidar_path = prev_frame_infos['lidar_points']['lidar_path']
+                prev_lidar_infos['lidar_points']['lidar_path'] = lidar_path
+                lidar_sweeps_infos.append(prev_lidar_infos)
+
+                prev_image_infos['timestamp'] = prev_frame_infos['timestamp']
+                prev_image_infos['ego2global'] = prev_frame_infos['ego2global']
+                prev_image_infos['images'] = dict()
+                for cam_key in self.camera_types:
+                    prev_image_infos['images'][cam_key] = dict()
+                    img_path = prev_frame_infos['images'][cam_key]['img_path']
+                    prev_image_infos['images'][cam_key]['img_path'] = img_path
+                image_sweeps_infos.append(prev_image_infos)
+        if lidar_sweeps_infos:
+            frame_infos['lidar_sweeps'] = lidar_sweeps_infos
+        if image_sweeps_infos:
+            frame_infos['image_sweeps'] = image_sweeps_infos
+
+        if not self.test_mode:
+            # Gather instances infos which are used for lidar-based
+            # 3D detection
+            frame_infos['instances'] = self.gather_instance_info(frame)
+            # Gather cam_sync_instances infos which are used for image-based
+            # (multi-view) 3D detection.
+            if self.save_cam_sync_instances:
+                frame_infos['cam_sync_instances'] = self.gather_instance_info(
+                    frame, cam_sync=True)
+            # Gather cam_instances infos which are used for image-based
+            # (monocular) 3D detection (optional).
+            # TODO: Should we use cam_sync_instances to generate
+            # cam_instances?
+            if self.save_cam_instances:
+                frame_infos['cam_instances'] = self.gather_cam_instance_info(
+                    copy.deepcopy(frame_infos['instances']),
+                    frame_infos['images'])
+        file_infos.append(frame_infos)
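Roughly the shape of one entry appended to file_infos above; every value here is a made-up placeholder, only the keys follow the converter code:

frame_infos = dict(
    sample_idx=1000042,  # int('<prefix><file_idx:03d><frame_idx:03d>')
    timestamp=1557855897452385,
    context_name='segment-xxxx',  # frame.context.name (placeholder)
    ego2global=[[1.0, 0.0, 0.0, 0.0],
                [0.0, 1.0, 0.0, 0.0],
                [0.0, 0.0, 1.0, 0.0],
                [0.0, 0.0, 0.0, 1.0]],  # 4x4 row-major pose
    images=dict(
        CAM_FRONT=dict(img_path='1000042.jpg', height=1280, width=1920)),
    lidar_points=dict(lidar_path='1000042.bin', num_pts_feats=6),
    instances=[],           # filled by gather_instance_info(frame)
    cam_sync_instances=[],  # filled with cam_sync=True
)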
+
+    def gather_instance_info(self, frame, cam_sync=False):
+        """Generate instances and cam_sync_instances infos.
+
+        For more details about infos, please refer to:
+        https://mmdetection3d.readthedocs.io/en/latest/advanced_guides/datasets/waymo.html
+        """  # noqa: E501
+        id_to_bbox = dict()
+        id_to_name = dict()
+        for labels in frame.projected_lidar_labels:
+            name = labels.name
+            for label in labels.labels:
+                # TODO: need a workaround as bbox may not belong to front cam
+                bbox = [
+                    label.box.center_x - label.box.length / 2,
+                    label.box.center_y - label.box.width / 2,
+                    label.box.center_x + label.box.length / 2,
+                    label.box.center_y + label.box.width / 2
+                ]
+                id_to_bbox[label.id] = bbox
+                id_to_name[label.id] = name - 1
+
+        group_id = 0
+        instance_infos = []
+        for obj in frame.laser_labels:
+            instance_info = dict()
+            bounding_box = None
+            name = None
+            id = obj.id
+            for proj_cam in self.cam_list:
+                if id + proj_cam in id_to_bbox:
+                    bounding_box = id_to_bbox.get(id + proj_cam)
+                    name = id_to_name.get(id + proj_cam)
+                    break
+
+            # NOTE: the 2D labels do not have strict correspondence with
+            # the projected 2D lidar labels
+            # e.g.: the projected 2D labels can be in camera 2
+            # while the most_visible_camera can have id 4
+            if cam_sync:
+                if obj.most_visible_camera_name:
+                    name = self.cam_list.index(
+                        f'_{obj.most_visible_camera_name}')
+                    box3d = obj.camera_synced_box
+                else:
+                    continue
+            else:
+                box3d = obj.box
+
+            if bounding_box is None or name is None:
+                name = 0
+                bounding_box = [0.0, 0.0, 0.0, 0.0]
+
+            my_type = self.type_list[obj.type]
+
+            if my_type not in self.selected_waymo_classes:
+                continue
+            else:
+                label = self.selected_waymo_classes.index(my_type)
+
+            if self.filter_empty_3dboxes and obj.num_lidar_points_in_box < 1:
+                continue
+
+            group_id += 1
+            instance_info['group_id'] = group_id
+            instance_info['camera_id'] = name
+            instance_info['bbox'] = bounding_box
+            instance_info['bbox_label'] = label
+
+            height = box3d.height
+            width = box3d.width
+            length = box3d.length
+
+            # NOTE: We save the bottom center of 3D bboxes.
+            x = box3d.center_x
+            y = box3d.center_y
+            z = box3d.center_z - height / 2
+
+            rotation_y = box3d.heading
+
+            instance_info['bbox_3d'] = np.array(
+                [x, y, z, length, width, height,
+                 rotation_y]).astype(np.float32).tolist()
+            instance_info['bbox_label_3d'] = label
+            instance_info['num_lidar_pts'] = obj.num_lidar_points_in_box
+
+            if self.save_track_id:
+                instance_info['track_id'] = obj.id
+            instance_infos.append(instance_info)
+        return instance_infos
+
+    def gather_cam_instance_info(self, instances: dict, images: dict):
+        """Generate cam_instances infos.
+
+        For more details about infos, please refer to:
+        https://mmdetection3d.readthedocs.io/en/latest/advanced_guides/datasets/waymo.html
+        """  # noqa: E501
+        cam_instances = dict()
+        for cam_type in self.camera_types:
+            lidar2cam = np.array(images[cam_type]['lidar2cam'])
+            cam2img = np.array(images[cam_type]['cam2img'])
+            cam_instances[cam_type] = []
+            for instance in instances:
+                cam_instance = dict()
+                gt_bboxes_3d = np.array(instance['bbox_3d'])
+                # Convert lidar coordinates to camera coordinates
+                gt_bboxes_3d = LiDARInstance3DBoxes(
+                    gt_bboxes_3d[None, :]).convert_to(
+                        Box3DMode.CAM, lidar2cam, correct_yaw=True)
+                corners_3d = gt_bboxes_3d.corners.numpy()
+                corners_3d = corners_3d[0].T  # (1, 8, 3) -> (3, 8)
+                in_camera = np.argwhere(corners_3d[2, :] > 0).flatten()
+                corners_3d = corners_3d[:, in_camera]
+                # Project 3D box to 2D.
+                corner_coords = view_points(corners_3d, cam2img,
+                                            True).T[:, :2].tolist()
+
+                # Keep only corners that fall within the image.
+                # TODO: imsize should be determined by the current image size
+                # CAM_FRONT: (1920, 1280)
+                # CAM_FRONT_LEFT: (1920, 1280)
+                # CAM_SIDE_LEFT: (1920, 886)
+                final_coords = post_process_coords(
+                    corner_coords,
+                    imsize=(images['CAM_FRONT']['width'],
+                            images['CAM_FRONT']['height']))
+
+                # Skip if the convex hull of the re-projected corners
+                # does not intersect the image canvas.
+                if final_coords is None:
+                    continue
+                else:
+                    min_x, min_y, max_x, max_y = final_coords
+
+                cam_instance['bbox'] = [min_x, min_y, max_x, max_y]
+                cam_instance['bbox_label'] = instance['bbox_label']
+                cam_instance['bbox_3d'] = gt_bboxes_3d.numpy().squeeze(
+                ).astype(np.float32).tolist()
+                cam_instance['bbox_label_3d'] = instance['bbox_label_3d']
+
+                center_3d = gt_bboxes_3d.gravity_center.numpy()
+                center_2d_with_depth = points_cam2img(
+                    center_3d, cam2img, with_depth=True)
+                center_2d_with_depth = center_2d_with_depth.squeeze().tolist()
+
+                # center_2d (in pixels) together with depth; samples with
+                # depth <= 0 are removed
+                if center_2d_with_depth[2] <= 0:
+                    continue
+                cam_instance['center_2d'] = center_2d_with_depth[:2]
+                cam_instance['depth'] = center_2d_with_depth[2]
+
+                # TODO: Discuss whether following info is necessary
+                cam_instance['bbox_3d_isvalid'] = True
+                cam_instance['velocity'] = -1
+                cam_instances[cam_type].append(cam_instance)
+
+        return cam_instances
+
+    def merge_trainval_infos(self):
+        """Merge training and validation infos into a single file."""
+        train_infos_path = osp.join(
+            osp.dirname(self.save_dir), f'{self.info_prefix}_infos_train.pkl')
+        val_infos_path = osp.join(
+            osp.dirname(self.save_dir), f'{self.info_prefix}_infos_val.pkl')
+        train_infos = mmengine.load(train_infos_path)
+        val_infos = mmengine.load(val_infos_path)
+        trainval_infos = dict(
+            metainfo=train_infos['metainfo'],
+            data_list=train_infos['data_list'] + val_infos['data_list'])
+        mmengine.dump(
+            trainval_infos,
+            osp.join(
+                osp.dirname(self.save_dir),
+                f'{self.info_prefix}_infos_trainval.pkl'))
+
 
 def create_ImageSets_img_ids(root_dir, splits):
+    """Create txt files indicating what to collect in each split."""
     save_dir = join(root_dir, 'ImageSets/')
     if not exists(save_dir):
         os.mkdir(save_dir)
 
-    idx_all = [[] for i in splits]
+    idx_all = [[] for _ in splits]
     for i, split in enumerate(splits):
-        path = join(root_dir, splits[i], 'calib')
+        path = join(root_dir, split, 'image_0')
         if not exists(path):
             RawNames = []
         else:
             RawNames = os.listdir(path)
 
         for name in RawNames:
-            if name.endswith('.txt'):
-                idx = name.replace('.txt', '\n')
+            if name.endswith('.jpg'):
+                idx = name.replace('.jpg', '\n')
                 idx_all[int(idx[0])].append(idx)
         idx_all[i].sort()
 
     open(save_dir + 'train.txt', 'w').writelines(idx_all[0])
     open(save_dir + 'val.txt', 'w').writelines(idx_all[1])
     open(save_dir + 'trainval.txt', 'w').writelines(idx_all[0] + idx_all[1])
-    open(save_dir + 'test.txt', 'w').writelines(idx_all[2])
-    # open(save_dir+'test_cam_only.txt','w').writelines(idx_all[3])
+    if len(idx_all) >= 3:
+        open(save_dir + 'test.txt', 'w').writelines(idx_all[2])
+    if len(idx_all) >= 4:
+        open(save_dir + 'test_cam_only.txt', 'w').writelines(idx_all[3])
     print('created txt files indicating what to collect in ', splits)
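A quick sanity check of the info files produced by this pipeline; the path assumes the default kitti_format layout and is illustrative only:

import mmengine

infos = mmengine.load('./data/waymo/kitti_format/waymo_infos_val.pkl')
print(infos['metainfo'])  # {'dataset': 'waymo', 'version': '1.4', 'info_version': '1.1'}
frame = infos['data_list'][0]
print(frame['sample_idx'], frame['context_name'], frame['timestamp'])
print(frame['lidar_points']['lidar_path'])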