diff --git a/README.md b/README.md
index 51f9bc8..23be45f 100644
--- a/README.md
+++ b/README.md
@@ -193,6 +193,16 @@ To inference and evaluate the model (e.g., the checkpoint `work_dirs/mv-3ddet/ep
 python tools/test.py configs/detection/mv-det3d_8xb4_embodiedscan-3d-284class-9dof.py work_dirs/mv-3ddet/epoch_12.pth --launcher="pytorch"
 ```
 
+### Using the Visualizer during Inference
+
+We provide `EmbodiedScanBaseVisualizer` to visualize the output of models during inference. Please refer to the [guide](embodiedscan/visualizer/README.md) for details.
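+
+For example, to enable it with the local backend (the same configuration shown in the guide), add:
+
+```Python
+visualizer = dict(type='EmbodiedScanBaseVisualizer', vis_backends=[dict(type='LocalVisBackend')], save_dir='temp_dir')
+```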
+
 ### Inference and Submit your Results
 
 We preliminarily support format-only inference for multi-view 3D visual grounding. To achieve format-only inference during test, just set `format_only=True` in `test_evaluator` in the corresponding config like [here](https://github.com/OpenRobotLab/EmbodiedScan/blob/main/configs/grounding/mv-grounding_8xb12_embodiedscan-vg-9dof.py#L183). Then just run the test script like:
diff --git a/embodiedscan/datasets/embodiedscan_dataset.py b/embodiedscan/datasets/embodiedscan_dataset.py
index 1305172..0681380 100644
--- a/embodiedscan/datasets/embodiedscan_dataset.py
+++ b/embodiedscan/datasets/embodiedscan_dataset.py
@@ -105,6 +105,7 @@ def parse_data_info(self, info: dict) -> dict:
         """
         info['box_type_3d'] = self.box_type_3d
         info['axis_align_matrix'] = self._get_axis_align_matrix(info)
+        info['scan_id'] = info['sample_idx']
         ann_dataset = info['sample_idx'].split('/')[0]
         if ann_dataset == 'matterport3d':
             info['depth_shift'] = 4000.0
diff --git a/embodiedscan/datasets/transforms/formatting.py b/embodiedscan/datasets/transforms/formatting.py
index e849ca9..ee7e8d5 100644
--- a/embodiedscan/datasets/transforms/formatting.py
+++ b/embodiedscan/datasets/transforms/formatting.py
@@ -75,7 +75,7 @@ def __init__(
             'cam2global', 'crop_offset', 'img_crop_offset', 'resize_img_shape',
             'lidar2cam', 'ori_lidar2img', 'num_ref_frames', 'num_views',
             'ego2global', 'fov_ori2aug', 'ego2cam', 'axis_align_matrix',
-            'text', 'tokens_positive')):
+            'text', 'tokens_positive', 'scan_id')):
         self.keys = keys
         self.meta_keys = meta_keys
 
diff --git a/embodiedscan/registry.py b/embodiedscan/registry.py
index eab51ae..1593d25 100644
--- a/embodiedscan/registry.py
+++ b/embodiedscan/registry.py
@@ -3,6 +3,8 @@
 from mmengine import MODELS as MMENGINE_MODELS
 from mmengine import TASK_UTILS as MMENGINE_TASK_UTILS
 from mmengine import TRANSFORMS as MMENGINE_TRANSFORMS
+from mmengine import VISBACKENDS as MMENGINE_VISBACKENDS
+from mmengine import VISUALIZERS as MMENGINE_VISUALIZERS
 from mmengine import Registry
 
 MODELS = Registry('model',
@@ -20,3 +22,10 @@ TASK_UTILS = Registry('task util',
                       parent=MMENGINE_TASK_UTILS,
                       locations=['embodiedscan.models'])
 
+VISUALIZERS = Registry('visualizer',
+                       parent=MMENGINE_VISUALIZERS,
+                       locations=['embodiedscan.visualizer'])
+# manage visualizer backends
+VISBACKENDS = Registry('vis_backend',
+                       parent=MMENGINE_VISBACKENDS,
+                       locations=['embodiedscan.visualizer'])
diff --git a/embodiedscan/visualization/utils.py b/embodiedscan/visualization/utils.py
index eab1a08..ac1f0e8 100644
--- a/embodiedscan/visualization/utils.py
+++ b/embodiedscan/visualization/utils.py
@@ -1,6 +1,7 @@
 import cv2
 import numpy as np
 import open3d as o3d
+from torch import Tensor
 
 from .line_mesh import LineMesh
 
@@ -39,9 +40,24 @@ def _box_add_thickness(box, thickness):
     return results
 
 
-def _9dof_to_box(box, label, color_selector):
+def _9dof_to_box(box, label=None, color_selector=None, color=None):
+    """Convert a 9-DoF box from array/tensor to open3d.OrientedBoundingBox.
+
+    Args:
+        box (numpy.ndarray|torch.Tensor|List[float]):
+            9-DoF box with shape (9,).
+        label (int, optional): Label of the box. Defaults to None.
+        color_selector (:obj:`ColorSelector`, optional):
+            Color selector for boxes. Defaults to None.
+        color (tuple[int], optional): RGB color of the box with values
+            in [0, 255]. If given, it takes precedence and label and
+            color_selector are ignored.
+            Defaults to None.
+    """
     if isinstance(box, list):
         box = np.array(box)
+    if isinstance(box, Tensor):
+        box = box.cpu().numpy()
     center = box[:3].reshape(3, 1)
     scale = box[3:6].reshape(3, 1)
     rot = box[6:].reshape(3, 1)
@@ -49,13 +65,81 @@
         rot)
     geo = o3d.geometry.OrientedBoundingBox(center, rot_mat, scale)
 
-    color = color_selector.get_color(label)
-    color = [x / 255.0 for x in color]
-    geo.color = color
+    if color is not None:
+        geo.color = [x / 255.0 for x in color]
+        return geo
+
+    if label is not None and color_selector is not None:
+        color = color_selector.get_color(label)
+        color = [x / 255.0 for x in color]
+        geo.color = color
     return geo
 
+
+def nms_filter(pred_results, iou_thr=0.15, score_thr=0.075, topk_per_class=10):
+    """Non-Maximum Suppression for 3D Euler boxes. Additionally, only the
+    top-k boxes are kept for each category to avoid redundant boxes in the
+    visualization.
+
+    Args:
+        pred_results (:obj:`InstanceData`):
+            Results predicted by the model.
+        iou_thr (float): IoU threshold for NMS. Defaults to 0.15.
+        score_thr (float): Score threshold. Instances with scores below
+            the threshold will not be kept. Defaults to 0.075.
+        topk_per_class (int): Number of instances kept per category.
+            Defaults to 10.
+
+    Returns:
+        numpy.ndarray[float], numpy.ndarray[int]:
+            Filtered boxes with shape (N, 9) and labels with shape (N,).
+    """
+    boxes = pred_results.bboxes_3d
+    boxes_tensor = boxes.tensor.cpu().numpy()
+    iou = boxes.overlaps(boxes, boxes, eps=1e-5)
+    score = pred_results.scores_3d.cpu().numpy()
+    label = pred_results.labels_3d.cpu().numpy()
+    selected_per_class = dict()
+
+    # greedily select boxes in descending score order
+    n = boxes_tensor.shape[0]
+    idx = list(range(n))
+    idx.sort(key=lambda x: score[x], reverse=True)
+    selected_idx = []
+    for i in idx:
+        # enforce the per-class top-k and the score threshold
+        if selected_per_class.get(label[i], 0) >= topk_per_class:
+            continue
+        if score[i] < score_thr:
+            continue
+        suppressed = False
+        for j in selected_idx:
+            if iou[i][j] > iou_thr:
+                suppressed = True
+                break
+        if not suppressed:
+            selected_idx.append(i)
+            selected_per_class[label[i]] = selected_per_class.get(
+                label[i], 0) + 1
+
+    return boxes_tensor[selected_idx], label[selected_idx]
+
+
 def draw_camera(camera_pose, camera_size=0.5, return_points=False):
+    """Draw the camera pose in the form of a cone.
+
+    Args:
+        camera_pose (numpy.ndarray): 4x4 camera pose from camera to world.
+        camera_size (float): Size of the camera cone. Defaults to 0.5.
+        return_points (bool): Whether to return the points of the camera
+            cone. Defaults to False.
+
+    Returns:
+        numpy.ndarray | :obj:`LineSet`:
+            If return_points is True, return the points of the camera cone.
+            Otherwise, return the camera cone as an open3d.LineSet.
+    """
     # camera_pose : 4*4 camera to world
     point = np.array([[0, 0, 0],
                       [-camera_size, -camera_size, camera_size * 2],
                       [camera_size, -camera_size, camera_size * 2],
diff --git a/embodiedscan/visualizer/README.md b/embodiedscan/visualizer/README.md
new file mode 100644
index 0000000..bac4852
--- /dev/null
+++ b/embodiedscan/visualizer/README.md
@@ -0,0 +1,36 @@
+### EmbodiedScanBaseVisualizer Simple Tutorial
+
+To use the visualizer, you need to specify it in the config. Add the following to your config file:
+
+```Python
+visualizer = dict(type='EmbodiedScanBaseVisualizer', vis_backends=[dict(type='LocalVisBackend')], save_dir='temp_dir')
+```
+
+Then call the visualizer in your model, for example in its `predict` method:
+
+```Python
+def predict(self, batch_inputs_dict, batch_data_samples, **kwargs):
+    x = self.extract_feat(batch_inputs_dict, batch_data_samples)
+    results_list = self.bbox_head.predict(x, batch_data_samples, **kwargs)
+    predictions = self.add_pred_to_datasample(batch_data_samples, results_list)
+
+    # visualization
+    from embodiedscan.visualizer import EmbodiedScanBaseVisualizer
+    visualizer = EmbodiedScanBaseVisualizer.get_current_instance()
+    visualizer.visualize_scene(predictions)
+
+    return predictions
+```
+
+The visualizer applies Non-Maximum Suppression (NMS) to avoid redundant boxes in the visualization. You can override its parameters by passing `nms_args`:
+
+```Python
+visualizer.visualize_scene(predictions, nms_args=dict(iou_thr=0.15, score_thr=0.075, topk_per_class=10))
+```
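+
+You can also pass `class_filter` (an integer label index) to `visualize_scene` to only draw boxes of a single category; boxes of other classes are skipped. For example, with a hypothetical label index `target_label`:
+
+```Python
+# draw only the boxes whose predicted label equals target_label
+visualizer.visualize_scene(predictions, class_filter=target_label)
+```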
diff --git a/embodiedscan/visualizer/__init__.py b/embodiedscan/visualizer/__init__.py
new file mode 100644
index 0000000..e67de9d
--- /dev/null
+++ b/embodiedscan/visualizer/__init__.py
@@ -0,0 +1,3 @@
+from .base_visualizer import EmbodiedScanBaseVisualizer
+
+__all__ = ['EmbodiedScanBaseVisualizer']
diff --git a/embodiedscan/visualizer/base_visualizer.py b/embodiedscan/visualizer/base_visualizer.py
new file mode 100644
index 0000000..a98f1e6
--- /dev/null
+++ b/embodiedscan/visualizer/base_visualizer.py
@@ -0,0 +1,133 @@
+import os
+
+from mmengine.dist import master_only
+from mmengine.visualization import Visualizer
+
+from embodiedscan.registry import VISUALIZERS
+
+try:
+    import open3d as o3d
+
+    from embodiedscan.visualization.utils import _9dof_to_box, nms_filter
+except ImportError:
+    o3d = None
+
+
+@VISUALIZERS.register_module()
+class EmbodiedScanBaseVisualizer(Visualizer):
+    """EmbodiedScan Base Visualizer. Provides methods to visualize 3D scenes
+    and Euler boxes.
+
+    Args:
+        name (str): Name of the visualizer. Defaults to 'visualizer'.
+        save_dir (str, optional): Directory to save visualizations.
+            Defaults to None.
+        vis_backends (list[ConfigType], optional):
+            List of visualization backends to use. Defaults to None.
+    """
+
+    def __init__(self,
+                 name: str = 'visualizer',
+                 save_dir: str = None,
+                 vis_backends=None) -> None:
+        super().__init__(name=name,
+                         vis_backends=vis_backends,
+                         save_dir=save_dir)
+
+        if o3d is None:
+            raise ImportError('Please install open3d.')
+
+    @staticmethod
+    def get_root_dir(img_path):
+        """Get the root directory of the dataset from an image path."""
+        if 'posed_images' in img_path:
+            return img_path.split('posed_images')[0]
+        if 'sequence' in img_path:
+            return img_path.split('sequence')[0]
+        if 'matterport_color_images' in img_path:
+            return img_path.split('matterport_color_images')[0]
+        raise ValueError('Custom datasets are not supported.')
+
+    @staticmethod
+    def get_ply(root_dir, scene_name):
+        """Get the path of the ply (mesh) file of a scene."""
+        s = scene_name.split('/')
+        if len(s) == 2:
+            dataset, region = s
+        else:
+            dataset, building, region = s
+        if dataset == 'scannet':
+            filepath = os.path.join(root_dir, 'scans', region,
+                                    f'{region}_vh_clean.ply')
+        elif dataset == '3rscan':
+            filepath = os.path.join(root_dir, 'mesh.refined.v2.obj')
+        elif dataset == 'matterport3d':
+            filepath = os.path.join(root_dir, 'region_segmentations',
+                                    f'{region}.ply')
+        else:
+            raise NotImplementedError
+        return filepath
+
+    @master_only
+    def visualize_scene(self,
+                        data_samples,
+                        class_filter=None,
+                        nms_args=dict(iou_thr=0.15,
+                                      score_thr=0.075,
+                                      topk_per_class=10)):
+        """Visualize the 3D scene with 3D boxes.
+
+        Args:
+            data_samples (list[:obj:`Det3DDataSample`]):
+                The output of the model.
+            class_filter (int, optional): Class filter for visualization.
+                Defaults to None, which shows all classes.
+            nms_args (dict): NMS arguments for filtering boxes.
+                Defaults to dict(iou_thr=0.15,
+                                 score_thr=0.075,
+                                 topk_per_class=10).
+        """
+        assert len(data_samples) == 1
+        data_sample = data_samples[0]
+
+        metainfo = data_sample.metainfo
+        pred = data_sample.pred_instances_3d
+        gt = data_sample.eval_ann_info
+
+        if not hasattr(pred, 'labels_3d'):
+            # grounding predictions have no labels; use the single GT label
+            assert gt['gt_labels_3d'].shape[0] == 1
+            gt_label = gt['gt_labels_3d'][0].item()
+            num_pred = pred.bboxes_3d.tensor.shape[0]
+            pseudo_label = pred.bboxes_3d.tensor.new_ones(num_pred) * gt_label
+            pred.labels_3d = pseudo_label
+        pred_box, pred_label = nms_filter(pred, **nms_args)
+
+        root_dir = self.get_root_dir(metainfo['img_path'][0])
+        ply_file = self.get_ply(root_dir, metainfo['scan_id'])
+        axis_align_matrix = metainfo['axis_align_matrix']
+
+        mesh = o3d.io.read_triangle_mesh(ply_file, True)
+        mesh.transform(axis_align_matrix)
+        frame = o3d.geometry.TriangleMesh.create_coordinate_frame()
+        boxes = []
+        # predicted 3D boxes (red)
+        n = pred_box.shape[0]
+        for i in range(n):
+            box = pred_box[i]
+            label = pred_label[i]
+            if class_filter is not None and label != class_filter:
+                continue
+            box_geo = _9dof_to_box(box, color=(255, 0, 0))
+            boxes.append(box_geo)
+        # GT 3D boxes (green)
+        m = gt['gt_bboxes_3d'].tensor.shape[0]
+        for i in range(m):
+            box = gt['gt_bboxes_3d'].tensor[i]
+            label = gt['gt_labels_3d'][i]
+            if class_filter is not None and label != class_filter:
+                continue
+            box_geo = _9dof_to_box(box, color=(0, 255, 0))
+            boxes.append(box_geo)
+
+        o3d.visualization.draw_geometries([mesh, frame] + boxes)