diff --git a/src/otx/algo/detection/heads/rtdetr_decoder.py b/src/otx/algo/detection/heads/rtdetr_decoder.py index d60be84f12d..0c39358ed6d 100644 --- a/src/otx/algo/detection/heads/rtdetr_decoder.py +++ b/src/otx/algo/detection/heads/rtdetr_decoder.py @@ -236,8 +236,7 @@ def forward( value = self.value_proj(value) if value_mask is not None: value = value.masked_fill(value_mask[..., None], float(0)) - # value_mask = value_mask.astype(value.dtype).unsqueeze(-1) - # value3 = value * value_mask.unsqueeze(-1) + value = value.reshape(bs, len_v, self.num_heads, self.head_dim) sampling_offsets = self.sampling_offsets(query).reshape( diff --git a/src/otx/algo/object_detection_3d/backbones/monodetr_resnet.py b/src/otx/algo/object_detection_3d/backbones/monodetr_resnet.py index a10c3c9dfd0..c2a5a0700e9 100644 --- a/src/otx/algo/object_detection_3d/backbones/monodetr_resnet.py +++ b/src/otx/algo/object_detection_3d/backbones/monodetr_resnet.py @@ -33,7 +33,7 @@ def __init__( num_pos_feats (int): Number of positional features. temperature (int): Temperature scaling factor. normalize (bool): Flag indicating whether to normalize the position embeddings. - scale (Optional[float]): Scaling factor for the position embeddings. If None, default value is used. + scale (float | None): Scaling factor for the position embeddings. If None, default value is used. """ super().__init__() self.num_pos_feats = num_pos_feats @@ -132,7 +132,7 @@ def __init__( Args: backbone (nn.Module): The backbone module. - position_embedding (Union[PositionEmbeddingSine]): The position embedding module. + position_embedding (PositionEmbeddingSine): The position embedding module. 
""" super().__init__(backbone, position_embedding) self.strides = backbone.strides diff --git a/src/otx/algo/object_detection_3d/detectors/monodetr.py b/src/otx/algo/object_detection_3d/detectors/monodetr.py index d2cb4691b57..00ff404c718 100644 --- a/src/otx/algo/object_detection_3d/detectors/monodetr.py +++ b/src/otx/algo/object_detection_3d/detectors/monodetr.py @@ -149,12 +149,17 @@ def forward( """Forward method of the MonoDETR model. Args: - images (list[Tensor]): images for each sample - calibs (Tensor): camera matrices for each sample - img_sizes (Tensor): image sizes for each sample - targets (list[dict[Tensor]): ground truth boxes and labels for each - sample + images (Tensor): images for each sample. + calibs (Tensor): camera matrices for each sample. + img_sizes (Tensor): image sizes for each sample. + targets (list[dict[str, Tensor]): ground truth boxes and labels for each + sample. Defaults to None. mode (str): The mode of operation. Defaults to "predict". + + Returns: + dict[str, Tensor]: A dictionary of tensors. If mode is "loss", the + tensors are the loss values. If mode is "predict", the tensors are + the logits. """ features, pos = self.backbone(images) diff --git a/src/otx/algo/object_detection_3d/heads/depth_predictor.py b/src/otx/algo/object_detection_3d/heads/depth_predictor.py index 4e5037c96d8..87827144b21 100644 --- a/src/otx/algo/object_detection_3d/heads/depth_predictor.py +++ b/src/otx/algo/object_detection_3d/heads/depth_predictor.py @@ -32,6 +32,8 @@ def __init__( depth_min (float): The minimum depth value. depth_max (float): The maximum depth value. hidden_dim (int): The dimension of the hidden layer. + activation (Callable[..., nn.Module], optional): The activation function. + Defaults to nn.ReLU. 
""" super().__init__() self.depth_max = depth_max diff --git a/src/otx/algo/object_detection_3d/losses/ddn_loss.py b/src/otx/algo/object_detection_3d/losses/ddn_loss.py index e3a4238be03..671033a347a 100644 --- a/src/otx/algo/object_detection_3d/losses/ddn_loss.py +++ b/src/otx/algo/object_detection_3d/losses/ddn_loss.py @@ -22,13 +22,13 @@ def compute_fg_mask( """Compute foreground mask for images. Args: - gt_boxes2d [torch.Tensor(B, N, 4)]: 2D box labels - shape [Tuple[int, int]]: Foreground mask desired shape - downsample_factor [int]: Downsample factor for image - device [torch.device]: Foreground mask desired device + gt_boxes2d (torch.Tensor): 2D box labels. + shape (Tuple[int, int]): Foreground mask desired shape. + downsample_factor (int): Downsample factor for image. + device (torch.device): Foreground mask desired device. Returns: - fg_mask [torch.Tensor(shape)]: Foreground mask + fg_mask (torch.Tensor(shape)]: Foreground mask. """ if device is None: device = torch.device("cpu") @@ -58,9 +58,9 @@ def __init__(self, fg_weight: float, bg_weight: float, downsample_factor: int = """Initialize fixed foreground/background loss balancer. Args: - fg_weight [float]: Foreground loss weight - bg_weight [float]: Background loss weight - downsample_factor [int]: Depth map downsample factor + fg_weight (float): Foreground loss weight. + bg_weight (float): Background loss weight. + downsample_factor (int): Depth map downsample factor. """ super().__init__() self.fg_weight = fg_weight @@ -76,12 +76,11 @@ def forward( """Forward pass. Args: - loss [torch.Tensor(B, H, W)]: Pixel-wise loss - gt_boxes2d [torch.Tensor (B, N, 4)]: 2D box labels for foreground/background balancing + loss (torch.Tensor): Pixel-wise loss. + gt_boxes2d (torch.Tensor): 2D box labels for foreground/background balancing. 
Returns: - loss [torch.Tensor(1)]: Total loss after foreground/background balancing - tb_dict [dict[float]]: All losses to log in tensorboard + loss (torch.Tensor): Total loss after foreground/background balancing. """ # Compute masks fg_mask = compute_fg_mask( @@ -120,13 +119,11 @@ def __init__( """Initializes DDNLoss module. Args: - weight [float]: Loss function weight - alpha [float]: Alpha value for Focal Loss - gamma [float]: Gamma value for Focal Loss - disc_cfg [dict]: Depth discretiziation configuration - fg_weight [float]: Foreground loss weight - bg_weight [float]: Background loss weight - downsample_factor [int]: Depth map downsample factor + alpha (float): Alpha value for Focal Loss. + gamma (float): Gamma value for Focal Loss. + fg_weight (float): Foreground loss weight. + bg_weight (float): Background loss weight. + downsample_factor (int): Depth map downsample factor. """ super().__init__() self.balancer = Balancer(downsample_factor=downsample_factor, fg_weight=fg_weight, bg_weight=bg_weight) @@ -146,10 +143,10 @@ def build_target_depth_from_3dcenter( """Builds target depth map from 3D center depth. Args: - depth_logits: torch.Tensor(B, D+1, H, W)]: Predicted depth logits - gt_boxes2d [torch.Tensor (B, N, 4)]: 2D box labels for foreground/background balancing - gt_center_depth [torch.Tensor(B, N)]: 3D center depth - num_gt_per_img: [int]: Number of ground truth boxes per image + depth_logits (torch.Tensor): Predicted depth logits. + gt_boxes2d (torch.Tensor): 2D box labels for foreground/background balancing. + gt_center_depth (torch.Tensor): 3D center depth. + num_gt_per_img (int): Number of ground truth boxes per image. """ b, _, h, w = depth_logits.shape depth_maps = torch.zeros((b, h, w), device=depth_logits.device, dtype=depth_logits.dtype) @@ -185,18 +182,18 @@ def bin_depths( """Converts depth map into bin indices. 
Args: - depth_map [torch.Tensor(H, W)]: Depth Map - mode [string]: Discretiziation mode (See https://arxiv.org/pdf/2005.13423.pdf for more details) - UD: Uniform discretiziation - LID: Linear increasing discretiziation - SID: Spacing increasing discretiziation - depth_min [float]: Minimum depth value - depth_max [float]: Maximum depth value - num_bins [int]: Number of depth bins - target [bool]: Whether the depth bins indices will be used for a target tensor in loss comparison + depth_map (torch.Tensor): Depth Map. + mode (str): Discretization mode (See https://arxiv.org/pdf/2005.13423.pdf for more details). + UD: Uniform discretization. + LID: Linear increasing discretization. + SID: Spacing increasing discretization. + depth_min (float): Minimum depth value. + depth_max (float): Maximum depth value. + num_bins (int): Number of depth bins. + target (bool): Whether the depth bins indices will be used for a target tensor in loss comparison. Returns: - indices [torch.Tensor(H, W)]: Depth bin indices + indices (torch.Tensor): Depth bin indices. """ if mode == "UD": bin_size = (depth_max - depth_min) / num_bins @@ -233,13 +230,13 @@ def forward( """Gets depth_map loss. Args: - depth_logits: torch.Tensor(B, D+1, H, W)]: Predicted depth logits - gt_boxes2d [torch.Tensor (B, N, 4)]: 2D box labels for foreground/background balancing - num_gt_per_img: [int]: Number of ground truth boxes per image - gt_center_depth: [torch.Tensor(B, N)]: 3D center depth + depth_logits (torch.Tensor): Predicted depth logits. + gt_boxes2d (torch.Tensor): 2D box labels for foreground/background balancing. + num_gt_per_img (int): Number of ground truth boxes per image. + gt_center_depth (torch.Tensor): 3D center depth. Returns: - loss [torch.Tensor(1)]: Depth classification network loss + loss (torch.Tensor): Depth classification network loss. 
""" # Bin depth map to create target depth_maps = self.build_target_depth_from_3dcenter(depth_logits, gt_boxes2d, gt_center_depth, num_gt_per_img) diff --git a/src/otx/algo/object_detection_3d/losses/monodetr_loss.py b/src/otx/algo/object_detection_3d/losses/monodetr_loss.py index ebc98d45a51..0f2d85d0565 100644 --- a/src/otx/algo/object_detection_3d/losses/monodetr_loss.py +++ b/src/otx/algo/object_detection_3d/losses/monodetr_loss.py @@ -29,11 +29,10 @@ def __init__(self, num_classes: int, weight_dict: dict, focal_alpha: float, grou """MonoDETRCriterion. Args: - num_classes: number of object categories, omitting the special no-object category - matcher: module able to compute a matching between targets and proposals - weight_dict: dict containing as key the names of the losses and as values their relative weight. - focal_alpha: alpha in Focal Loss - group_num: number of groups for data parallelism + num_classes (int): number of object categories, omitting the special no-object category. + weight_dict (dict): dict containing as key the names of the losses and as values their relative weight. + focal_alpha (float): alpha in Focal Loss. + group_num (int): number of groups for data parallelism. """ super().__init__() self.num_classes = num_classes @@ -47,7 +46,15 @@ def __init__(self, num_classes: int, weight_dict: dict, focal_alpha: float, grou self.group_num = group_num def loss_labels(self, outputs: dict, targets: list, indices: list, num_boxes: int) -> dict[str, Tensor]: - """Classification loss.""" + """Classification loss. + + Args: + outputs (dict): dict of tensors, see the output specification of the model for the format. + targets (list): list of dicts, such that len(targets) == batch_size. + The expected keys in each dict depends on the losses applied, see each loss' doc. + indices (list): list of tuples, such that len(indices) == batch_size. + num_boxes (int): number of boxes in the batch. 
+ """ src_logits = outputs["scores"] idx = self._get_src_permutation_idx(indices) @@ -76,7 +83,15 @@ def loss_labels(self, outputs: dict, targets: list, indices: list, num_boxes: in return {"loss_ce": loss_ce} def loss_3dcenter(self, outputs: dict, targets: list, indices: list, num_boxes: int) -> dict[str, Tensor]: - """Compute the loss for the 3D center prediction.""" + """Compute the loss for the 3D center prediction. + + Args: + outputs (dict): dict of tensors, see the output specification of the model for the format. + targets (list): list of dicts, such that len(targets) == batch_size. + The expected keys in each dict depends on the losses applied, see each loss' doc. + indices (list): list of tuples, such that len(indices) == batch_size. + num_boxes (int): number of boxes in the batch. + """ idx = self._get_src_permutation_idx(indices) src_3dcenter = outputs["boxes_3d"][:, :, 0:2][idx] target_3dcenter = torch.cat([t["boxes_3d"][:, 0:2][i] for t, (_, i) in zip(targets, indices)], dim=0) @@ -85,7 +100,15 @@ def loss_3dcenter(self, outputs: dict, targets: list, indices: list, num_boxes: return {"loss_center": loss_3dcenter.sum() / num_boxes} def loss_boxes(self, outputs: dict, targets: list, indices: list, num_boxes: int) -> dict[str, Tensor]: - """Compute l1 loss.""" + """Compute l1 loss. + + Args: + outputs (dict): dict of tensors, see the output specification of the model for the format. + targets (list): list of dicts, such that len(targets) == batch_size. + The expected keys in each dict depends on the losses applied, see each loss' doc. + indices (list): list of tuples, such that len(indices) == batch_size. + num_boxes (int): number of boxes in the batch. 
+ """ idx = self._get_src_permutation_idx(indices) src_2dboxes = outputs["boxes_3d"][:, :, 2:6][idx] target_2dboxes = torch.cat([t["boxes_3d"][:, 2:6][i] for t, (_, i) in zip(targets, indices)], dim=0) @@ -95,7 +118,15 @@ def loss_boxes(self, outputs: dict, targets: list, indices: list, num_boxes: int return {"loss_bbox": loss_bbox.sum() / num_boxes} def loss_giou(self, outputs: dict, targets: list, indices: list, num_boxes: int) -> dict[str, Tensor]: - """Compute the GIoU loss.""" + """Compute the GIoU loss. + + Args: + outputs (dict): dict of tensors, see the output specification of the model for the format. + targets (list): list of dicts, such that len(targets) == batch_size. + The expected keys in each dict depends on the losses applied, see each loss' doc. + indices (list): list of tuples, such that len(indices) == batch_size. + num_boxes (int): number of boxes in the batch. + """ # giou idx = self._get_src_permutation_idx(indices) src_boxes = outputs["boxes_3d"][idx] @@ -104,7 +135,15 @@ def loss_giou(self, outputs: dict, targets: list, indices: list, num_boxes: int) return {"loss_giou": loss_giou} def loss_depths(self, outputs: dict, targets: list, indices: list, num_boxes: int) -> dict[str, Tensor]: - """Compute the loss for the depth prediction.""" + """Compute the loss for the depth prediction. + + Args: + outputs (dict): dict of tensors, see the output specification of the model for the format. + targets (list): list of dicts, such that len(targets) == batch_size. + The expected keys in each dict depends on the losses applied, see each loss' doc. + indices (list): list of tuples, such that len(indices) == batch_size. 
+ num_boxes (int): number of boxes in the batch + """ idx = self._get_src_permutation_idx(indices) src_depths = outputs["depth"][idx] @@ -117,7 +156,15 @@ def loss_depths(self, outputs: dict, targets: list, indices: list, num_boxes: in return {"loss_depth": depth_loss.sum() / num_boxes} def loss_dims(self, outputs: dict, targets: list, indices: list, num_boxes: int) -> dict[str, Tensor]: - """Compute the loss for the dimension prediction.""" + """Compute the loss for the dimension prediction. + + Args: + outputs (dict): dict of tensors, see the output specification of the model for the format. + targets (list): list of dicts, such that len(targets) == batch_size. + The expected keys in each dict depends on the losses applied, see each loss' doc. + indices (list): list of tuples, such that len(indices) == batch_size. + num_boxes (int): number of boxes in the batch. + """ idx = self._get_src_permutation_idx(indices) src_dims = outputs["size_3d"][idx] target_dims = torch.cat([t["size_3d"][i] for t, (_, i) in zip(targets, indices)], dim=0) @@ -131,7 +178,15 @@ def loss_dims(self, outputs: dict, targets: list, indices: list, num_boxes: int) return {"loss_dim": dim_loss.sum() / num_boxes} def loss_angles(self, outputs: dict, targets: list, indices: list, num_boxes: int) -> dict[str, Tensor]: - """Compute the loss for the angle prediction.""" + """Compute the loss for the angle prediction. + + Args: + outputs (dict): dict of tensors, see the output specification of the model for the format. + targets (list): list of dicts, such that len(targets) == batch_size. + The expected keys in each dict depends on the losses applied, see each loss' doc. + indices (list): list of tuples, such that len(indices) == batch_size. + num_boxes (int): number of boxes in the batch. 
+ """ idx = self._get_src_permutation_idx(indices) heading_input = outputs["heading_angle"][idx] target_heading_angle = torch.cat([t["heading_angle"][i] for t, (_, i) in zip(targets, indices)], dim=0) @@ -158,7 +213,15 @@ def loss_angles(self, outputs: dict, targets: list, indices: list, num_boxes: in return {"loss_angle": angle_loss.sum() / num_boxes} def loss_depth_map(self, outputs: dict, targets: list, indices: list, num_boxes: int) -> dict[str, Tensor]: - """Depth map loss.""" + """Depth map loss. + + Args: + outputs (dict): dict of tensors, see the output specification of the model for the format. + targets (list): list of dicts, such that len(targets) == batch_size. + The expected keys in each dict depends on the losses applied, see each loss' doc. + indices (list): list of tuples, such that len(indices) == batch_size. + num_boxes (int): number of boxes in the batch. + """ depth_map_logits = outputs["pred_depth_map_logits"] num_gt_per_img = [len(t["boxes"]) for t in targets] @@ -174,6 +237,7 @@ def _get_src_permutation_idx( self, indices: list[tuple[torch.Tensor, torch.Tensor]], ) -> tuple[torch.Tensor, torch.Tensor]: + """Get the indices necessary to compute the loss.""" # permute predictions following indices batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)]) src_idx = torch.cat([src for (src, _) in indices]) @@ -183,6 +247,7 @@ def _get_tgt_permutation_idx( self, indices: list[tuple[torch.Tensor, torch.Tensor]], ) -> tuple[torch.Tensor, torch.Tensor]: + """Get the indices necessary to compute the loss.""" # permute targets following indices batch_idx = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)]) tgt_idx = torch.cat([tgt for (_, tgt) in indices]) @@ -210,9 +275,9 @@ def forward( """This performs the loss computation. Args: - outputs: dict of tensors, see the output specification of the model for the format - targets: list of dicts, such that len(targets) == batch_size. 
- The expected keys in each dict depends on the losses applied, see each loss' doc + outputs (dict): dict of tensors, see the output specification of the model for the format. + targets (list): list of dicts, such that len(targets) == batch_size. + The expected keys in each dict depends on the losses applied, see each loss' doc. """ outputs_without_aux = {k: v for k, v in outputs.items() if k != "aux_outputs"} group_num = self.group_num if self.training else 1 diff --git a/src/otx/algo/object_detection_3d/monodetr3d.py b/src/otx/algo/object_detection_3d/monodetr3d.py index f0f95ea714f..18d3c072556 100644 --- a/src/otx/algo/object_detection_3d/monodetr3d.py +++ b/src/otx/algo/object_detection_3d/monodetr3d.py @@ -7,19 +7,13 @@ from typing import Any -import numpy as np import torch -from torch import Tensor -from torchvision.ops import box_convert from otx.algo.object_detection_3d.backbones.monodetr_resnet import BackboneBuilder from otx.algo.object_detection_3d.detectors.monodetr import MonoDETR from otx.algo.object_detection_3d.heads.depth_predictor import DepthPredictor from otx.algo.object_detection_3d.heads.depthaware_transformer import DepthAwareTransformerBuilder from otx.algo.object_detection_3d.losses import MonoDETRCriterion -from otx.algo.object_detection_3d.utils.utils import box_cxcylrtb_to_xyxy -from otx.core.data.entity.base import OTXBatchLossEntity -from otx.core.data.entity.object_detection_3d import Det3DBatchDataEntity, Det3DBatchPredEntity from otx.core.exporter.base import OTXModelExporter from otx.core.exporter.detection_3d import OTXObjectDetection3DExporter from otx.core.model.detection_3d import OTX3DDetectionModel @@ -30,7 +24,6 @@ class MonoDETR3D(OTX3DDetectionModel): mean: tuple[float, float, float] = (123.675, 116.28, 103.53) std: tuple[float, float, float] = (58.395, 57.12, 57.375) - input_size: tuple[int, int] = (384, 1280) # HxW load_from: str | None = None def _build_model(self, num_classes: int) -> MonoDETR: @@ -62,73 +55,6 @@ def 
_build_model(self, num_classes: int) -> MonoDETR: init_box=False, ) - def _customize_inputs( - self, - entity: Det3DBatchDataEntity, - ) -> dict[str, Any]: - # prepare bboxes for the model - targets_list = [] - img_sizes = torch.from_numpy(np.array([img_info.ori_shape for img_info in entity.imgs_info])).to( - device=entity.images.device, - ) - key_list = ["labels", "boxes", "depth", "size_3d", "heading_angle", "boxes_3d"] - for bz in range(len(entity.imgs_info)): - target_dict = {} - for key in key_list: - target_dict[key] = getattr(entity, key)[bz] - targets_list.append(target_dict) - - return { - "images": entity.images, - "calibs": torch.cat([p2.unsqueeze(0) for p2 in entity.calib_matrix], dim=0), - "targets": targets_list, - "img_sizes": img_sizes, - "mode": "loss" if self.training else "predict", - } - - def _customize_outputs( - self, - outputs: dict[str, torch.Tensor], - inputs: Det3DBatchDataEntity, - ) -> Det3DBatchPredEntity | OTXBatchLossEntity: - if self.training: - if not isinstance(outputs, dict): - raise TypeError(outputs) - - losses = OTXBatchLossEntity() - for k, v in outputs.items(): - if isinstance(v, list): - losses[k] = sum(v) - elif isinstance(v, Tensor): - losses[k] = v - else: - msg = "Loss output should be list or torch.tensor but got {type(v)}" - raise TypeError(msg) - return losses - - labels, scores, size_3d, heading_angle, boxes_3d, depth = self.extract_dets_from_outputs(outputs) - # bbox 2d decoding - boxes_2d = box_cxcylrtb_to_xyxy(boxes_3d) - xywh_2d = box_convert(boxes_2d, "xyxy", "cxcywh") - # size 2d decoding - size_2d = xywh_2d[:, :, 2:4] - - return Det3DBatchPredEntity( - batch_size=inputs.batch_size, - images=inputs.images, - imgs_info=inputs.imgs_info, - calib_matrix=inputs.calib_matrix, - boxes=boxes_2d, - labels=labels, - boxes_3d=boxes_3d, - size_2d=size_2d, - size_3d=size_3d, - depth=depth, - heading_angle=heading_angle, - scores=scores, - original_kitti_format=[None], - ) - def configure_optimizers(self) -> 
tuple[list[torch.optim.Optimizer], list[dict[str, Any]]]: """Configure an optimizer and learning-rate schedulers. diff --git a/src/otx/core/data/entity/object_detection_3d.py b/src/otx/core/data/entity/object_detection_3d.py index 564ea283a60..8be60c089fb 100644 --- a/src/otx/core/data/entity/object_detection_3d.py +++ b/src/otx/core/data/entity/object_detection_3d.py @@ -26,11 +26,18 @@ @register_pytree_node @dataclass class Det3DDataEntity(OTXDataEntity): - """Data entity for detection task. + """Data entity for 3d object detection task. + + : param boxes (tv_tensors.BoundingBoxes): The bounding boxes for the objects in the image. + : param calib_matrix (Tensor): The calibration matrix for the 3D object detection. + : param boxes_3d (Tensor): The 3D bounding boxes for the objects. + : param size_2d (Tensor): The 2D size of the objects. + : param size_3d (Tensor): The 3D size of the objects. + : param depth (Tensor): The depth of the objects. + : param heading_angle (Tensor): The heading angle of the objects. + : param labels (LongTensor): The labels of the objects. + : param original_kitti_format (list[dict[str, Any]] | None): The original KITTI format of the objects, if available. - :param bboxes: Bbox annotations as top-left-bottom-right - (x1, y1, x2, y2) format with absolute coordinate values - :param labels: Bbox labels as integer indices """ @property @@ -51,17 +58,24 @@ def task(self) -> OTXTaskType: @dataclass class Det3DPredEntity(OTXPredEntity, Det3DDataEntity): - """Data entity to represent the detection model output prediction.""" + """Data entity to represent the 3d object detection model output prediction.""" @dataclass class Det3DBatchDataEntity(OTXBatchDataEntity[Det3DDataEntity]): - """Data entity for detection task. - - :param bboxes: A list of bbox annotations as top-left-bottom-right - (x1, y1, x2, y2) format with absolute coordinate values - :param labels: A list of bbox labels as integer indices - """ # TODO(Kirill): UPDATE! 
+ """Data entity for 3d object detection task. + + : param boxes list[tv_tensors.BoundingBoxes]: The bounding boxes for the objects in the image. + : param calib_matrix list[Tensor]: The calibration matrix for the 3D object detection. + : param boxes_3d list[Tensor]: The 3D bounding boxes for the objects. + : param size_2d list[Tensor]: The 2D size of the objects. + : param size_3d list[Tensor]: The 3D size of the objects. + : param depth list[Tensor]: The depth of the objects. + : param heading_angle list[Tensor]: The heading angle of the objects. + : param labels list[LongTensor]: The labels of the objects. + : param original_kitti_format list[list[dict[str, Any]] | None]: The original KITTI format of the objects, + if available. Needed for validation and KITTI metric. + """ images: Tensor boxes: list[tv_tensors.BoundingBoxes] @@ -135,7 +149,7 @@ def pin_memory(self) -> Det3DBatchDataEntity: @dataclass class Det3DBatchPredEntity(OTXBatchPredEntity, Det3DBatchDataEntity): - """Data entity to represent model output predictions for detection task.""" + """Data entity to represent model output predictions for 3d object detection task.""" boxes: tv_tensors.BoundingBoxes scores: Tensor diff --git a/src/otx/core/exporter/base.py b/src/otx/core/exporter/base.py index 85d77fe4799..cfbc670e58e 100644 --- a/src/otx/core/exporter/base.py +++ b/src/otx/core/exporter/base.py @@ -45,6 +45,9 @@ class OTXModelExporter: output_names (list[str] | None, optional): Names for model's outputs, which would be embedded into resulting model. Note, that order of the output names should be the same, as in the target model. + input_names (list[str] | None, optional): Names for model's inputs, which would be + embedded into resulting model. Note, that order of the input names should be the same, + as in the target model. 
""" def __init__( diff --git a/src/otx/core/metrics/average_precision_3d.py b/src/otx/core/metrics/average_precision_3d.py index 7b8530ba684..2600200280b 100644 --- a/src/otx/core/metrics/average_precision_3d.py +++ b/src/otx/core/metrics/average_precision_3d.py @@ -7,8 +7,10 @@ from typing import TYPE_CHECKING +import torch from torch import Tensor from torchmetrics import Metric +from torchmetrics.detection.mean_ap import MeanAveragePrecision from otx.core.metrics.kitti_3d_eval import get_coco_eval_result @@ -32,6 +34,7 @@ def __init__( super().__init__() self.label_info: LabelInfo = label_info + self.mean_ap: MeanAveragePrecision = MeanAveragePrecision(box_format="xyxy", iou_type="bbox") self.reset() def reset(self) -> None: @@ -42,6 +45,7 @@ def reset(self) -> None: super().reset() self.preds: list[dict[str, np.array]] = [] self.targets: list[dict[str, np.array]] = [] + self.mean_ap.reset() def update(self, preds: list[dict[str, Tensor]], target: list[dict[str, Tensor]]) -> None: """Update total predictions and targets from given batch predicitons and targets.""" @@ -51,13 +55,35 @@ def update(self, preds: list[dict[str, Tensor]], target: list[dict[str, Tensor]] def compute(self) -> dict: """Compute metrics for 3d object detection.""" current_classes = self.label_info.label_names - map_bbox, map_3d = get_coco_eval_result( + preds_for_torchmetrics = self.prepare_inputs_for_map_coco(self.preds) + targets_for_torchmetrics = self.prepare_inputs_for_map_coco(self.targets) + ap_bbox_coco = self.mean_ap(preds_for_torchmetrics, targets_for_torchmetrics) + ap_3d = get_coco_eval_result( self.targets, self.preds, current_classes=[curcls.lower() for curcls in current_classes], ) - # use moderate difficulty as final score. Average across all calsses. - return {"mAP_bbox_3d": Tensor([map_3d[:, 1].mean()]), "mAP_bbox_2d": Tensor([map_bbox[:, 1].mean()])} + # Average across all classes. 
+ return { + "AP_3d@0.5": Tensor([ap_3d[0]]), + "AP_2d@0.5": ap_bbox_coco["map_50"], + "mAP_3d": Tensor([ap_3d.mean()]), + "mAP_2d": ap_bbox_coco["map"], + } + + def prepare_inputs_for_map_coco(self, targets: list[dict[str, np.array]]) -> list[dict[str, Tensor]]: + """Prepare targets for torchmetrics.""" + return [ + { + "boxes": torch.tensor(target["bbox"]), + "scores": torch.tensor(target["score"]) if "score" in target else None, + "labels": torch.tensor( + [self.label_info.label_names.index(label) for label in target["name"]], + dtype=torch.long, + ), + } + for target in targets + ] def _kitti_metric_measure_callable(label_info: LabelInfo) -> KittiMetric: diff --git a/src/otx/core/metrics/kitti_3d_eval/eval.py b/src/otx/core/metrics/kitti_3d_eval/eval.py index 86f634243a4..34144fa4797 100644 --- a/src/otx/core/metrics/kitti_3d_eval/eval.py +++ b/src/otx/core/metrics/kitti_3d_eval/eval.py @@ -6,6 +6,7 @@ from __future__ import annotations +import logging from typing import Any import numba @@ -18,44 +19,11 @@ from .rotate_iou import rotate_iou_eval_cpu as rotate_iou_eval -@numba.jit(nopython=True) -def get_thresholds( - scores: np.ndarray, # 1D array of confidence scores - num_gt: int, # Number of ground truth objects - num_sample_pts: int = 41, # Number of sample points used to compute recall thresholds -) -> np.ndarray: # 1D array of recall thresholds - """Compute recall thresholds for a given score array. - - Args: - scores (np.ndarray): 1D array of confidence scores. - num_gt (int): Number of ground truth objects. - num_sample_pts (int, optional): Number of sample points used to - compute recall thresholds. Defaults to 41. - - Returns: - np.ndarray: 1D array of recall thresholds. 
- """ - scores.sort() - scores = scores[::-1] - current_recall = 0.0 - thresholds = [] - for i, score in enumerate(scores): - l_recall = (i + 1) / num_gt - r_recall = (i + 2) / num_gt if i < len(scores) - 1 else l_recall - if ((r_recall - current_recall) < (current_recall - l_recall)) and (i < (len(scores) - 1)): - continue - # recall = l_recall - thresholds.append(score) - current_recall += 1 / (num_sample_pts - 1.0) - return thresholds - - def clean_data( gt_anno: dict, # ground truth annotations dt_anno: dict, # detection results current_class: str, # the current class name - difficulty: int, # the difficulty level -) -> tuple: # (num_valid_gt, ignored_gt, ignored_dt, dc_bboxes) +) -> tuple: # (num_valid_gt, ignored_gt, ignored_dt) """Filter out the objects that are not in the current class. Args: @@ -65,12 +33,12 @@ def clean_data( difficulty (int): The difficulty level. Returns: - tuple: The number of valid objects, ignored_gt, ignored_dt, and dc_bboxes. + tuple: The number of valid objects, ignored_gt, ignored_dt. 
""" - min_height = [40, 25, 25] - max_occlusion = [0, 1, 2] - max_truncation = [0.15, 0.3, 0.5] - dc_bboxes, ignored_gt, ignored_dt = [], [], [] + min_height = 20 + max_occlusion = 2 + max_truncation = 0.5 + ignored_gt, ignored_dt = [], [] num_gt = len(gt_anno["name"]) num_dt = len(dt_anno["name"]) num_valid_gt = 0 @@ -89,11 +57,10 @@ def clean_data( valid_class = -1 ignore = False if ( - (gt_anno["occluded"][i] > max_occlusion[difficulty]) - or (gt_anno["truncated"][i] > max_truncation[difficulty]) - or (height <= min_height[difficulty]) - ): - # if gt_anno["difficulty"][i] > difficulty or gt_anno["difficulty"][i] == -1: + (gt_anno["occluded"][i] > max_occlusion) + or (gt_anno["truncated"][i] > max_truncation) + or (height <= min_height) + ): # filter extrim cases ignore = True if valid_class == 1 and not ignore: ignored_gt.append(0) @@ -102,59 +69,18 @@ def clean_data( ignored_gt.append(1) else: ignored_gt.append(-1) - # for i in range(num_gt): - if gt_anno["name"][i] == "dontcare": - dc_bboxes.append(gt_anno["bbox"][i]) + for i in range(num_dt): valid_class = 1 if dt_anno["name"][i].lower() == current_class else -1 height = abs(dt_anno["bbox"][i, 3] - dt_anno["bbox"][i, 1]) - if height < min_height[difficulty]: + if height < min_height: ignored_dt.append(1) elif valid_class == 1: ignored_dt.append(0) else: ignored_dt.append(-1) - return num_valid_gt, ignored_gt, ignored_dt, dc_bboxes - - -@numba.jit(nopython=True) -def image_box_overlap( - boxes: np.ndarray, # shape: (N, 4) - query_boxes: np.ndarray, # shape: (K, 4) - criterion: int = -1, # default overlap criterion: intersection over union -) -> np.ndarray: # shape: (N, K) - """Image box overlap. - - Args: - boxes (np.ndarray): shape: (N, 4), 2D boxes, (x1, y1, x2, y2) - query_boxes (np.ndarray): shape: (K, 4), 2D boxes, (x1, y1, x2, y2) - criterion (int, optional): overlap criterion, -1: intersection over union, - 0: intersection over box area, 1: intersection over query box area. Defaults to -1. 
- - Returns: - np.ndarray: shape: (N, K), overlap between boxes and query_boxes - """ - num_n = boxes.shape[0] - num_k = query_boxes.shape[0] - overlaps = np.zeros((num_n, num_k), dtype=boxes.dtype) - for k in range(num_k): - qbox_area = (query_boxes[k, 2] - query_boxes[k, 0]) * (query_boxes[k, 3] - query_boxes[k, 1]) - for n in range(num_n): - iw = min(boxes[n, 2], query_boxes[k, 2]) - max(boxes[n, 0], query_boxes[k, 0]) - if iw > 0: - ih = min(boxes[n, 3], query_boxes[k, 3]) - max(boxes[n, 1], query_boxes[k, 1]) - if ih > 0: - if criterion == -1: - ua = (boxes[n, 2] - boxes[n, 0]) * (boxes[n, 3] - boxes[n, 1]) + qbox_area - iw * ih - elif criterion == 0: - ua = (boxes[n, 2] - boxes[n, 0]) * (boxes[n, 3] - boxes[n, 1]) - elif criterion == 1: - ua = qbox_area - else: - ua = 1.0 - overlaps[n, k] = iw * ih / ua - return overlaps + return num_valid_gt, ignored_gt, ignored_dt @numba.jit(nopython=True) @@ -184,8 +110,6 @@ def d3_box_overlap_kernel( for i in range(n): for j in range(k): if rinc[i, j] > 0: - # iw = (min(boxes[i, 1] + boxes[i, 4], qboxes[j, 1] + - # qboxes[j, 4]) - max(boxes[i, 1], qboxes[j, 1])) iw = min(boxes[i, 1], qboxes[j, 1]) - max(boxes[i, 1] - boxes[i, 4], qboxes[j, 1] - qboxes[j, 4]) if iw > 0: @@ -206,14 +130,12 @@ def d3_box_overlap_kernel( @numba.jit(nopython=True) -def compute_statistics_jit( # noqa: C901 +def compute_statistics_jit( overlaps: np.ndarray, # shape: (total_dt_num, total_gt_num) gt_datas: np.ndarray, # shape: (total_gt_num, 7) dt_datas: np.ndarray, # shape: (total_dt_num, 7) ignored_gt: list[int], # shape: (total_gt_num) ignored_det: list[int], # shape: (total_dt_num) - dc_bboxes: np.ndarray, # shape: (total_dc_num, 4) - metric: int, min_overlap: float, thresh: float = 0, compute_fp: bool = False, @@ -226,8 +148,6 @@ def compute_statistics_jit( # noqa: C901 dt_datas (np.ndarray): Detection data. ignored_gt (List[int]): Ignore ground truth indices. ignored_det (List[int]): Ignore detection indices. 
- dc_bboxes (np.ndarray): Don't care bboxes. - metric (int): Evaluation metric. min_overlap (float): Minimum overlap between dt and gt bboxes. thresh (float): Detection score threshold. Defaults to 0. compute_fp (bool): Whether to compute false positives. Defaults to False. @@ -238,17 +158,16 @@ def compute_statistics_jit( # noqa: C901 det_size = dt_datas.shape[0] gt_size = gt_datas.shape[0] dt_scores = dt_datas[:, -1] - dt_bboxes = dt_datas[:, :4] assigned_detection = [False] * det_size - ignored_threshold = [False] * det_size + ignored_obj_by_threshold = [False] * det_size if compute_fp: for i in range(det_size): if dt_scores[i] < thresh: - ignored_threshold[i] = True + ignored_obj_by_threshold[i] = True no_detection = -10000000 tp, fp, fn, similarity = 0, 0, 0, 0 - thresholds = np.zeros((gt_size,)) + tp_scores = np.zeros((gt_size,)) thresh_idx = 0 for i in range(gt_size): if ignored_gt[i] == -1: @@ -263,7 +182,7 @@ def compute_statistics_jit( # noqa: C901 continue if assigned_detection[j]: continue - if ignored_threshold[j]: + if ignored_obj_by_threshold[j]: continue overlap = overlaps[j, i] dt_score = dt_scores[j] @@ -291,32 +210,21 @@ def compute_statistics_jit( # noqa: C901 assigned_detection[det_idx] = True elif valid_detection != no_detection: tp += 1 - # thresholds.append(dt_scores[det_idx]) - thresholds[thresh_idx] = dt_scores[det_idx] + + tp_scores[thresh_idx] = dt_scores[det_idx] thresh_idx += 1 assigned_detection[det_idx] = True if compute_fp: for i in range(det_size): - if not (assigned_detection[i] or ignored_det[i] == -1 or ignored_det[i] == 1 or ignored_threshold[i]): + if not ( + assigned_detection[i] or ignored_det[i] == -1 or ignored_det[i] == 1 or ignored_obj_by_threshold[i] + ): fp += 1 nstuff = 0 - if metric == 0: - overlaps_dt_dc = image_box_overlap(dt_bboxes, dc_bboxes, 0) - for i in range(dc_bboxes.shape[0]): - for j in range(det_size): - if assigned_detection[j]: - continue - if ignored_det[j] == -1 or ignored_det[j] == 1: - continue - if 
ignored_threshold[j]: - continue - if overlaps_dt_dc[j, i] > min_overlap: - assigned_detection[j] = True - nstuff += 1 fp -= nstuff - return tp, fp, fn, similarity, thresholds[:thresh_idx] + return tp, fp, fn, similarity, tp_scores[:thresh_idx] @numba.jit(nopython=True) @@ -346,13 +254,10 @@ def fused_compute_statistics( pr: np.ndarray, # shape: (num_thresholds, 4) gt_nums: np.ndarray, # shape: (num_samples) dt_nums: np.ndarray, # shape: (num_samples) - dc_nums: np.ndarray, # shape: (num_samples) gt_datas: np.ndarray, # shape: (total_gt_num, 7) dt_datas: np.ndarray, # shape: (total_dt_num, 7) - dontcares: np.ndarray, # shape: (total_dc_num, 4) ignored_gts: np.ndarray, # shape: (total_gt_num) ignored_dets: np.ndarray, # shape: (total_dt_num) - metric: int, min_overlap: float, thresholds: np.ndarray, # shape: (num_thresholds) ) -> None: @@ -371,26 +276,20 @@ def fused_compute_statistics( gt_nums[i] is the number of ground truths in i-th sample dt_nums (np.ndarray): 1D array of shape (num_samples), dt_nums[i] is the number of detections in i-th sample - dc_nums (np.ndarray): 1D array of shape (num_samples), - dc_nums[i] is the number of dontcare areas in i-th sample gt_datas (np.ndarray): 2D array of shape (total_gt_num, 7), gt_datas[i] is the i-th ground truth box dt_datas (np.ndarray): 2D array of shape (total_dt_num, 7), dt_datas[i] is the i-th detection box - dontcares (np.ndarray): 2D array of shape (total_dc_num, 4), - dontcares[i] is the i-th dontcare area ignored_gts (np.ndarray): 1D array of shape (total_gt_num), ignored_gts[i] is 1 if the i-th ground truth is ignored, 0 otherwise ignored_dets (np.ndarray): 1D array of shape (total_dt_num), ignored_dets[i] is 1 if the i-th detection is ignored, 0 otherwise - metric (int): Eval type. 
0: bbox, 1: bev, 2: 3d min_overlap (float): Min overlap thresholds (np.ndarray): 1D array of shape (num_thresholds), thresholds[i] is the i-th threshold """ gt_num = 0 dt_num = 0 - dc_num = 0 for i in range(gt_nums.shape[0]): for t, thresh in enumerate(thresholds): overlap = overlaps[dt_num : dt_num + dt_nums[i], gt_num : gt_num + gt_nums[i]] @@ -398,15 +297,12 @@ def fused_compute_statistics( dt_data = dt_datas[dt_num : dt_num + dt_nums[i]] ignored_gt = ignored_gts[gt_num : gt_num + gt_nums[i]] ignored_det = ignored_dets[dt_num : dt_num + dt_nums[i]] - dontcare = dontcares[dc_num : dc_num + dc_nums[i]] tp, fp, fn, similarity, _ = compute_statistics_jit( overlap, gt_data, dt_data, ignored_gt, ignored_det, - dontcare, - metric, min_overlap=min_overlap, thresh=thresh, compute_fp=True, @@ -418,13 +314,11 @@ def fused_compute_statistics( pr[t, 3] += similarity gt_num += gt_nums[i] dt_num += dt_nums[i] - dc_num += dc_nums[i] def calculate_iou_partly( gt_annos: list[dict[str, Any]], dt_annos: list[dict[str, Any]], - metric: int, num_parts: int = 50, ) -> tuple[list[np.ndarray], list[np.ndarray], np.ndarray, np.ndarray]: """Fast iou algorithm. @@ -435,7 +329,6 @@ def calculate_iou_partly( Args: gt_annos: List of dict, must from get_label_annos() in kitti_common.py dt_annos: List of dict, must from get_label_annos() in kitti_common.py - metric: Eval type. 
0: bbox, 1: bev, 2: 3d num_parts: Int, a parameter for fast calculate algorithm Returns: @@ -478,23 +371,17 @@ def d3_box_overlap(boxes: np.ndarray, qboxes: np.ndarray, criterion: int = -1) - for num_part in split_parts: gt_annos_part = gt_annos[example_idx : example_idx + num_part] dt_annos_part = dt_annos[example_idx : example_idx + num_part] - if metric == 0: - gt_boxes = np.concatenate([a["bbox"] for a in gt_annos_part], 0) - dt_boxes = np.concatenate([a["bbox"] for a in dt_annos_part], 0) - overlap_part = image_box_overlap(gt_boxes, dt_boxes) - elif metric == 2: - loc = np.concatenate([a["location"] for a in gt_annos_part], 0) - dims = np.concatenate([a["dimensions"] for a in gt_annos_part], 0) - rots = np.concatenate([a["rotation_y"] for a in gt_annos_part], 0) - gt_boxes = np.concatenate([loc, dims, rots[..., np.newaxis]], axis=1) - loc = np.concatenate([a["location"] for a in dt_annos_part], 0) - dims = np.concatenate([a["dimensions"] for a in dt_annos_part], 0) - rots = np.concatenate([a["rotation_y"] for a in dt_annos_part], 0) - dt_boxes = np.concatenate([loc, dims, rots[..., np.newaxis]], axis=1) - overlap_part = d3_box_overlap(gt_boxes, dt_boxes).astype(np.float64) - else: - msg = "unknown metric" - raise ValueError(msg) + + loc = np.concatenate([a["location"] for a in gt_annos_part], 0) + dims = np.concatenate([a["dimensions"] for a in gt_annos_part], 0) + rots = np.concatenate([a["rotation_y"] for a in gt_annos_part], 0) + gt_boxes = np.concatenate([loc, dims, rots[..., np.newaxis]], axis=1) + loc = np.concatenate([a["location"] for a in dt_annos_part], 0) + dims = np.concatenate([a["dimensions"] for a in dt_annos_part], 0) + rots = np.concatenate([a["rotation_y"] for a in dt_annos_part], 0) + dt_boxes = np.concatenate([loc, dims, rots[..., np.newaxis]], axis=1) + overlap_part = d3_box_overlap(gt_boxes, dt_boxes).astype(np.float64) + parted_overlaps.append(overlap_part) example_idx += num_part overlaps = [] @@ -520,38 +407,30 @@ def _prepare_data( 
gt_annos: list[dict[str, Any]], dt_annos: list[dict[str, Any]], current_class: str, - difficulty: int, -) -> tuple[list[np.ndarray], list[np.ndarray], list[np.ndarray], list[np.ndarray], list[np.ndarray], np.ndarray, int]: +) -> tuple[list[np.ndarray], list[np.ndarray], list[np.ndarray], list[np.ndarray], int]: """Prepare data for evaluation. Args: gt_annos (List[Dict[str, Any]]): Ground truth annotations. dt_annos (List[Dict[str, Any]]): Detection annotations. current_class (str): Current class name. - difficulty (int): Difficulty level. Returns: Tuple[List[np.ndarray], List[np.ndarray], List[np.ndarray], - List[np.ndarray], List[np.ndarray], np.ndarray, int]: + List[np.ndarray], int]: gt_datas_list, dt_datas_list, ignored_gts, ignored_dets, - dontcares, total_dc_num, total_num_valid_gt + total_num_valid_gt """ gt_datas_list = [] dt_datas_list = [] - total_dc_num = [] - ignored_gts, ignored_dets, dontcares = [], [], [] + ignored_gts, ignored_dets = [], [] total_num_valid_gt = 0 for i in range(len(gt_annos)): - rets = clean_data(gt_annos[i], dt_annos[i], current_class, difficulty) - num_valid_gt, ignored_gt, ignored_det, dc_bboxes = rets + rets = clean_data(gt_annos[i], dt_annos[i], current_class) + num_valid_gt, ignored_gt, ignored_det = rets ignored_gts.append(np.array(ignored_gt, dtype=np.int64)) ignored_dets.append(np.array(ignored_det, dtype=np.int64)) - if len(dc_bboxes) == 0: - dc_bboxes = np.zeros((0, 4)).astype(np.float64) - else: - dc_bboxes = np.stack(dc_bboxes, 0).astype(np.float64) - total_dc_num.append(dc_bboxes.shape[0]) - dontcares.append(dc_bboxes) + total_num_valid_gt += num_valid_gt gt_datas = np.concatenate([gt_annos[i]["bbox"], gt_annos[i]["alpha"][..., np.newaxis]], 1) dt_datas = np.concatenate( @@ -564,112 +443,94 @@ def _prepare_data( ) gt_datas_list.append(gt_datas) dt_datas_list.append(dt_datas) - total_dc_num = np.stack(total_dc_num, axis=0) - return (gt_datas_list, dt_datas_list, ignored_gts, ignored_dets, dontcares, total_dc_num, 
total_num_valid_gt) + + return (gt_datas_list, dt_datas_list, ignored_gts, ignored_dets, total_num_valid_gt) def eval_class( gt_annos: list[dict[str, Any]], dt_annos: list[dict[str, Any]], current_classes: list[str], - difficultys: list[int], - metric: int, min_overlaps: np.ndarray, num_parts: int = 50, + num_samples_pts: int = 41, ) -> dict[str, np.ndarray]: """Kitti eval. support 2d/bev/3d/aos eval. support 0.5:0.05:0.95 coco AP. Args: - gt_annos: dict, must from get_label_annos() in kitti_common.py - dt_annos: dict, must from get_label_annos() in kitti_common.py - current_classes: list of label names - difficultys: list of int. eval difficulty, 0: easy, 1: normal, 2: hard - metric: eval type. 0: bbox, 1: bev, 2: 3d - min_overlaps: float, min overlap. format: [num_overlap, metric, class]. - num_parts: int. a parameter for fast calculate algorithm + gt_annos (dict): must from get_label_annos() in kitti_common.py + dt_annos (dict): must from get_label_annos() in kitti_common.py + current_classes (list): label names + min_overlaps (float): min overlap. format: [num_overlap, class]. + num_parts (int): a parameter for fast calculate algorithm + num_samples_pts (int): number of points for precision-recall curve Returns: - dict of recall, precision and aos + dict of recall, precision """ num_examples = len(gt_annos) split_parts = get_split_parts(num_examples, num_parts) - part_calculated = calculate_iou_partly(dt_annos, gt_annos, metric, num_parts) - overlaps, parted_overlaps, total_dt_num, total_gt_num = part_calculated - num_samples_pts = 41 # TODO(Kirill): why it is 41? 
- # The validation with 1-40 examples are not possible corecctly + rets = calculate_iou_partly(dt_annos, gt_annos, num_parts) + overlaps, parted_overlaps, total_dt_num, total_gt_num = rets num_minoverlap = len(min_overlaps) num_class = len(current_classes) - num_difficulty = len(difficultys) - precision = np.zeros([num_class, num_difficulty, num_minoverlap, num_samples_pts]) - recall = np.zeros([num_class, num_difficulty, num_minoverlap, num_samples_pts]) - aos = np.zeros([num_class, num_difficulty, num_minoverlap, num_samples_pts]) + precision = np.zeros([num_class, num_minoverlap, num_samples_pts]) + recall = np.zeros([num_class, num_minoverlap, num_samples_pts]) for m, current_class in enumerate(current_classes): - for d, difficulty in enumerate(difficultys): - ( - gt_datas_list, - dt_datas_list, - ignored_gts, - ignored_dets, - dontcares, - total_dc_num, - total_num_valid_gt, - ) = _prepare_data(gt_annos, dt_annos, current_class, difficulty) - for k, min_overlap in enumerate(min_overlaps[:, metric, m]): - thresholdss = [] - for i in range(len(gt_annos)): - tp, fp, fn, similarity, thresholds = compute_statistics_jit( - overlaps[i], - gt_datas_list[i], - dt_datas_list[i], - ignored_gts[i], - ignored_dets[i], - dontcares[i], - metric, - min_overlap=min_overlap, - thresh=0.0, - compute_fp=False, - ) - thresholdss += thresholds.tolist() - thresholdss = np.array(thresholdss) - thresholds = get_thresholds(thresholdss, total_num_valid_gt) - thresholds = np.array(thresholds) - pr = np.zeros([len(thresholds), 4]) - idx = 0 - for j, num_part in enumerate(split_parts): - gt_datas_part = np.concatenate(gt_datas_list[idx : idx + num_part], 0) - dt_datas_part = np.concatenate(dt_datas_list[idx : idx + num_part], 0) - dc_datas_part = np.concatenate(dontcares[idx : idx + num_part], 0) - ignored_dets_part = np.concatenate(ignored_dets[idx : idx + num_part], 0) - ignored_gts_part = np.concatenate(ignored_gts[idx : idx + num_part], 0) - fused_compute_statistics( - 
parted_overlaps[j], - pr, - total_gt_num[idx : idx + num_part], - total_dt_num[idx : idx + num_part], - total_dc_num[idx : idx + num_part], - gt_datas_part, - dt_datas_part, - dc_datas_part, - ignored_gts_part, - ignored_dets_part, - metric, - min_overlap=min_overlap, - thresholds=thresholds, - ) - idx += num_part - for i in range(len(thresholds)): - recall[m, d, k, i] = pr[i, 0] / (pr[i, 0] + pr[i, 2]) - precision[m, d, k, i] = pr[i, 0] / (pr[i, 0] + pr[i, 1]) - - for i in range(len(thresholds)): - precision[m, d, k, i] = np.max(precision[m, d, k, i:], axis=-1) - recall[m, d, k, i] = np.max(recall[m, d, k, i:], axis=-1) + ( + gt_datas_list, + dt_datas_list, + ignored_gts, + ignored_dets, + total_num_valid_gt, + ) = _prepare_data(gt_annos, dt_annos, current_class) + for k, min_overlap in enumerate(min_overlaps[:, m]): + thresholdss = [] + for i in range(len(gt_annos)): + tp, fp, fn, similarity, thresholds = compute_statistics_jit( + overlaps[i], + gt_datas_list[i], + dt_datas_list[i], + ignored_gts[i], + ignored_dets[i], + min_overlap=min_overlap, + thresh=0.0, + compute_fp=False, + ) + thresholdss += thresholds.tolist() + if not thresholdss: + continue # no tp -> 0 precision and recall + # create thresholds between 0 and the max threshold, len(thresholds) == num_samples_pts + thresholds = np.linspace(0.0, np.max(thresholdss), num_samples_pts) + pr = np.zeros([len(thresholds), 4]) + idx = 0 + for j, num_part in enumerate(split_parts): + gt_datas_part = np.concatenate(gt_datas_list[idx : idx + num_part], 0) + dt_datas_part = np.concatenate(dt_datas_list[idx : idx + num_part], 0) + ignored_dets_part = np.concatenate(ignored_dets[idx : idx + num_part], 0) + ignored_gts_part = np.concatenate(ignored_gts[idx : idx + num_part], 0) + fused_compute_statistics( + parted_overlaps[j], + pr, + total_gt_num[idx : idx + num_part], + total_dt_num[idx : idx + num_part], + gt_datas_part, + dt_datas_part, + ignored_gts_part, + ignored_dets_part, + min_overlap=min_overlap, + 
thresholds=thresholds, + ) + idx += num_part + + for i in range(len(thresholds)): + recall[m, k, i] = pr[i, 0] / (pr[i, 0] + pr[i, 2]) + precision[m, k, i] = pr[i, 0] / (pr[i, 0] + pr[i, 1]) return { "recall": recall, "precision": precision, - "orientation": aos, } @@ -678,7 +539,7 @@ def do_eval_cut_version( dt_annos: list[dict[str, Any]], current_classes: list[str], min_overlaps: np.ndarray, -) -> tuple[np.ndarray, np.ndarray]: +) -> np.ndarray: """Evaluates detections with COCO style AP. Args: @@ -688,34 +549,19 @@ def do_eval_cut_version( min_overlaps (np.ndarray): Overlap ranges. Returns: - Tuple[float, float]: Bounding box and 3D bounding box AP. + np.ndarray: 3D bounding box AP. """ - - def _get_map(prec: np.ndarray) -> np.ndarray: - sums = 0 - for i in range(0, prec.shape[-1], 4): - sums = sums + prec[..., i] - return sums / 11 * 100 - - # min_overlaps: [num_minoverlap, metric, num_class] - difficultys = [0, 1, 2] - ret = eval_class(gt_annos, dt_annos, current_classes, difficultys, 0, min_overlaps) - # ret: [num_class, num_diff, num_minoverlap, num_sample_points] - # get 2d bbox map - map_bbox = _get_map(ret["precision"]) - - # get 3d bbox map - ret = eval_class(gt_annos, dt_annos, current_classes, difficultys, 2, min_overlaps) - map_3d = _get_map(ret["precision"]) - - return map_bbox, map_3d + # min_overlaps: [num_minoverlap, num_class] + # get 3D bbox mAP + ret = eval_class(gt_annos, dt_annos, current_classes, min_overlaps) + return np.mean(ret["precision"], axis=2) def get_coco_eval_result( gt_annos: list[dict], dt_annos: list[dict], current_classes: list[str], -) -> tuple[np.ndarray, np.ndarray]: +) -> np.ndarray: """Evaluates detections with COCO style AP. Args: @@ -724,7 +570,7 @@ def get_coco_eval_result( current_classes (list[str]): Classes to evaluate. Returns: - Tuple[np.ndarray, np.ndarray]: Bounding box and 3D bounding box AP. + np.ndarray: 3D bounding box AP. 
""" def do_coco_style_eval( @@ -732,7 +578,7 @@ def do_coco_style_eval( dt_annos: list[dict], current_classes: list[str], overlap_ranges: np.ndarray, - ) -> tuple[np.ndarray, np.ndarray]: + ) -> np.ndarray: """Evaluates detections with COCO style AP. Args: @@ -742,39 +588,33 @@ def do_coco_style_eval( overlap_ranges (np.ndarray): Overlap ranges. Returns: - Tuple[np.ndarray, np.ndarray]: Bounding box and 3D bounding box AP. + np.ndarray: 3D bounding box AP. """ min_overlaps = np.zeros([10, *overlap_ranges.shape[1:]]) for i in range(overlap_ranges.shape[1]): - for j in range(overlap_ranges.shape[2]): - min_overlaps[:, i, j] = np.linspace(*overlap_ranges[:, i, j][:2], 10) + min_overlaps[:, i] = np.linspace(*overlap_ranges[:, i], 10) + + map_3d = do_eval_cut_version(gt_annos, dt_annos, current_classes, min_overlaps) - map_bbox, map_3d = do_eval_cut_version(gt_annos, dt_annos, current_classes, min_overlaps) + result_str = "" - return map_bbox.mean(-1), map_3d.mean(-1) + for i, lbl in enumerate(current_classes): + result_str += f"\nclass: {lbl}\n" + "-" * len(f"class: {lbl}") + "\n" + for j, overlap in enumerate(min_overlaps): + result_str += f"AP@IoU={np.round(overlap[i],2)}: {np.round(map_3d[i][j] * 100, 2)}\n" + result_str += "\n" + logging.log(msg=result_str, level=logging.INFO) - iou_range = [0.5, 0.95, 10] + return map_3d.mean(0) + + iou_range = [0.5, 0.95] if not isinstance(current_classes, (list, tuple)): current_classes = [current_classes] - overlap_ranges = np.zeros([3, 3, len(current_classes)]) + overlap_ranges = np.zeros([2, len(current_classes)]) for i in range(len(current_classes)): # iou from 0.5 to 0.95 - overlap_ranges[:, :, i] = np.array(iou_range)[:, np.newaxis] - result = "" - # check whether alpha is valid - map_bbox, map_3d = do_coco_style_eval(gt_annos, dt_annos, current_classes, overlap_ranges) - - for j, curcls in enumerate(current_classes): - # map threshold array: [num_minoverlap, metric, class] - # map result: [num_class, num_diff, 
num_minoverlap] - o_range = np.array(iou_range)[[0, 2, 1]] - o_range[1] = (o_range[2] - o_range[0]) / (o_range[1] - 1) - result += f"{curcls} " + "coco AP@{:.2f}:{:.2f}:{:.2f}:\n".format(*o_range) - result += f"bbox AP:{map_bbox[j, 0]:.2f}, {map_bbox[j, 1]:.2f}, {map_bbox[j, 2]:.2f}\n" - result += f"3d AP:{map_3d[j, 0]:.2f}, {map_3d[j, 1]:.2f}, {map_3d[j, 2]:.2f}\n" - - print("\n COCO style evaluation results: \n", result) - - return map_bbox, map_3d + overlap_ranges[:, i] = np.array(iou_range) + + return do_coco_style_eval(gt_annos, dt_annos, current_classes, overlap_ranges) diff --git a/src/otx/core/model/detection_3d.py b/src/otx/core/model/detection_3d.py index fce245e79ac..9c6cacda6c5 100644 --- a/src/otx/core/model/detection_3d.py +++ b/src/otx/core/model/detection_3d.py @@ -80,6 +80,73 @@ def _export_parameters(self) -> TaskLevelExportParameters: task_type="3d_detection", ) + def _customize_inputs( + self, + entity: Det3DBatchDataEntity, + ) -> dict[str, Any]: + # prepare bboxes for the model + targets_list = [] + img_sizes = torch.from_numpy(np.array([img_info.ori_shape for img_info in entity.imgs_info])).to( + device=entity.images.device, + ) + key_list = ["labels", "boxes", "depth", "size_3d", "heading_angle", "boxes_3d"] + for bz in range(len(entity.imgs_info)): + target_dict = {} + for key in key_list: + target_dict[key] = getattr(entity, key)[bz] + targets_list.append(target_dict) + + return { + "images": entity.images, + "calibs": torch.cat([p2.unsqueeze(0) for p2 in entity.calib_matrix], dim=0), + "targets": targets_list, + "img_sizes": img_sizes, + "mode": "loss" if self.training else "predict", + } + + def _customize_outputs( + self, + outputs: dict[str, torch.Tensor], + inputs: Det3DBatchDataEntity, + ) -> Det3DBatchPredEntity | OTXBatchLossEntity: + if self.training: + if not isinstance(outputs, dict): + raise TypeError(outputs) + + losses = OTXBatchLossEntity() + for k, v in outputs.items(): + if isinstance(v, list): + losses[k] = sum(v) + elif 
isinstance(v, torch.Tensor): + losses[k] = v + else: + msg = "Loss output should be list or torch.tensor but got {type(v)}" + raise TypeError(msg) + return losses + + labels, scores, size_3d, heading_angle, boxes_3d, depth = self.extract_dets_from_outputs(outputs) + # bbox 2d decoding + boxes_2d = box_cxcylrtb_to_xyxy(boxes_3d) + xywh_2d = box_convert(boxes_2d, "xyxy", "cxcywh") + # size 2d decoding + size_2d = xywh_2d[:, :, 2:4] + + return Det3DBatchPredEntity( + batch_size=inputs.batch_size, + images=inputs.images, + imgs_info=inputs.imgs_info, + calib_matrix=inputs.calib_matrix, + boxes=boxes_2d, + labels=labels, + boxes_3d=boxes_3d, + size_2d=size_2d, + size_3d=size_3d, + depth=depth, + heading_angle=heading_angle, + scores=scores, + original_kitti_format=[None], + ) + def _convert_pred_entity_to_compute_metric( self, preds: Det3DBatchPredEntity, diff --git a/src/otx/recipe/_base_/data/object_detection_3d.yaml b/src/otx/recipe/_base_/data/object_detection_3d.yaml index a7c773f1bcf..708c73b6750 100644 --- a/src/otx/recipe/_base_/data/object_detection_3d.yaml +++ b/src/otx/recipe/_base_/data/object_detection_3d.yaml @@ -12,7 +12,7 @@ train_subset: subset_name: train transform_lib_type: TORCHVISION batch_size: 8 - num_workers: 4 + num_workers: 2 to_tv_image: false transforms: - class_path: torchvision.transforms.v2.Normalize @@ -27,7 +27,7 @@ val_subset: subset_name: val transform_lib_type: TORCHVISION batch_size: 16 - num_workers: 4 + num_workers: 2 to_tv_image: false transforms: - class_path: torchvision.transforms.v2.Normalize @@ -41,7 +41,7 @@ test_subset: subset_name: test transform_lib_type: TORCHVISION batch_size: 16 - num_workers: 4 + num_workers: 2 to_tv_image: false transforms: - class_path: torchvision.transforms.v2.Normalize diff --git a/src/otx/recipe/object_detection_3d/monodetr3d.yaml b/src/otx/recipe/object_detection_3d/monodetr3d.yaml index 032c71ffbf8..ec5aaa005eb 100644 --- a/src/otx/recipe/object_detection_3d/monodetr3d.yaml +++ 
b/src/otx/recipe/object_detection_3d/monodetr3d.yaml @@ -20,13 +20,13 @@ model: mode: max factor: 0.1 patience: 13 - monitor: val/mAP_bbox_2d + monitor: val/AP_2d@0.5 engine: task: OBJECT_DETECTION_3D device: auto -callback_monitor: val/mAP_bbox_3d +callback_monitor: val/AP_3d@0.5 data: ../_base_/data/object_detection_3d.yaml diff --git a/tests/integration/cli/test_export_inference.py b/tests/integration/cli/test_export_inference.py index 2e556210165..1d455616c4f 100644 --- a/tests/integration/cli/test_export_inference.py +++ b/tests/integration/cli/test_export_inference.py @@ -49,7 +49,7 @@ def fxt_local_seed() -> int: "zero_shot_visual_prompting": "test/f1-score", "action_classification": "test/accuracy", "keypoint_detection": "test/PCK", - "object_detection_3d": "test/mAP_bbox_3d", + "object_detection_3d": "test/AP_3d@0.5", } diff --git a/tests/perf/test_object_detection_3d.py b/tests/perf/test_object_detection_3d.py index 2cf247843c9..2fae45c8221 100644 --- a/tests/perf/test_object_detection_3d.py +++ b/tests/perf/test_object_detection_3d.py @@ -40,14 +40,14 @@ class TestPerfObjectDetection3D(PerfTestBase): BENCHMARK_CRITERIA = [ # noqa: RUF012 Benchmark.Criterion(name="train/epoch", summary="max", compare="<", margin=0.1), Benchmark.Criterion(name="train/e2e_time", summary="max", compare="<", margin=0.1), - Benchmark.Criterion(name="val/mAP_bbox_3d", summary="max", compare=">", margin=0.05), - Benchmark.Criterion(name="val/mAP_bbox_2d", summary="max", compare=">", margin=0.1), - Benchmark.Criterion(name="test/mAP_bbox_3d", summary="max", compare=">", margin=0.05), - Benchmark.Criterion(name="test/mAP_bbox_2d", summary="max", compare=">", margin=0.1), - Benchmark.Criterion(name="export/mAP_bbox_3d", summary="max", compare=">", margin=0.05), - Benchmark.Criterion(name="export/mAP_bbox_2d", summary="max", compare=">", margin=0.1), - Benchmark.Criterion(name="optimize/mAP_bbox_3d", summary="max", compare=">", margin=0.05), - 
Benchmark.Criterion(name="optimize/mAP_bbox_2d", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="val/AP_3d@0.5", summary="max", compare=">", margin=0.05), + Benchmark.Criterion(name="val/AP_2d@0.5", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="test/AP_3d@0.5", summary="max", compare=">", margin=0.05), + Benchmark.Criterion(name="test/AP_2d@0.5", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="export/AP_3d@0.5", summary="max", compare=">", margin=0.05), + Benchmark.Criterion(name="export/AP_2d@0.5", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="optimize/AP_3d@0.5", summary="max", compare=">", margin=0.05), + Benchmark.Criterion(name="optimize/AP_2d@0.5", summary="max", compare=">", margin=0.1), Benchmark.Criterion(name="train/iter_time", summary="mean", compare="<", margin=0.1), Benchmark.Criterion(name="test/iter_time", summary="mean", compare="<", margin=0.1), Benchmark.Criterion(name="export/iter_time", summary="mean", compare="<", margin=0.1),