Make mAP 3D more general (#4031)

* added coco metric * fix linter * added ap_05 * fix perf test * small fix * fix some misc comments from previous PRs * reply comments
openvinotoolkit · Oct 18, 2024 · e92cdfd · e92cdfd
1 parent fe32690
commit e92cdfd
Show file tree

Hide file tree

Showing 16 changed files with 403 additions and 459 deletions.
diff --git a/src/otx/algo/detection/heads/rtdetr_decoder.py b/src/otx/algo/detection/heads/rtdetr_decoder.py
@@ -236,8 +236,7 @@ def forward(
         value = self.value_proj(value)
         if value_mask is not None:
             value = value.masked_fill(value_mask[..., None], float(0))
-            # value_mask = value_mask.astype(value.dtype).unsqueeze(-1)
-            # value3 = value * value_mask.unsqueeze(-1)
+
         value = value.reshape(bs, len_v, self.num_heads, self.head_dim)
 
         sampling_offsets = self.sampling_offsets(query).reshape(

diff --git a/src/otx/algo/object_detection_3d/backbones/monodetr_resnet.py b/src/otx/algo/object_detection_3d/backbones/monodetr_resnet.py
@@ -33,7 +33,7 @@ def __init__(
             num_pos_feats (int): Number of positional features.
             temperature (int): Temperature scaling factor.
             normalize (bool): Flag indicating whether to normalize the position embeddings.
-            scale (Optional[float]): Scaling factor for the position embeddings. If None, default value is used.
+            scale (float | None): Scaling factor for the position embeddings. If None, default value is used.
         """
         super().__init__()
         self.num_pos_feats = num_pos_feats
@@ -132,7 +132,7 @@ def __init__(
 
         Args:
             backbone (nn.Module): The backbone module.
-            position_embedding (Union[PositionEmbeddingSine]): The position embedding module.
+            position_embedding (PositionEmbeddingSine): The position embedding module.
         """
         super().__init__(backbone, position_embedding)
         self.strides = backbone.strides

diff --git a/src/otx/algo/object_detection_3d/detectors/monodetr.py b/src/otx/algo/object_detection_3d/detectors/monodetr.py
@@ -149,12 +149,17 @@ def forward(
         """Forward method of the MonoDETR model.
 
         Args:
-            images (list[Tensor]): images for each sample
-            calibs (Tensor): camera matrices for each sample
-            img_sizes (Tensor): image sizes for each sample
-            targets (list[dict[Tensor]): ground truth boxes and labels for each
-                sample
+            images (Tensor): images for each sample.
+            calibs (Tensor): camera matrices for each sample.
+            img_sizes (Tensor): image sizes for each sample.
+            targets (list[dict[str, Tensor]]): ground truth boxes and labels for each
+                sample. Defaults to None.
             mode (str): The mode of operation. Defaults to "predict".
+
+        Returns:
+            dict[str, Tensor]: A dictionary of tensors. If mode is "loss", the
+                tensors are the loss values. If mode is "predict", the tensors are
+                the logits.
         """
         features, pos = self.backbone(images)
 

diff --git a/src/otx/algo/object_detection_3d/heads/depth_predictor.py b/src/otx/algo/object_detection_3d/heads/depth_predictor.py
@@ -32,6 +32,8 @@ def __init__(
             depth_min (float): The minimum depth value.
             depth_max (float): The maximum depth value.
             hidden_dim (int): The dimension of the hidden layer.
+            activation (Callable[..., nn.Module], optional): The activation function.
+                Defaults to nn.ReLU.
         """
         super().__init__()
         self.depth_max = depth_max

diff --git a/src/otx/algo/object_detection_3d/losses/ddn_loss.py b/src/otx/algo/object_detection_3d/losses/ddn_loss.py
@@ -22,13 +22,13 @@ def compute_fg_mask(
     """Compute foreground mask for images.
 
     Args:
-        gt_boxes2d [torch.Tensor(B, N, 4)]: 2D box labels
-        shape [Tuple[int, int]]: Foreground mask desired shape
-        downsample_factor [int]: Downsample factor for image
-        device [torch.device]: Foreground mask desired device
+        gt_boxes2d (torch.Tensor): 2D box labels.
+        shape (Tuple[int, int]): Foreground mask desired shape.
+        downsample_factor (int): Downsample factor for image.
+        device (torch.device): Foreground mask desired device.
 
     Returns:
-        fg_mask [torch.Tensor(shape)]: Foreground mask
+        fg_mask (torch.Tensor): Foreground mask of the requested shape.
     """
     if device is None:
         device = torch.device("cpu")
@@ -58,9 +58,9 @@ def __init__(self, fg_weight: float, bg_weight: float, downsample_factor: int =
         """Initialize fixed foreground/background loss balancer.
 
         Args:
-            fg_weight [float]: Foreground loss weight
-            bg_weight [float]: Background loss weight
-            downsample_factor [int]: Depth map downsample factor
+            fg_weight (float): Foreground loss weight.
+            bg_weight (float): Background loss weight.
+            downsample_factor (int): Depth map downsample factor.
         """
         super().__init__()
         self.fg_weight = fg_weight
@@ -76,12 +76,11 @@ def forward(
         """Forward pass.
 
         Args:
-            loss [torch.Tensor(B, H, W)]: Pixel-wise loss
-            gt_boxes2d [torch.Tensor (B, N, 4)]: 2D box labels for foreground/background balancing
+            loss (torch.Tensor): Pixel-wise loss.
+            gt_boxes2d (torch.Tensor): 2D box labels for foreground/background balancing.
 
         Returns:
-            loss [torch.Tensor(1)]: Total loss after foreground/background balancing
-            tb_dict [dict[float]]: All losses to log in tensorboard
+            loss (torch.Tensor): Total loss after foreground/background balancing.
         """
         # Compute masks
         fg_mask = compute_fg_mask(
@@ -120,13 +119,11 @@ def __init__(
         """Initializes DDNLoss module.
 
         Args:
-            weight [float]: Loss function weight
-            alpha [float]: Alpha value for Focal Loss
-            gamma [float]: Gamma value for Focal Loss
-            disc_cfg [dict]: Depth discretiziation configuration
-            fg_weight [float]: Foreground loss weight
-            bg_weight [float]: Background loss weight
-            downsample_factor [int]: Depth map downsample factor
+            alpha (float): Alpha value for Focal Loss.
+            gamma (float): Gamma value for Focal Loss.
+            fg_weight (float): Foreground loss weight.
+            bg_weight (float): Background loss weight.
+            downsample_factor (int): Depth map downsample factor.
         """
         super().__init__()
         self.balancer = Balancer(downsample_factor=downsample_factor, fg_weight=fg_weight, bg_weight=bg_weight)
@@ -146,10 +143,10 @@ def build_target_depth_from_3dcenter(
         """Builds target depth map from 3D center depth.
 
         Args:
-            depth_logits: torch.Tensor(B, D+1, H, W)]: Predicted depth logits
-            gt_boxes2d [torch.Tensor (B, N, 4)]: 2D box labels for foreground/background balancing
-            gt_center_depth [torch.Tensor(B, N)]: 3D center depth
-            num_gt_per_img: [int]: Number of ground truth boxes per image
+            depth_logits (torch.Tensor): Predicted depth logits.
+            gt_boxes2d (torch.Tensor): 2D box labels for foreground/background balancing.
+            gt_center_depth (torch.Tensor): 3D center depth.
+            num_gt_per_img (int): Number of ground truth boxes per image.
         """
         b, _, h, w = depth_logits.shape
         depth_maps = torch.zeros((b, h, w), device=depth_logits.device, dtype=depth_logits.dtype)
@@ -185,18 +182,18 @@ def bin_depths(
         """Converts depth map into bin indices.
 
         Args:
-            depth_map [torch.Tensor(H, W)]: Depth Map
-            mode [string]: Discretiziation mode (See https://arxiv.org/pdf/2005.13423.pdf for more details)
-                UD: Uniform discretiziation
-                LID: Linear increasing discretiziation
-                SID: Spacing increasing discretiziation
-            depth_min [float]: Minimum depth value
-            depth_max [float]: Maximum depth value
-            num_bins [int]: Number of depth bins
-            target [bool]: Whether the depth bins indices will be used for a target tensor in loss comparison
+            depth_map (torch.Tensor): Depth Map.
+            mode (str): Discretization mode (See https://arxiv.org/pdf/2005.13423.pdf for more details).
+                UD: Uniform discretization.
+                LID: Linear increasing discretization.
+                SID: Spacing increasing discretization.
+            depth_min (float): Minimum depth value.
+            depth_max (float): Maximum depth value.
+            num_bins (int): Number of depth bins.
+            target (bool): Whether the depth bins indices will be used for a target tensor in loss comparison.
 
         Returns:
-            indices [torch.Tensor(H, W)]: Depth bin indices
+            indices (torch.Tensor): Depth bin indices.
         """
         if mode == "UD":
             bin_size = (depth_max - depth_min) / num_bins
@@ -233,13 +230,13 @@ def forward(
         """Gets depth_map loss.
 
         Args:
-            depth_logits: torch.Tensor(B, D+1, H, W)]: Predicted depth logits
-            gt_boxes2d [torch.Tensor (B, N, 4)]: 2D box labels for foreground/background balancing
-            num_gt_per_img: [int]: Number of ground truth boxes per image
-            gt_center_depth: [torch.Tensor(B, N)]: 3D center depth
+            depth_logits (torch.Tensor): Predicted depth logits.
+            gt_boxes2d (torch.Tensor): 2D box labels for foreground/background balancing.
+            num_gt_per_img (int): Number of ground truth boxes per image.
+            gt_center_depth (torch.Tensor): 3D center depth.
 
         Returns:
-            loss [torch.Tensor(1)]: Depth classification network loss
+            loss (torch.Tensor): Depth classification network loss.
         """
         # Bin depth map to create target
         depth_maps = self.build_target_depth_from_3dcenter(depth_logits, gt_boxes2d, gt_center_depth, num_gt_per_img)

diff --git a/src/otx/algo/object_detection_3d/losses/monodetr_loss.py b/src/otx/algo/object_detection_3d/losses/monodetr_loss.py
@@ -29,11 +29,10 @@ def __init__(self, num_classes: int, weight_dict: dict, focal_alpha: float, grou
         """MonoDETRCriterion.
 
         Args:
-            num_classes: number of object categories, omitting the special no-object category
-            matcher: module able to compute a matching between targets and proposals
-            weight_dict: dict containing as key the names of the losses and as values their relative weight.
-            focal_alpha: alpha in Focal Loss
-            group_num: number of groups for data parallelism
+            num_classes (int): number of object categories, omitting the special no-object category.
+            weight_dict (dict): dict containing as key the names of the losses and as values their relative weight.
+            focal_alpha (float): alpha in Focal Loss.
+            group_num (int): number of groups for data parallelism.
         """
         super().__init__()
         self.num_classes = num_classes
@@ -47,7 +46,15 @@ def __init__(self, num_classes: int, weight_dict: dict, focal_alpha: float, grou
         self.group_num = group_num
 
     def loss_labels(self, outputs: dict, targets: list, indices: list, num_boxes: int) -> dict[str, Tensor]:
-        """Classification loss."""
+        """Classification loss.
+
+        Args:
+            outputs (dict): dict of tensors, see the output specification of the model for the format.
+            targets (list): list of dicts, such that len(targets) == batch_size.
+                   The expected keys in each dict depends on the losses applied, see each loss' doc.
+            indices (list): list of tuples, such that len(indices) == batch_size.
+            num_boxes (int): number of boxes in the batch.
+        """
         src_logits = outputs["scores"]
 
         idx = self._get_src_permutation_idx(indices)
@@ -76,7 +83,15 @@ def loss_labels(self, outputs: dict, targets: list, indices: list, num_boxes: in
         return {"loss_ce": loss_ce}
 
     def loss_3dcenter(self, outputs: dict, targets: list, indices: list, num_boxes: int) -> dict[str, Tensor]:
-        """Compute the loss for the 3D center prediction."""
+        """Compute the loss for the 3D center prediction.
+
+        Args:
+            outputs (dict): dict of tensors, see the output specification of the model for the format.
+            targets (list): list of dicts, such that len(targets) == batch_size.
+                   The expected keys in each dict depends on the losses applied, see each loss' doc.
+            indices (list): list of tuples, such that len(indices) == batch_size.
+            num_boxes (int): number of boxes in the batch.
+        """
         idx = self._get_src_permutation_idx(indices)
         src_3dcenter = outputs["boxes_3d"][:, :, 0:2][idx]
         target_3dcenter = torch.cat([t["boxes_3d"][:, 0:2][i] for t, (_, i) in zip(targets, indices)], dim=0)
@@ -85,7 +100,15 @@ def loss_3dcenter(self, outputs: dict, targets: list, indices: list, num_boxes:
         return {"loss_center": loss_3dcenter.sum() / num_boxes}
 
     def loss_boxes(self, outputs: dict, targets: list, indices: list, num_boxes: int) -> dict[str, Tensor]:
-        """Compute l1 loss."""
+        """Compute l1 loss.
+
+        Args:
+            outputs (dict): dict of tensors, see the output specification of the model for the format.
+            targets (list): list of dicts, such that len(targets) == batch_size.
+                   The expected keys in each dict depends on the losses applied, see each loss' doc.
+            indices (list): list of tuples, such that len(indices) == batch_size.
+            num_boxes (int): number of boxes in the batch.
+        """
         idx = self._get_src_permutation_idx(indices)
         src_2dboxes = outputs["boxes_3d"][:, :, 2:6][idx]
         target_2dboxes = torch.cat([t["boxes_3d"][:, 2:6][i] for t, (_, i) in zip(targets, indices)], dim=0)
@@ -95,7 +118,15 @@ def loss_boxes(self, outputs: dict, targets: list, indices: list, num_boxes: int
         return {"loss_bbox": loss_bbox.sum() / num_boxes}
 
     def loss_giou(self, outputs: dict, targets: list, indices: list, num_boxes: int) -> dict[str, Tensor]:
-        """Compute the GIoU loss."""
+        """Compute the GIoU loss.
+
+        Args:
+            outputs (dict): dict of tensors, see the output specification of the model for the format.
+            targets (list): list of dicts, such that len(targets) == batch_size.
+                   The expected keys in each dict depends on the losses applied, see each loss' doc.
+            indices (list): list of tuples, such that len(indices) == batch_size.
+            num_boxes (int): number of boxes in the batch.
+        """
         # giou
         idx = self._get_src_permutation_idx(indices)
         src_boxes = outputs["boxes_3d"][idx]
@@ -104,7 +135,15 @@ def loss_giou(self, outputs: dict, targets: list, indices: list, num_boxes: int)
         return {"loss_giou": loss_giou}
 
     def loss_depths(self, outputs: dict, targets: list, indices: list, num_boxes: int) -> dict[str, Tensor]:
-        """Compute the loss for the depth prediction."""
+        """Compute the loss for the depth prediction.
+
+        Args:
+            outputs (dict): dict of tensors, see the output specification of the model for the format.
+            targets (list): list of dicts, such that len(targets) == batch_size.
+                   The expected keys in each dict depends on the losses applied, see each loss' doc.
+            indices (list): list of tuples, such that len(indices) == batch_size.
+            num_boxes (int): number of boxes in the batch.
+        """
         idx = self._get_src_permutation_idx(indices)
 
         src_depths = outputs["depth"][idx]
@@ -117,7 +156,15 @@ def loss_depths(self, outputs: dict, targets: list, indices: list, num_boxes: in
         return {"loss_depth": depth_loss.sum() / num_boxes}
 
     def loss_dims(self, outputs: dict, targets: list, indices: list, num_boxes: int) -> dict[str, Tensor]:
-        """Compute the loss for the dimension prediction."""
+        """Compute the loss for the dimension prediction.
+
+        Args:
+            outputs (dict): dict of tensors, see the output specification of the model for the format.
+            targets (list): list of dicts, such that len(targets) == batch_size.
+                   The expected keys in each dict depends on the losses applied, see each loss' doc.
+            indices (list): list of tuples, such that len(indices) == batch_size.
+            num_boxes (int): number of boxes in the batch.
+        """
         idx = self._get_src_permutation_idx(indices)
         src_dims = outputs["size_3d"][idx]
         target_dims = torch.cat([t["size_3d"][i] for t, (_, i) in zip(targets, indices)], dim=0)
@@ -131,7 +178,15 @@ def loss_dims(self, outputs: dict, targets: list, indices: list, num_boxes: int)
         return {"loss_dim": dim_loss.sum() / num_boxes}
 
     def loss_angles(self, outputs: dict, targets: list, indices: list, num_boxes: int) -> dict[str, Tensor]:
-        """Compute the loss for the angle prediction."""
+        """Compute the loss for the angle prediction.
+
+        Args:
+            outputs (dict): dict of tensors, see the output specification of the model for the format.
+            targets (list): list of dicts, such that len(targets) == batch_size.
+                   The expected keys in each dict depends on the losses applied, see each loss' doc.
+            indices (list): list of tuples, such that len(indices) == batch_size.
+            num_boxes (int): number of boxes in the batch.
+        """
         idx = self._get_src_permutation_idx(indices)
         heading_input = outputs["heading_angle"][idx]
         target_heading_angle = torch.cat([t["heading_angle"][i] for t, (_, i) in zip(targets, indices)], dim=0)
@@ -158,7 +213,15 @@ def loss_angles(self, outputs: dict, targets: list, indices: list, num_boxes: in
         return {"loss_angle": angle_loss.sum() / num_boxes}
 
     def loss_depth_map(self, outputs: dict, targets: list, indices: list, num_boxes: int) -> dict[str, Tensor]:
-        """Depth map loss."""
+        """Depth map loss.
+
+        Args:
+            outputs (dict): dict of tensors, see the output specification of the model for the format.
+            targets (list): list of dicts, such that len(targets) == batch_size.
+                   The expected keys in each dict depends on the losses applied, see each loss' doc.
+            indices (list): list of tuples, such that len(indices) == batch_size.
+            num_boxes (int): number of boxes in the batch.
+        """
         depth_map_logits = outputs["pred_depth_map_logits"]
 
         num_gt_per_img = [len(t["boxes"]) for t in targets]
@@ -174,6 +237,7 @@ def _get_src_permutation_idx(
         self,
         indices: list[tuple[torch.Tensor, torch.Tensor]],
     ) -> tuple[torch.Tensor, torch.Tensor]:
+        """Get the batch and source indices used to permute predictions following the matched indices."""
         # permute predictions following indices
         batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)])
         src_idx = torch.cat([src for (src, _) in indices])
@@ -183,6 +247,7 @@ def _get_tgt_permutation_idx(
         self,
         indices: list[tuple[torch.Tensor, torch.Tensor]],
     ) -> tuple[torch.Tensor, torch.Tensor]:
+        """Get the batch and target indices used to permute targets following the matched indices."""
         # permute targets following indices
         batch_idx = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)])
         tgt_idx = torch.cat([tgt for (_, tgt) in indices])
@@ -210,9 +275,9 @@ def forward(
         """This performs the loss computation.
 
         Args:
-             outputs: dict of tensors, see the output specification of the model for the format
-             targets: list of dicts, such that len(targets) == batch_size.
-                      The expected keys in each dict depends on the losses applied, see each loss' doc
+             outputs (dict): dict of tensors, see the output specification of the model for the format.
+             targets (list): list of dicts, such that len(targets) == batch_size.
+                      The expected keys in each dict depends on the losses applied, see each loss' doc.
         """
         outputs_without_aux = {k: v for k, v in outputs.items() if k != "aux_outputs"}
         group_num = self.group_num if self.training else 1