Skip to content

Commit

Permalink
Make mAP 3D more general (#4031)
Browse files Browse the repository at this point in the history
* added coco metric

* fix linter

* added ap_05

* fix perf test

* small fix

* fix some misc comments from previous PRs

* reply comments
  • Loading branch information
kprokofi authored Oct 18, 2024
1 parent fe32690 commit e92cdfd
Show file tree
Hide file tree
Showing 16 changed files with 403 additions and 459 deletions.
3 changes: 1 addition & 2 deletions src/otx/algo/detection/heads/rtdetr_decoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,8 +236,7 @@ def forward(
value = self.value_proj(value)
if value_mask is not None:
value = value.masked_fill(value_mask[..., None], float(0))
# value_mask = value_mask.astype(value.dtype).unsqueeze(-1)
# value3 = value * value_mask.unsqueeze(-1)

value = value.reshape(bs, len_v, self.num_heads, self.head_dim)

sampling_offsets = self.sampling_offsets(query).reshape(
Expand Down
4 changes: 2 additions & 2 deletions src/otx/algo/object_detection_3d/backbones/monodetr_resnet.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def __init__(
num_pos_feats (int): Number of positional features.
temperature (int): Temperature scaling factor.
normalize (bool): Flag indicating whether to normalize the position embeddings.
scale (Optional[float]): Scaling factor for the position embeddings. If None, default value is used.
scale (float | None): Scaling factor for the position embeddings. If None, default value is used.
"""
super().__init__()
self.num_pos_feats = num_pos_feats
Expand Down Expand Up @@ -132,7 +132,7 @@ def __init__(
Args:
backbone (nn.Module): The backbone module.
position_embedding (Union[PositionEmbeddingSine]): The position embedding module.
position_embedding (PositionEmbeddingSine): The position embedding module.
"""
super().__init__(backbone, position_embedding)
self.strides = backbone.strides
Expand Down
15 changes: 10 additions & 5 deletions src/otx/algo/object_detection_3d/detectors/monodetr.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,12 +149,17 @@ def forward(
"""Forward method of the MonoDETR model.
Args:
images (list[Tensor]): images for each sample
calibs (Tensor): camera matrices for each sample
img_sizes (Tensor): image sizes for each sample
targets (list[dict[Tensor]): ground truth boxes and labels for each
sample
images (Tensor): images for each sample.
calibs (Tensor): camera matrices for each sample.
img_sizes (Tensor): image sizes for each sample.
targets (list[dict[str, Tensor]] | None): ground truth boxes and labels for each
sample. Defaults to None.
mode (str): The mode of operation. Defaults to "predict".
Returns:
dict[str, Tensor]: A dictionary of tensors. If mode is "loss", the
tensors are the loss values. If mode is "predict", the tensors are
the logits.
"""
features, pos = self.backbone(images)

Expand Down
2 changes: 2 additions & 0 deletions src/otx/algo/object_detection_3d/heads/depth_predictor.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ def __init__(
depth_min (float): The minimum depth value.
depth_max (float): The maximum depth value.
hidden_dim (int): The dimension of the hidden layer.
activation (Callable[..., nn.Module], optional): The activation function.
Defaults to nn.ReLU.
"""
super().__init__()
self.depth_max = depth_max
Expand Down
73 changes: 35 additions & 38 deletions src/otx/algo/object_detection_3d/losses/ddn_loss.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,13 @@ def compute_fg_mask(
"""Compute foreground mask for images.
Args:
gt_boxes2d [torch.Tensor(B, N, 4)]: 2D box labels
shape [Tuple[int, int]]: Foreground mask desired shape
downsample_factor [int]: Downsample factor for image
device [torch.device]: Foreground mask desired device
gt_boxes2d (torch.Tensor): 2D box labels.
shape (Tuple[int, int]): Foreground mask desired shape.
downsample_factor (int): Downsample factor for image.
device (torch.device): Foreground mask desired device.
Returns:
fg_mask [torch.Tensor(shape)]: Foreground mask
fg_mask (torch.Tensor): Foreground mask of the requested shape.
"""
if device is None:
device = torch.device("cpu")
Expand Down Expand Up @@ -58,9 +58,9 @@ def __init__(self, fg_weight: float, bg_weight: float, downsample_factor: int =
"""Initialize fixed foreground/background loss balancer.
Args:
fg_weight [float]: Foreground loss weight
bg_weight [float]: Background loss weight
downsample_factor [int]: Depth map downsample factor
fg_weight (float): Foreground loss weight.
bg_weight (float): Background loss weight.
downsample_factor (int): Depth map downsample factor.
"""
super().__init__()
self.fg_weight = fg_weight
Expand All @@ -76,12 +76,11 @@ def forward(
"""Forward pass.
Args:
loss [torch.Tensor(B, H, W)]: Pixel-wise loss
gt_boxes2d [torch.Tensor (B, N, 4)]: 2D box labels for foreground/background balancing
loss (torch.Tensor): Pixel-wise loss.
gt_boxes2d (torch.Tensor): 2D box labels for foreground/background balancing.
Returns:
loss [torch.Tensor(1)]: Total loss after foreground/background balancing
tb_dict [dict[float]]: All losses to log in tensorboard
loss (torch.Tensor): Total loss after foreground/background balancing.
"""
# Compute masks
fg_mask = compute_fg_mask(
Expand Down Expand Up @@ -120,13 +119,11 @@ def __init__(
"""Initializes DDNLoss module.
Args:
weight [float]: Loss function weight
alpha [float]: Alpha value for Focal Loss
gamma [float]: Gamma value for Focal Loss
disc_cfg [dict]: Depth discretiziation configuration
fg_weight [float]: Foreground loss weight
bg_weight [float]: Background loss weight
downsample_factor [int]: Depth map downsample factor
alpha (float): Alpha value for Focal Loss.
gamma (float): Gamma value for Focal Loss.
fg_weight (float): Foreground loss weight.
bg_weight (float): Background loss weight.
downsample_factor (int): Depth map downsample factor.
"""
super().__init__()
self.balancer = Balancer(downsample_factor=downsample_factor, fg_weight=fg_weight, bg_weight=bg_weight)
Expand All @@ -146,10 +143,10 @@ def build_target_depth_from_3dcenter(
"""Builds target depth map from 3D center depth.
Args:
depth_logits: torch.Tensor(B, D+1, H, W)]: Predicted depth logits
gt_boxes2d [torch.Tensor (B, N, 4)]: 2D box labels for foreground/background balancing
gt_center_depth [torch.Tensor(B, N)]: 3D center depth
num_gt_per_img: [int]: Number of ground truth boxes per image
depth_logits (torch.Tensor): Predicted depth logits.
gt_boxes2d (torch.Tensor): 2D box labels for foreground/background balancing.
gt_center_depth (torch.Tensor): 3D center depth.
num_gt_per_img (int): Number of ground truth boxes per image.
"""
b, _, h, w = depth_logits.shape
depth_maps = torch.zeros((b, h, w), device=depth_logits.device, dtype=depth_logits.dtype)
Expand Down Expand Up @@ -185,18 +182,18 @@ def bin_depths(
"""Converts depth map into bin indices.
Args:
depth_map [torch.Tensor(H, W)]: Depth Map
mode [string]: Discretiziation mode (See https://arxiv.org/pdf/2005.13423.pdf for more details)
UD: Uniform discretiziation
LID: Linear increasing discretiziation
SID: Spacing increasing discretiziation
depth_min [float]: Minimum depth value
depth_max [float]: Maximum depth value
num_bins [int]: Number of depth bins
target [bool]: Whether the depth bins indices will be used for a target tensor in loss comparison
depth_map (torch.Tensor): Depth Map.
mode (str): Discretization mode (See https://arxiv.org/pdf/2005.13423.pdf for more details).
UD: Uniform discretization.
LID: Linear increasing discretization.
SID: Spacing increasing discretization.
depth_min (float): Minimum depth value.
depth_max (float): Maximum depth value.
num_bins (int): Number of depth bins.
target (bool): Whether the depth bins indices will be used for a target tensor in loss comparison.
Returns:
indices [torch.Tensor(H, W)]: Depth bin indices
indices (torch.Tensor): Depth bin indices.
"""
if mode == "UD":
bin_size = (depth_max - depth_min) / num_bins
Expand Down Expand Up @@ -233,13 +230,13 @@ def forward(
"""Gets depth_map loss.
Args:
depth_logits: torch.Tensor(B, D+1, H, W)]: Predicted depth logits
gt_boxes2d [torch.Tensor (B, N, 4)]: 2D box labels for foreground/background balancing
num_gt_per_img: [int]: Number of ground truth boxes per image
gt_center_depth: [torch.Tensor(B, N)]: 3D center depth
depth_logits (torch.Tensor): Predicted depth logits.
gt_boxes2d (torch.Tensor): 2D box labels for foreground/background balancing.
num_gt_per_img (int): Number of ground truth boxes per image.
gt_center_depth (torch.Tensor): 3D center depth.
Returns:
loss [torch.Tensor(1)]: Depth classification network loss
loss (torch.Tensor): Depth classification network loss.
"""
# Bin depth map to create target
depth_maps = self.build_target_depth_from_3dcenter(depth_logits, gt_boxes2d, gt_center_depth, num_gt_per_img)
Expand Down
97 changes: 81 additions & 16 deletions src/otx/algo/object_detection_3d/losses/monodetr_loss.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,10 @@ def __init__(self, num_classes: int, weight_dict: dict, focal_alpha: float, grou
"""MonoDETRCriterion.
Args:
num_classes: number of object categories, omitting the special no-object category
matcher: module able to compute a matching between targets and proposals
weight_dict: dict containing as key the names of the losses and as values their relative weight.
focal_alpha: alpha in Focal Loss
group_num: number of groups for data parallelism
num_classes (int): number of object categories, omitting the special no-object category.
weight_dict (dict): dict containing as key the names of the losses and as values their relative weight.
focal_alpha (float): alpha in Focal Loss.
group_num (int): number of groups for data parallelism.
"""
super().__init__()
self.num_classes = num_classes
Expand All @@ -47,7 +46,15 @@ def __init__(self, num_classes: int, weight_dict: dict, focal_alpha: float, grou
self.group_num = group_num

def loss_labels(self, outputs: dict, targets: list, indices: list, num_boxes: int) -> dict[str, Tensor]:
"""Classification loss."""
"""Classification loss.
Args:
outputs (dict): dict of tensors, see the output specification of the model for the format.
targets (list): list of dicts, such that len(targets) == batch_size.
The expected keys in each dict depends on the losses applied, see each loss' doc.
indices (list): list of tuples, such that len(indices) == batch_size.
num_boxes (int): number of boxes in the batch.
"""
src_logits = outputs["scores"]

idx = self._get_src_permutation_idx(indices)
Expand Down Expand Up @@ -76,7 +83,15 @@ def loss_labels(self, outputs: dict, targets: list, indices: list, num_boxes: in
return {"loss_ce": loss_ce}

def loss_3dcenter(self, outputs: dict, targets: list, indices: list, num_boxes: int) -> dict[str, Tensor]:
"""Compute the loss for the 3D center prediction."""
"""Compute the loss for the 3D center prediction.
Args:
outputs (dict): dict of tensors, see the output specification of the model for the format.
targets (list): list of dicts, such that len(targets) == batch_size.
The expected keys in each dict depends on the losses applied, see each loss' doc.
indices (list): list of tuples, such that len(indices) == batch_size.
num_boxes (int): number of boxes in the batch.
"""
idx = self._get_src_permutation_idx(indices)
src_3dcenter = outputs["boxes_3d"][:, :, 0:2][idx]
target_3dcenter = torch.cat([t["boxes_3d"][:, 0:2][i] for t, (_, i) in zip(targets, indices)], dim=0)
Expand All @@ -85,7 +100,15 @@ def loss_3dcenter(self, outputs: dict, targets: list, indices: list, num_boxes:
return {"loss_center": loss_3dcenter.sum() / num_boxes}

def loss_boxes(self, outputs: dict, targets: list, indices: list, num_boxes: int) -> dict[str, Tensor]:
"""Compute l1 loss."""
"""Compute l1 loss.
Args:
outputs (dict): dict of tensors, see the output specification of the model for the format.
targets (list): list of dicts, such that len(targets) == batch_size.
The expected keys in each dict depends on the losses applied, see each loss' doc.
indices (list): list of tuples, such that len(indices) == batch_size.
num_boxes (int): number of boxes in the batch.
"""
idx = self._get_src_permutation_idx(indices)
src_2dboxes = outputs["boxes_3d"][:, :, 2:6][idx]
target_2dboxes = torch.cat([t["boxes_3d"][:, 2:6][i] for t, (_, i) in zip(targets, indices)], dim=0)
Expand All @@ -95,7 +118,15 @@ def loss_boxes(self, outputs: dict, targets: list, indices: list, num_boxes: int
return {"loss_bbox": loss_bbox.sum() / num_boxes}

def loss_giou(self, outputs: dict, targets: list, indices: list, num_boxes: int) -> dict[str, Tensor]:
"""Compute the GIoU loss."""
"""Compute the GIoU loss.
Args:
outputs (dict): dict of tensors, see the output specification of the model for the format.
targets (list): list of dicts, such that len(targets) == batch_size.
The expected keys in each dict depends on the losses applied, see each loss' doc.
indices (list): list of tuples, such that len(indices) == batch_size.
num_boxes (int): number of boxes in the batch.
"""
# giou
idx = self._get_src_permutation_idx(indices)
src_boxes = outputs["boxes_3d"][idx]
Expand All @@ -104,7 +135,15 @@ def loss_giou(self, outputs: dict, targets: list, indices: list, num_boxes: int)
return {"loss_giou": loss_giou}

def loss_depths(self, outputs: dict, targets: list, indices: list, num_boxes: int) -> dict[str, Tensor]:
"""Compute the loss for the depth prediction."""
"""Compute the loss for the depth prediction.
Args:
outputs (dict): dict of tensors, see the output specification of the model for the format.
targets (list): list of dicts, such that len(targets) == batch_size.
The expected keys in each dict depends on the losses applied, see each loss' doc.
indices (list): list of tuples, such that len(indices) == batch_size.
num_boxes (int): number of boxes in the batch.
"""
idx = self._get_src_permutation_idx(indices)

src_depths = outputs["depth"][idx]
Expand All @@ -117,7 +156,15 @@ def loss_depths(self, outputs: dict, targets: list, indices: list, num_boxes: in
return {"loss_depth": depth_loss.sum() / num_boxes}

def loss_dims(self, outputs: dict, targets: list, indices: list, num_boxes: int) -> dict[str, Tensor]:
"""Compute the loss for the dimension prediction."""
"""Compute the loss for the dimension prediction.
Args:
outputs (dict): dict of tensors, see the output specification of the model for the format.
targets (list): list of dicts, such that len(targets) == batch_size.
The expected keys in each dict depends on the losses applied, see each loss' doc.
indices (list): list of tuples, such that len(indices) == batch_size.
num_boxes (int): number of boxes in the batch.
"""
idx = self._get_src_permutation_idx(indices)
src_dims = outputs["size_3d"][idx]
target_dims = torch.cat([t["size_3d"][i] for t, (_, i) in zip(targets, indices)], dim=0)
Expand All @@ -131,7 +178,15 @@ def loss_dims(self, outputs: dict, targets: list, indices: list, num_boxes: int)
return {"loss_dim": dim_loss.sum() / num_boxes}

def loss_angles(self, outputs: dict, targets: list, indices: list, num_boxes: int) -> dict[str, Tensor]:
"""Compute the loss for the angle prediction."""
"""Compute the loss for the angle prediction.
Args:
outputs (dict): dict of tensors, see the output specification of the model for the format.
targets (list): list of dicts, such that len(targets) == batch_size.
The expected keys in each dict depends on the losses applied, see each loss' doc.
indices (list): list of tuples, such that len(indices) == batch_size.
num_boxes (int): number of boxes in the batch.
"""
idx = self._get_src_permutation_idx(indices)
heading_input = outputs["heading_angle"][idx]
target_heading_angle = torch.cat([t["heading_angle"][i] for t, (_, i) in zip(targets, indices)], dim=0)
Expand All @@ -158,7 +213,15 @@ def loss_angles(self, outputs: dict, targets: list, indices: list, num_boxes: in
return {"loss_angle": angle_loss.sum() / num_boxes}

def loss_depth_map(self, outputs: dict, targets: list, indices: list, num_boxes: int) -> dict[str, Tensor]:
"""Depth map loss."""
"""Depth map loss.
Args:
outputs (dict): dict of tensors, see the output specification of the model for the format.
targets (list): list of dicts, such that len(targets) == batch_size.
The expected keys in each dict depends on the losses applied, see each loss' doc.
indices (list): list of tuples, such that len(indices) == batch_size.
num_boxes (int): number of boxes in the batch.
"""
depth_map_logits = outputs["pred_depth_map_logits"]

num_gt_per_img = [len(t["boxes"]) for t in targets]
Expand All @@ -174,6 +237,7 @@ def _get_src_permutation_idx(
self,
indices: list[tuple[torch.Tensor, torch.Tensor]],
) -> tuple[torch.Tensor, torch.Tensor]:
"""Get the indices necessary to compute the loss."""
# permute predictions following indices
batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)])
src_idx = torch.cat([src for (src, _) in indices])
Expand All @@ -183,6 +247,7 @@ def _get_tgt_permutation_idx(
self,
indices: list[tuple[torch.Tensor, torch.Tensor]],
) -> tuple[torch.Tensor, torch.Tensor]:
"""Get the indices necessary to compute the loss."""
# permute targets following indices
batch_idx = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)])
tgt_idx = torch.cat([tgt for (_, tgt) in indices])
Expand Down Expand Up @@ -210,9 +275,9 @@ def forward(
"""This performs the loss computation.
Args:
outputs: dict of tensors, see the output specification of the model for the format
targets: list of dicts, such that len(targets) == batch_size.
The expected keys in each dict depends on the losses applied, see each loss' doc
outputs (dict): dict of tensors, see the output specification of the model for the format.
targets (list): list of dicts, such that len(targets) == batch_size.
The expected keys in each dict depends on the losses applied, see each loss' doc.
"""
outputs_without_aux = {k: v for k, v in outputs.items() if k != "aux_outputs"}
group_num = self.group_num if self.training else 1
Expand Down
Loading

0 comments on commit e92cdfd

Please sign in to comment.