diff --git a/.github/workflows/pre_merge.yaml b/.github/workflows/pre_merge.yaml index 1039ad01669..75d9095e31b 100644 --- a/.github/workflows/pre_merge.yaml +++ b/.github/workflows/pre_merge.yaml @@ -48,9 +48,7 @@ jobs: include: - python-version: "3.10" tox-env: "py310" - # TODO(vinnamki): Revisit after fixing in the upstream: https://github.com/omni-us/jsonargparse/issues/484 - # Ticket no. 138075 - - python-version: "3.11.8" + - python-version: "3.11" tox-env: "py311" name: Unit-Test-with-Python${{ matrix.python-version }} steps: @@ -112,6 +110,7 @@ jobs: - task: "anomaly_detection" - task: "anomaly_segmentation" - task: "keypoint_detection" + - task: "object_detection_3d" name: Integration-Test-${{ matrix.task }}-py310 steps: - name: Checkout repository diff --git a/pyproject.toml b/pyproject.toml index 61f8c6f783e..b1ab8f0e6ef 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,7 @@ classifiers = [ "Programming Language :: Python :: 3.11", ] dependencies = [ - "datumaro==1.7.0", + "datumaro==1.10.0rc0", "omegaconf==2.3.0", "rich==13.8.0", "jsonargparse==4.30.0", @@ -39,6 +39,7 @@ dependencies = [ "einops==0.8.0", "decord==0.6.0", "typeguard==4.3.*", + "numba==0.60.0", # TODO(ashwinvaidya17): https://github.com/openvinotoolkit/anomalib/issues/2126 "setuptools<70", ] diff --git a/src/otx/algo/common/layers/transformer_layers.py b/src/otx/algo/common/layers/transformer_layers.py new file mode 100644 index 00000000000..0c3ede9116a --- /dev/null +++ b/src/otx/algo/common/layers/transformer_layers.py @@ -0,0 +1,122 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""Implementation of common transformer layers.""" + +from __future__ import annotations + +import copy +from typing import Callable + +import torch +from torch import nn + + +class TransformerEncoderLayer(nn.Module): + """TransformerEncoderLayer.""" + + def __init__( + self, + d_model: int, + nhead: int, + dim_feedforward: int = 2048, + dropout: float = 0.1, + activation: Callable[..., nn.Module] = nn.GELU, + normalize_before: bool = False, + batch_first: bool = True, + key_mask: bool = False, + ) -> None: + super().__init__() + self.normalize_before = normalize_before + self.key_mask = key_mask + + self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout, batch_first=batch_first) + + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.dropout = nn.Dropout(dropout) + self.linear2 = nn.Linear(dim_feedforward, d_model) + + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + self.activation = activation() + + @staticmethod + def with_pos_embed(tensor: torch.Tensor, pos_embed: torch.Tensor | None) -> torch.Tensor: + """Attach position embeddings to the tensor.""" + return tensor if pos_embed is None else tensor + pos_embed + + def forward( + self, + src: torch.Tensor, + src_mask: torch.Tensor | None = None, + pos_embed: torch.Tensor | None = None, + ) -> torch.Tensor: + """Forward the transformer encoder layer. + + Args: + src (torch.Tensor): The input tensor. + src_mask (torch.Tensor | None, optional): The mask tensor. Defaults to None. + pos_embed (torch.Tensor | None, optional): The position embedding tensor. Defaults to None. 
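+
+        Example (illustrative sketch; the concrete sizes below are assumptions, not requirements):
+            >>> layer = TransformerEncoderLayer(d_model=256, nhead=8)
+            >>> layer(torch.rand(2, 100, 256)).shape  # (batch, seq_len, d_model) since batch_first=True
+            torch.Size([2, 100, 256])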
+ """ + residual = src + if self.normalize_before: + src = self.norm1(src) + q = k = self.with_pos_embed(src, pos_embed) + if self.key_mask: + src = self.self_attn(q, k, value=src, key_padding_mask=src_mask)[0] + else: + src, _ = self.self_attn(q, k, value=src, attn_mask=src_mask) + + src = residual + self.dropout1(src) + if not self.normalize_before: + src = self.norm1(src) + + residual = src + if self.normalize_before: + src = self.norm2(src) + src = self.linear2(self.dropout(self.activation(self.linear1(src)))) + src = residual + self.dropout2(src) + if not self.normalize_before: + src = self.norm2(src) + return src + + +class TransformerEncoder(nn.Module): + """TransformerEncoder.""" + + def __init__(self, encoder_layer: nn.Module, num_layers: int, norm: nn.Module | None = None) -> None: + """Initialize the TransformerEncoder. + + Args: + encoder_layer (nn.Module): The encoder layer module. + num_layers (int): The number of layers. + norm (nn.Module | None, optional): The normalization module. Defaults to None. + """ + super().__init__() + self.layers = nn.ModuleList([copy.deepcopy(encoder_layer) for _ in range(num_layers)]) + self.num_layers = num_layers + self.norm = norm + + def forward( + self, + src: torch.Tensor, + src_mask: torch.Tensor | None = None, + pos_embed: torch.Tensor | None = None, + ) -> torch.Tensor: + """Forward the transformer encoder. + + Args: + src (torch.Tensor): The input tensor. + src_mask (torch.Tensor | None, optional): The mask tensor. Defaults to None. + pos_embed (torch.Tensor | None, optional): The position embedding tensor. Defaults to None. + """ + output = src + for layer in self.layers: + output = layer(output, src_mask=src_mask, pos_embed=pos_embed) + + if self.norm is not None: + output = self.norm(output) + + return output diff --git a/src/otx/algo/common/losses/focal_loss.py b/src/otx/algo/common/losses/focal_loss.py index 9ad2f1323b2..4eb3914957f 100644 --- a/src/otx/algo/common/losses/focal_loss.py +++ b/src/otx/algo/common/losses/focal_loss.py @@ -8,11 +8,12 @@ from __future__ import annotations +import warnings from typing import TYPE_CHECKING import torch -import torch.nn.functional from otx.algo.common.losses.utils import weight_reduce_loss +from torch import nn if TYPE_CHECKING: from torch import Tensor @@ -50,7 +51,7 @@ def py_sigmoid_focal_loss( pt = (1 - pred_sigmoid) * target + pred_sigmoid * (1 - target) # Thus it's pt.pow(gamma) rather than (1 - pt).pow(gamma) focal_weight = (alpha * target + (1 - alpha) * (1 - target)) * pt.pow(gamma) - loss = torch.nn.functional.binary_cross_entropy_with_logits(pred, target, reduction="none") * focal_weight + loss = nn.functional.binary_cross_entropy_with_logits(pred, target, reduction="none") * focal_weight if weight is not None: if weight.shape != loss.shape: if weight.size(0) == loss.size(0): @@ -70,3 +71,180 @@ def py_sigmoid_focal_loss( msg = "The number of dimensions in weight should be equal to the number of dimensions in loss." raise ValueError(msg) return weight_reduce_loss(loss, weight, reduction, avg_factor) + + +def one_hot( + labels: torch.Tensor, + num_classes: int, + device: torch.device | None = None, + dtype: torch.dtype | None = None, + eps: float = 1e-6, +) -> torch.Tensor: + r"""Convert an integer label x-D tensor to a one-hot (x+1)-D tensor. + + Args: + labels: tensor with labels of shape :math:`(N, *)`, where N is batch size. + Each value is an integer representing correct classification. + num_classes: number of classes in labels. 
+ device: the desired device of returned tensor. + dtype: the desired data type of returned tensor. + + Returns: + the labels in one hot tensor of shape :math:`(N, C, *)`, + + Examples: + >>> labels = torch.LongTensor([[[0, 1], [2, 0]]]) + >>> one_hot(labels, num_classes=3) + tensor([[[[1.0000e+00, 1.0000e-06], + [1.0000e-06, 1.0000e+00]], + + [[1.0000e-06, 1.0000e+00], + [1.0000e-06, 1.0000e-06]], + + [[1.0000e-06, 1.0000e-06], + [1.0000e+00, 1.0000e-06]]]]) + """ + if not isinstance(labels, torch.Tensor): + msg = f"Input labels type is not a torch.Tensor. Got {type(labels)}" + raise TypeError(msg) + + if labels.dtype != torch.int64: + msg = f"labels must be of the same dtype torch.int64. Got: {labels.dtype}" + raise ValueError(msg) + + if num_classes < 1: + msg = f"The number of classes must be bigger than one. Got: {num_classes}" + raise ValueError(msg) + shape = labels.shape + one_hot = torch.zeros((shape[0], num_classes) + shape[1:], device=device, dtype=dtype) + return one_hot.scatter_(1, labels.unsqueeze(1), 1.0) + eps + + +def focal_loss( + inputs: torch.Tensor, + target: torch.Tensor, + alpha: float, + gamma: float = 2.0, + reduction: str = "none", + eps: float | None = None, +) -> torch.Tensor: + r"""Criterion that computes Focal loss. + + According to :cite:`lin2018focal`, the Focal loss is computed as follows: + .. math:: + \text{FL}(p_t) = -\alpha_t (1 - p_t)^{\gamma} \, \text{log}(p_t) + Where: + - :math:`p_t` is the model's estimated probability for each class. + + Args: + inputs: logits tensor with shape :math:`(N, C, *)` where C = number of classes. + target: labels tensor with shape :math:`(N, *)` where each value is :math:`0 ≤ targets[i] ≤ C-1`. + alpha: Weighting factor :math:`\alpha \in [0, 1]`. + gamma: Focusing parameter :math:`\gamma >= 0`. + reduction: Specifies the reduction to apply to the + output: ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction + will be applied, ``'mean'``: the sum of the output will be divided by + the number of elements in the output, ``'sum'``: the output will be + summed. + eps: Deprecated: scalar to enforce numerical stabiliy. This is no longer used. + + Return: + the computed loss. + + Example: + >>> N = 5 # num_classes + >>> inputs = torch.randn(1, N, 3, 5, requires_grad=True) + >>> target = torch.empty(1, 3, 5, dtype=torch.long).random_(N) + >>> output = focal_loss(inputs, target, alpha=0.5, gamma=2.0, reduction='mean') + >>> output.backward() + """ + if eps is not None and not torch.jit.is_scripting(): + warnings.warn( + "`focal_loss` has been reworked for improved numerical stability " + "and the `eps` argument is no longer necessary", + DeprecationWarning, + stacklevel=2, + ) + + if not isinstance(inputs, torch.Tensor): + msg = f"inputs type is not a torch.Tensor. Got {type(inputs)}" + raise TypeError(msg) + + if not len(inputs.shape) >= 2: + msg = f"Invalid inputs shape, we expect BxCx*. Got: {inputs.shape}" + raise ValueError(msg) + + if inputs.size(0) != target.size(0): + msg = f"Expected inputs batch_size ({inputs.size(0)}) to match target batch_size ({target.size(0)})." + raise ValueError(msg) + + n = inputs.size(0) + out_size = (n,) + inputs.size()[2:] + if target.size()[1:] != inputs.size()[2:]: + msg = f"Expected target size {out_size}, got {target.size()}" + raise ValueError(msg) + + if inputs.device != target.device: + msg = f"inputs and target must be in the same device. 
Got: {inputs.device} and {target.device}" + raise ValueError(msg) + + # compute softmax over the classes axis + input_soft: torch.Tensor = nn.functional.softmax(inputs, dim=1) + log_input_soft: torch.Tensor = nn.functional.log_softmax(inputs, dim=1) + # create the labels one hot tensor + target_one_hot: torch.Tensor = one_hot( + target, + num_classes=inputs.shape[1], + device=inputs.device, + dtype=inputs.dtype, + ) + + # compute the actual focal loss + weight = torch.pow(-input_soft + 1.0, gamma) + + focal = -alpha * weight * log_input_soft + loss_tmp = torch.einsum("bc...,bc...->b...", (target_one_hot, focal)) + return weight_reduce_loss(loss_tmp, reduction=reduction, avg_factor=None) + + +class FocalLoss(nn.Module): + """Criterion that computes Focal loss.""" + + def __init__(self, alpha: float, gamma: float = 2.0, reduction: str = "none", eps: float | None = None) -> None: + r"""Criterion that computes Focal loss. + + According to :cite:`lin2018focal`, the Focal loss is computed as follows: + .. math:: + \text{FL}(p_t) = -\alpha_t (1 - p_t)^{\\gamma} \\, \text{log}(p_t) + Where: + - :math:`p_t` is the model's estimated probability for each class. + + Args: + alpha: Weighting factor :math:`\alpha \\in [0, 1]`. + gamma: Focusing parameter :math:`\\gamma >= 0`. + reduction: Specifies the reduction to apply to the + output: ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction + will be applied, ``'mean'``: the sum of the output will be divided by + the number of elements in the output, ``'sum'``: the output will be + summed. + eps: Deprecated: scalar to enforce numerical stability. This is no longer + used. + + Example: + >>> N = 5 # num_classes + >>> kwargs = {"alpha": 0.5, "gamma": 2.0, "reduction": 'mean'} + >>> criterion = FocalLoss(**kwargs) + >>> input = torch.randn(1, N, 3, 5, requires_grad=True) + >>> target = torch.empty(1, 3, 5, dtype=torch.long).random_(N) + >>> output = criterion(input, target) + >>> output.backward() + """ + super().__init__() + self.alpha: float = alpha + self.gamma: float = gamma + self.reduction: str = reduction + self.eps: float | None = eps + + def forward(self, inputs: torch.Tensor, target: torch.Tensor) -> torch.Tensor: + """Forward.""" + return focal_loss(inputs, target, self.alpha, self.gamma, self.reduction, self.eps) diff --git a/src/otx/algo/detection/heads/rtdetr_decoder.py b/src/otx/algo/detection/heads/rtdetr_decoder.py index 2d190dcaf32..d60be84f12d 100644 --- a/src/otx/algo/detection/heads/rtdetr_decoder.py +++ b/src/otx/algo/detection/heads/rtdetr_decoder.py @@ -213,7 +213,7 @@ def forward( query: torch.Tensor, reference_points: torch.Tensor, value: torch.Tensor, - value_spatial_shapes: list[tuple[int, int]], + value_spatial_shapes: torch.Tensor, value_mask: torch.Tensor | None = None, ) -> torch.Tensor: """Forward function of MSDeformableAttention. 
@@ -235,8 +235,9 @@ def forward( value = self.value_proj(value) if value_mask is not None: - value_mask = value_mask.astype(value.dtype).unsqueeze(-1) - value *= value_mask + value = value.masked_fill(value_mask[..., None], float(0)) + # value_mask = value_mask.astype(value.dtype).unsqueeze(-1) + # value3 = value * value_mask.unsqueeze(-1) value = value.reshape(bs, len_v, self.num_heads, self.head_dim) sampling_offsets = self.sampling_offsets(query).reshape( @@ -262,7 +263,7 @@ def forward( ) if reference_points.shape[-1] == 2: - offset_normalizer = torch.tensor(value_spatial_shapes) + offset_normalizer = value_spatial_shapes.clone() offset_normalizer = offset_normalizer.flip([1]).reshape(1, 1, 1, self.num_levels, 1, 2) sampling_locations = ( reference_points.reshape( @@ -280,6 +281,14 @@ def forward( reference_points[:, :, None, :, None, :2] + sampling_offsets / self.num_points * reference_points[:, :, None, :, None, 2:] * 0.5 ) + elif reference_points.shape[-1] == 6: + sampling_locations = ( + reference_points[:, :, None, :, None, :2] + + sampling_offsets + / self.num_points + * (reference_points[:, :, None, :, None, 2::2] + reference_points[:, :, None, :, None, 3::2]) + * 0.5 + ) else: msg = f"Last dim of reference_points must be 2 or 4, but get {reference_points.shape[-1]} instead." raise ValueError( diff --git a/src/otx/algo/detection/necks/hybrid_encoder.py b/src/otx/algo/detection/necks/hybrid_encoder.py index cf79424636f..548a14548ec 100644 --- a/src/otx/algo/detection/necks/hybrid_encoder.py +++ b/src/otx/algo/detection/necks/hybrid_encoder.py @@ -12,6 +12,7 @@ import torch from torch import nn +from otx.algo.common.layers.transformer_layers import TransformerEncoder, TransformerEncoderLayer from otx.algo.detection.layers import CSPRepLayer from otx.algo.modules import Conv2dModule, build_activation_layer from otx.algo.modules.base_module import BaseModule @@ -20,85 +21,6 @@ __all__ = ["HybridEncoder"] -# transformer -class TransformerEncoderLayer(nn.Module): - def __init__( - self, - d_model: int, - nhead: int, - dim_feedforward: int = 2048, - dropout: float = 0.1, - activation: Callable[..., nn.Module] = nn.GELU, - normalize_before: bool = False, - ) -> None: - super().__init__() - self.normalize_before = normalize_before - - self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout, batch_first=True) - - self.linear1 = nn.Linear(d_model, dim_feedforward) - self.dropout = nn.Dropout(dropout) - self.linear2 = nn.Linear(dim_feedforward, d_model) - - self.norm1 = nn.LayerNorm(d_model) - self.norm2 = nn.LayerNorm(d_model) - self.dropout1 = nn.Dropout(dropout) - self.dropout2 = nn.Dropout(dropout) - self.activation = activation() - - @staticmethod - def with_pos_embed(tensor: torch.Tensor, pos_embed: torch.Tensor | None) -> torch.Tensor: - return tensor if pos_embed is None else tensor + pos_embed - - def forward( - self, - src: torch.Tensor, - src_mask: torch.Tensor | None = None, - pos_embed: torch.Tensor | None = None, - ) -> torch.Tensor: - residual = src - if self.normalize_before: - src = self.norm1(src) - q = k = self.with_pos_embed(src, pos_embed) - src, _ = self.self_attn(q, k, value=src, attn_mask=src_mask) - - src = residual + self.dropout1(src) - if not self.normalize_before: - src = self.norm1(src) - - residual = src - if self.normalize_before: - src = self.norm2(src) - src = self.linear2(self.dropout(self.activation(self.linear1(src)))) - src = residual + self.dropout2(src) - if not self.normalize_before: - src = self.norm2(src) - return src - - -class 
TransformerEncoder(nn.Module): - def __init__(self, encoder_layer: nn.Module, num_layers: int, norm: nn.Module | None = None) -> None: - super().__init__() - self.layers = nn.ModuleList([copy.deepcopy(encoder_layer) for _ in range(num_layers)]) - self.num_layers = num_layers - self.norm = norm - - def forward( - self, - src: torch.Tensor, - src_mask: torch.Tensor | None = None, - pos_embed: torch.Tensor | None = None, - ) -> torch.Tensor: - output = src - for layer in self.layers: - output = layer(output, src_mask=src_mask, pos_embed=pos_embed) - - if self.norm is not None: - output = self.norm(output) - - return output - - class HybridEncoderModule(BaseModule): """HybridEncoder for RTDetr. diff --git a/src/otx/algo/object_detection_3d/__init__.py b/src/otx/algo/object_detection_3d/__init__.py new file mode 100644 index 00000000000..c9797fe2795 --- /dev/null +++ b/src/otx/algo/object_detection_3d/__init__.py @@ -0,0 +1,8 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""Custom model implementations for object detection 3D task.""" + +from . import backbones, detectors, heads, losses, matchers, utils + +__all__ = ["backbones", "heads", "losses", "detectors", "matchers", "utils"] diff --git a/src/otx/algo/object_detection_3d/backbones/__init__.py b/src/otx/algo/object_detection_3d/backbones/__init__.py new file mode 100644 index 00000000000..a7d354222db --- /dev/null +++ b/src/otx/algo/object_detection_3d/backbones/__init__.py @@ -0,0 +1,4 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""Backbones modules for 3d object detection.""" diff --git a/src/otx/algo/object_detection_3d/backbones/monodetr_resnet.py b/src/otx/algo/object_detection_3d/backbones/monodetr_resnet.py new file mode 100644 index 00000000000..0d345aa11a5 --- /dev/null +++ b/src/otx/algo/object_detection_3d/backbones/monodetr_resnet.py @@ -0,0 +1,253 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""MonoDetr backbone implementations.""" +from __future__ import annotations + +import math +from typing import Any, ClassVar + +import torch +import torchvision +from torch import nn +from torchvision.models._utils import IntermediateLayerGetter + +from otx.algo.modules.norm import FrozenBatchNorm2d +from otx.algo.object_detection_3d.utils.utils import NestedTensor + + +class PositionEmbeddingSine(nn.Module): + """This is a more standard version of the position embedding.""" + + def __init__( + self, + num_pos_feats: int = 64, + temperature: int = 10000, + normalize: bool = False, + scale: float | None = None, + ): + """Initialize the PositionEmbeddingSine module. + + Args: + num_pos_feats (int): Number of positional features. + temperature (int): Temperature scaling factor. + normalize (bool): Flag indicating whether to normalize the position embeddings. + scale (Optional[float]): Scaling factor for the position embeddings. If None, default value is used. 
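+
+        Example (illustrative; assumes a NestedTensor of features with a boolean padding mask):
+            >>> pe = PositionEmbeddingSine(num_pos_feats=128, normalize=True)
+            >>> feat = NestedTensor(torch.rand(1, 256, 24, 80), torch.zeros(1, 24, 80, dtype=torch.bool))
+            >>> pe(feat).shape  # channels = 2 * num_pos_feats
+            torch.Size([1, 256, 24, 80])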
+ """ + super().__init__() + self.num_pos_feats = num_pos_feats + self.temperature = temperature + self.normalize = normalize + if scale is not None and normalize is False: + msg = "normalize should be True if scale is passed" + raise ValueError(msg) + if scale is None: + scale = 2 * math.pi + self.scale = scale + + def forward(self, tensor_list: NestedTensor) -> torch.Tensor: + """Forward function for PositionEmbeddingSine module.""" + x = tensor_list.tensors + mask = tensor_list.mask + not_mask = ~mask + y_embed = not_mask.cumsum(1, dtype=torch.float32) + x_embed = not_mask.cumsum(2, dtype=torch.float32) + if self.normalize: + eps = 1e-6 + y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale + x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale + + dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) + dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) + + pos_x = x_embed[:, :, :, None] / dim_t + pos_y = y_embed[:, :, :, None] / dim_t + pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) + pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) + return torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) + + +class PositionEmbeddingLearned(nn.Module): + """Absolute pos embedding, learned.""" + + def __init__(self, num_pos_feats: int = 256): + """Positional embedding.""" + super().__init__() + self.row_embed = nn.Embedding(50, num_pos_feats) + self.col_embed = nn.Embedding(50, num_pos_feats) + + def forward(self, tensor_list: NestedTensor) -> torch.Tensor: + """Forward pass of the PositionEmbeddingLearned module. + + Args: + tensor_list (NestedTensor): Input tensor. + + Returns: + torch.Tensor: Position embeddings. + """ + x = tensor_list.tensors + h, w = x.shape[-2:] + i = torch.arange(w, device=x.device) / w * 49 + j = torch.arange(h, device=x.device) / h * 49 + x_emb = self.get_embed(i, self.col_embed) + y_emb = self.get_embed(j, self.row_embed) + return ( + torch.cat( + [ + x_emb.unsqueeze(0).repeat(h, 1, 1), + y_emb.unsqueeze(1).repeat(1, w, 1), + ], + dim=-1, + ) + .permute(2, 0, 1) + .unsqueeze(0) + .repeat(x.shape[0], 1, 1, 1) + ) + + def get_embed(self, coord: torch.Tensor, embed: nn.Embedding) -> torch.Tensor: + """Get the embedding for the given coordinates. + + Args: + coord (torch.Tensor): The coordinates. + embed (nn.Embedding): The embedding layer. + + Returns: + torch.Tensor: The embedding for the coordinates. + """ + floor_coord = coord.floor() + delta = (coord - floor_coord).unsqueeze(-1) + floor_coord = floor_coord.long() + ceil_coord = (floor_coord + 1).clamp(max=49) + return embed(floor_coord) * (1 - delta) + embed(ceil_coord) * delta + + +def build_position_encoding( + hidden_dim: int, + position_embedding: str | PositionEmbeddingSine | PositionEmbeddingLearned, +) -> PositionEmbeddingSine | PositionEmbeddingLearned: + """Build the position encoding module. + + Args: + hidden_dim (int): The hidden dimension. + position_embedding (Union[str, PositionEmbeddingSine, PositionEmbeddingLearned]): The position embedding type. + + Returns: + Union[PositionEmbeddingSine, PositionEmbeddingLearned]: The position encoding module. 
+ """ + n_steps = hidden_dim // 2 + if position_embedding in ("v2", "sine"): + position_embedding = PositionEmbeddingSine(n_steps, normalize=True) + elif position_embedding in ("v3", "learned"): + position_embedding = PositionEmbeddingLearned(n_steps) + else: + msg = f"not supported {position_embedding}" + raise ValueError(msg) + + return position_embedding + + +class BackboneBase(nn.Module): + """BackboneBase module.""" + + def __init__(self, backbone: nn.Module, train_backbone: bool, return_interm_layers: bool): + """Initializes BackboneBase module.""" + super().__init__() + for name, parameter in backbone.named_parameters(): + if not train_backbone or "layer2" not in name and "layer3" not in name and "layer4" not in name: + parameter.requires_grad_(False) + if return_interm_layers: + return_layers = {"layer2": "0", "layer3": "1", "layer4": "2"} + self.strides = [8, 16, 32] + self.num_channels = [512, 1024, 2048] + else: + return_layers = {"layer4": "0"} + self.strides = [32] + self.num_channels = [2048] + self.body = IntermediateLayerGetter(backbone, return_layers=return_layers) + + def forward(self, images: torch.Tensor) -> dict[str, NestedTensor]: + """Forward pass of the BackboneBase module. + + Args: + images (torch.Tensor): Input images. + + Returns: + dict[str, NestedTensor]: Output tensors. + """ + xs = self.body(images) + out = {} + for name, x in xs.items(): + m = torch.zeros(x.shape[0], x.shape[2], x.shape[3]).to(torch.bool).to(x.device) + out[name] = NestedTensor(x, m) + return out + + +class Backbone(BackboneBase): + """ResNet backbone with frozen BatchNorm.""" + + def __init__(self, name: str, train_backbone: bool, return_interm_layers: bool, dilation: bool, **kwargs): + """Initializes Backbone module.""" + norm_layer = FrozenBatchNorm2d + backbone = getattr(torchvision.models, name)( + replace_stride_with_dilation=[False, False, dilation], + pretrained=True, + norm_layer=norm_layer, + ) + super().__init__(backbone, train_backbone, return_interm_layers) + if dilation: + self.strides[-1] = self.strides[-1] // 2 + + +class Joiner(nn.Sequential): + """Joiner module.""" + + def __init__( + self, + backbone: nn.Module, + position_embedding: PositionEmbeddingSine | PositionEmbeddingLearned, + ) -> None: + """Initialize the Joiner module. + + Args: + backbone (nn.Module): The backbone module. + position_embedding (Union[PositionEmbeddingSine, PositionEmbeddingLearned]): The position embedding module. + """ + super().__init__(backbone, position_embedding) + self.strides = backbone.strides + self.num_channels = backbone.num_channels + + def forward(self, images: torch.Tensor) -> tuple[list[NestedTensor], list[torch.Tensor]]: + """Forward pass of the Joiner module. + + Args: + images (torch.Tensor): Input images. + + Returns: + tuple[List[NestedTensor], List[torch.Tensor]]: Output tensors and position embeddings. 
+ """ + out: list[NestedTensor] = [x for _, x in sorted(self[0](images).items())] + return out, [self[1](x).to(x.tensors.dtype) for x in out] + + +class BackboneBuilder: + """DepthAwareTransformerBuilder.""" + + CFG: ClassVar[dict[str, Any]] = { + "monodetr_50": { + "name": "resnet50", + "train_backbone": True, + "dilation": False, + "return_interm_layers": True, + "positional_encoding": { + "hidden_dim": 256, + "position_embedding": "sine", + }, + }, + } + + def __new__(cls, model_name: str) -> Joiner: + """Constructor for Backbone MonoDetr.""" + # TODO (Kirill): change backbone to already implemented in OTX + backbone = Backbone(**cls.CFG[model_name]) + position_embedding = build_position_encoding(**cls.CFG[model_name]["positional_encoding"]) + return Joiner(backbone, position_embedding) diff --git a/src/otx/algo/object_detection_3d/detectors/__init__.py b/src/otx/algo/object_detection_3d/detectors/__init__.py new file mode 100644 index 00000000000..9cbb7eee8fc --- /dev/null +++ b/src/otx/algo/object_detection_3d/detectors/__init__.py @@ -0,0 +1,4 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""Core torch detectors modules for 3d object detection.""" diff --git a/src/otx/algo/object_detection_3d/detectors/monodetr.py b/src/otx/algo/object_detection_3d/detectors/monodetr.py new file mode 100644 index 00000000000..b94c2dd2b58 --- /dev/null +++ b/src/otx/algo/object_detection_3d/detectors/monodetr.py @@ -0,0 +1,313 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""MonoDetr core Pytorch detector.""" +from __future__ import annotations + +import math +from typing import Callable + +import torch +from torch import Tensor, nn +from torch.nn import functional + +from otx.algo.common.utils.utils import inverse_sigmoid +from otx.algo.detection.heads.rtdetr_decoder import MLP +from otx.algo.object_detection_3d.utils.utils import NestedTensor, get_clones + + +# TODO (Kirill): make MonoDETR as a more general class +class MonoDETR(nn.Module): + """This is the MonoDETR module that performs monocualr 3D object detection.""" + + def __init__( + self, + backbone: nn.Module, + depthaware_transformer: nn.Module, + depth_predictor: nn.Module, + criterion: nn.Module, + num_classes: int, + num_queries: int, + num_feature_levels: int, + aux_loss: bool = True, + with_box_refine: bool = False, + init_box: bool = False, + group_num: int = 11, + activation: Callable[..., nn.Module] = nn.ReLU, + ): + """Initializes the model. + + Args: + backbone (nn.Module): torch module of the backbone to be used. See backbone.py + depthaware_transformer (nn.Module): depth-aware transformer architecture. See depth_aware_transformer.py + depth_predictor (nn.Module): depth predictor module + criterion (nn.Module): loss criterion module + num_classes (int): number of object classes + num_queries (int): number of object queries, ie detection slot. This is the maximal number of objects + DETR can detect in a single image. For KITTI, we recommend 50 queries. + num_feature_levels (int): number of feature levels + aux_loss (bool): True if auxiliary decoding losses (loss at each decoder layer) are to be used. 
+ with_box_refine (bool): iterative bounding box refinement + init_box (bool): True if the bounding box embedding layers should be initialized to zero + group_num (int): number of groups for depth-aware bounding box embedding + activation (Callable[..., nn.Module]): activation function to be applied to the output of the transformer + """ + super().__init__() + + self.num_queries = num_queries + self.depthaware_transformer = depthaware_transformer + self.depth_predictor = depth_predictor + hidden_dim = depthaware_transformer.d_model + self.hidden_dim = hidden_dim + self.num_feature_levels = num_feature_levels + self.criterion = criterion + self.label_enc = nn.Embedding(num_classes + 1, hidden_dim - 1) # # for indicator + # prediction heads + self.class_embed = nn.Linear(hidden_dim, num_classes) + prior_prob = 0.01 + bias_value = -math.log((1 - prior_prob) / prior_prob) + self.class_embed.bias.data = torch.ones(num_classes) * bias_value + + self.bbox_embed = MLP(hidden_dim, hidden_dim, 6, 3, activation=activation) + self.dim_embed_3d = MLP(hidden_dim, hidden_dim, 3, 2, activation=activation) + self.angle_embed = MLP(hidden_dim, hidden_dim, 24, 2, activation=activation) + self.depth_embed = MLP(hidden_dim, hidden_dim, 2, 2, activation=activation) # depth and deviation + + if init_box: + nn.init.constant_(self.bbox_embed.layers[-1].weight.data, 0) + nn.init.constant_(self.bbox_embed.layers[-1].bias.data, 0) + + self.query_embed = nn.Embedding(num_queries * group_num, hidden_dim * 2) + + if num_feature_levels > 1: + num_backbone_outs = len(backbone.strides) + input_proj_list = [] + for _ in range(num_backbone_outs): + in_channels = backbone.num_channels[_] + input_proj_list.append( + nn.Sequential( + nn.Conv2d(in_channels, hidden_dim, kernel_size=1), + nn.GroupNorm(32, hidden_dim), + ), + ) + for _ in range(num_feature_levels - num_backbone_outs): + input_proj_list.append( + nn.Sequential( + nn.Conv2d(in_channels, hidden_dim, kernel_size=3, stride=2, padding=1), + nn.GroupNorm(32, hidden_dim), + ), + ) + in_channels = hidden_dim + self.input_proj = nn.ModuleList(input_proj_list) + else: + self.input_proj = nn.ModuleList( + [ + nn.Sequential( + nn.Conv2d(backbone.num_channels[0], hidden_dim, kernel_size=1), + nn.GroupNorm(32, hidden_dim), + ), + ], + ) + + self.backbone = backbone + self.aux_loss = aux_loss + self.with_box_refine = with_box_refine + self.num_classes = num_classes + + for proj in self.input_proj: + nn.init.xavier_uniform_(proj[0].weight, gain=1) + nn.init.constant_(proj[0].bias, 0) + # if two-stage, the last class_embed and bbox_embed is for region proposal generation + num_pred = depthaware_transformer.decoder.num_layers + if with_box_refine: + self.class_embed = get_clones(self.class_embed, num_pred) + self.bbox_embed = get_clones(self.bbox_embed, num_pred) + nn.init.constant_(self.bbox_embed[0].layers[-1].bias.data[2:], -2.0) + # implementation for iterative bounding box refinement + self.depthaware_transformer.decoder.bbox_embed = self.bbox_embed + self.dim_embed_3d = get_clones(self.dim_embed_3d, num_pred) + self.depthaware_transformer.decoder.dim_embed = self.dim_embed_3d + self.angle_embed = get_clones(self.angle_embed, num_pred) + self.depth_embed = get_clones(self.depth_embed, num_pred) + else: + nn.init.constant_(self.bbox_embed.layers[-1].bias.data[2:], -2.0) + self.class_embed = nn.ModuleList([self.class_embed for _ in range(num_pred)]) + self.bbox_embed = nn.ModuleList([self.bbox_embed for _ in range(num_pred)]) + self.dim_embed_3d = nn.ModuleList([self.dim_embed_3d 
for _ in range(num_pred)]) + self.angle_embed = nn.ModuleList([self.angle_embed for _ in range(num_pred)]) + self.depth_embed = nn.ModuleList([self.depth_embed for _ in range(num_pred)]) + self.depthaware_transformer.decoder.bbox_embed = None + + def forward( + self, + images: Tensor, + calibs: Tensor, + img_sizes: Tensor, + targets: list[dict[str, Tensor]] | None = None, + mode: str = "predict", + ) -> dict[str, Tensor]: + """Forward method of the MonoDETR model. + + Args: + images (list[Tensor]): images for each sample + calibs (Tensor): camera matrices for each sample + img_sizes (Tensor): image sizes for each sample + targets (list[dict[Tensor]): ground truth boxes and labels for each + sample + mode (str): The mode of operation. Defaults to "predict". + """ + features, pos = self.backbone(images) + + srcs = [] + masks = [] + for i, feat in enumerate(features): + src, mask = feat.decompose() + srcs.append(self.input_proj[i](src)) + masks.append(mask) + + if self.num_feature_levels > len(srcs): + _len_srcs = len(srcs) + for i in range(_len_srcs, self.num_feature_levels): + src = self.input_proj[i](features[-1].tensors) if i == _len_srcs else self.input_proj[i](srcs[-1]) + m = torch.zeros(src.shape[0], src.shape[2], src.shape[3]).to(torch.bool).to(src.device) + mask = functional.interpolate(m[None].float(), size=src.shape[-2:]).to(torch.bool)[0] + pos_l = self.backbone[1](NestedTensor(src, mask)).to(src.dtype) + srcs.append(src) + masks.append(mask) + pos.append(pos_l) + + query_embeds = self.query_embed.weight if self.training else self.query_embed.weight[: self.num_queries] + + pred_depth_map_logits, depth_pos_embed, weighted_depth, depth_pos_embed_ip = self.depth_predictor( + srcs, + masks[1], + pos[1], + ) + + ( + hs, + init_reference, + inter_references, + inter_references_dim, + enc_outputs_class, + enc_outputs_coord_unact, + ) = self.depthaware_transformer( + srcs, + masks, + pos, + query_embeds, + depth_pos_embed, + depth_pos_embed_ip, + ) + + outputs_coords = [] + outputs_classes = [] + outputs_3d_dims = [] + outputs_depths = [] + outputs_angles = [] + + for lvl in range(hs.shape[0]): + reference = init_reference if lvl == 0 else inter_references[lvl - 1] + reference = inverse_sigmoid(reference) + + tmp = self.bbox_embed[lvl](hs[lvl]) + if reference.shape[-1] == 6: + tmp += reference + else: + tmp[..., :2] += reference + + # 3d center + 2d box + outputs_coord = tmp.sigmoid() + outputs_coords.append(outputs_coord) + + # classes + outputs_class = self.class_embed[lvl](hs[lvl]) + outputs_classes.append(outputs_class) + + # 3D sizes + size3d = inter_references_dim[lvl] + outputs_3d_dims.append(size3d) + + # depth_geo + box2d_height_norm = outputs_coord[:, :, 4] + outputs_coord[:, :, 5] + box2d_height = torch.clamp(box2d_height_norm * img_sizes[:, 1:2], min=1.0) + depth_geo = size3d[:, :, 0] / box2d_height * calibs[:, 0, 0].unsqueeze(1) + + # depth_reg + depth_reg = self.depth_embed[lvl](hs[lvl]) + + # depth_map + outputs_center3d = ((outputs_coord[..., :2] - 0.5) * 2).unsqueeze(2).detach() + depth_map = functional.grid_sample( + weighted_depth.unsqueeze(1), + outputs_center3d, + mode="bilinear", + align_corners=True, + ).squeeze(1) + + # depth average + sigma + depth_ave = torch.cat( + [ + ((1.0 / (depth_reg[:, :, 0:1].sigmoid() + 1e-6) - 1.0) + depth_geo.unsqueeze(-1) + depth_map) / 3, + depth_reg[:, :, 1:2], + ], + -1, + ) + outputs_depths.append(depth_ave) + + # angles + outputs_angle = self.angle_embed[lvl](hs[lvl]) + outputs_angles.append(outputs_angle) + + outputs_coord = 
torch.stack(outputs_coords) + outputs_class = torch.stack(outputs_classes) + outputs_3d_dim = torch.stack(outputs_3d_dims) + outputs_depth = torch.stack(outputs_depths) + outputs_angle = torch.stack(outputs_angles) + + out = {"scores": outputs_class[-1], "boxes_3d": outputs_coord[-1]} + out["size_3d"] = outputs_3d_dim[-1] + out["depth"] = outputs_depth[-1] + out["heading_angle"] = outputs_angle[-1] + if mode == "export": + out["scores"] = out["scores"].sigmoid() + return out + + out["pred_depth_map_logits"] = pred_depth_map_logits + + if self.aux_loss: + out["aux_outputs"] = self._set_aux_loss( + outputs_class, + outputs_coord, + outputs_3d_dim, + outputs_angle, + outputs_depth, + ) + + if mode == "loss": + return self.criterion(outputs=out, targets=targets) + + return out + + @torch.jit.unused + def _set_aux_loss( + self, + outputs_class: Tensor, + outputs_coord: Tensor, + outputs_3d_dim: Tensor, + outputs_angle: Tensor, + outputs_depth: Tensor, + ) -> list[dict[str, Tensor]]: + # this is a workaround to make torchscript happy, as torchscript + # doesn't support dictionary with non-homogeneous values, such + # as a dict having both a Tensor and a list. + return [ + {"scores": a, "boxes_3d": b, "size_3d": c, "heading_angle": d, "depth": e} + for a, b, c, d, e in zip( + outputs_class[:-1], + outputs_coord[:-1], + outputs_3d_dim[:-1], + outputs_angle[:-1], + outputs_depth[:-1], + ) + ] diff --git a/src/otx/algo/object_detection_3d/heads/__init__.py b/src/otx/algo/object_detection_3d/heads/__init__.py new file mode 100644 index 00000000000..72a504f2fbb --- /dev/null +++ b/src/otx/algo/object_detection_3d/heads/__init__.py @@ -0,0 +1,9 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""heads modules for 3d object detection.""" + +from .depth_predictor import DepthPredictor +from .depthaware_transformer import DepthAwareTransformerBuilder + +__all__ = ["DepthPredictor", "DepthAwareTransformerBuilder"] diff --git a/src/otx/algo/object_detection_3d/heads/depth_predictor.py b/src/otx/algo/object_detection_3d/heads/depth_predictor.py new file mode 100644 index 00000000000..4e5037c96d8 --- /dev/null +++ b/src/otx/algo/object_detection_3d/heads/depth_predictor.py @@ -0,0 +1,151 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""depth predictor transformer head for 3d object detection.""" + +from __future__ import annotations + +from typing import Callable + +import torch +from torch import nn +from torch.nn import functional + +from otx.algo.common.layers.transformer_layers import TransformerEncoder, TransformerEncoderLayer + + +class DepthPredictor(nn.Module): + """Depth predictor and depth encoder.""" + + def __init__( + self, + depth_num_bins: int, + depth_min: float, + depth_max: float, + hidden_dim: int, + activation: Callable[..., nn.Module] = nn.ReLU, + ) -> None: + """Initialize depth predictor and depth encoder. + + Args: + depth_num_bins (int): The number of depth bins. + depth_min (float): The minimum depth value. + depth_max (float): The maximum depth value. + hidden_dim (int): The dimension of the hidden layer. 
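+
+        Example (illustrative; hidden_dim=256 is assumed to match the 256-d depth positional embedding):
+            >>> predictor = DepthPredictor(depth_num_bins=80, depth_min=1e-3, depth_max=60.0, hidden_dim=256)
+            >>> predictor.depth_bin_values.shape  # depth_num_bins + 1 bin centers
+            torch.Size([81])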
+ """ + super().__init__() + self.depth_max = depth_max + + bin_size = 2 * (depth_max - depth_min) / (depth_num_bins * (1 + depth_num_bins)) + bin_indice = torch.linspace(0, depth_num_bins - 1, depth_num_bins) + bin_value = (bin_indice + 0.5).pow(2) * bin_size / 2 - bin_size / 8 + depth_min + bin_value = torch.cat([bin_value, torch.tensor([depth_max])], dim=0) + self.depth_bin_values = nn.Parameter(bin_value, requires_grad=False) + + # Create modules + d_model = hidden_dim + self.downsample = nn.Sequential( + nn.Conv2d(d_model, d_model, kernel_size=(3, 3), stride=(2, 2), padding=1), + nn.GroupNorm(32, d_model), + ) + self.proj = nn.Sequential(nn.Conv2d(d_model, d_model, kernel_size=(1, 1)), nn.GroupNorm(32, d_model)) + self.upsample = nn.Sequential(nn.Conv2d(d_model, d_model, kernel_size=(1, 1)), nn.GroupNorm(32, d_model)) + + self.depth_head = nn.Sequential( + nn.Conv2d(d_model, d_model, kernel_size=(3, 3), padding=1), + nn.GroupNorm(32, num_channels=d_model), + activation(), + nn.Conv2d(d_model, d_model, kernel_size=(3, 3), padding=1), + nn.GroupNorm(32, num_channels=d_model), + activation(), + ) + + self.depth_classifier = nn.Conv2d(d_model, depth_num_bins + 1, kernel_size=(1, 1)) + + depth_encoder_layer = TransformerEncoderLayer( + d_model, + nhead=8, + dim_feedforward=256, + dropout=0.1, + activation=activation, + normalize_before=False, + batch_first=False, + key_mask=True, + ) + + self.depth_encoder = TransformerEncoder(depth_encoder_layer, 1) + + self.depth_pos_embed = nn.Embedding(int(self.depth_max) + 1, 256) + + def forward( + self, + feature: list[torch.Tensor], + mask: torch.Tensor, + pos: torch.Tensor, + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """Forward pass of the DepthPredictor. + + Args: + feature (List[torch.Tensor]): The list of input feature tensors. + mask (torch.Tensor): The mask tensor. + pos (torch.Tensor): The positional tensor. + + Returns: + Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: The output tensors. + - depth_logits: The depth logits tensor. + - depth_embed: The depth embedding tensor. + - weighted_depth: The weighted depth tensor. + - depth_pos_embed_ip: The interpolated depth positional embedding tensor. + """ + # foreground depth map + src_16 = self.proj(feature[1]) + src_32 = self.upsample(functional.interpolate(feature[2], size=src_16.shape[-2:], mode="bilinear")) + src_8 = self.downsample(feature[0]) + src = (src_8 + src_16 + src_32) / 3 + + src = self.depth_head(src) + depth_logits = self.depth_classifier(src) + + depth_probs = functional.softmax(depth_logits, dim=1) + weighted_depth = (depth_probs * self.depth_bin_values.reshape(1, -1, 1, 1)).sum(dim=1) + # depth embeddings with depth positional encodings + b, c, h, w = src.shape + src = src.flatten(2).permute(2, 0, 1) + mask = mask.flatten(1) + pos = pos.flatten(2).permute(2, 0, 1) + + depth_embed = self.depth_encoder(src, mask, pos) + depth_embed = depth_embed.permute(1, 2, 0).reshape(b, c, h, w) + depth_pos_embed_ip = self.interpolate_depth_embed(weighted_depth) + depth_embed = depth_embed + depth_pos_embed_ip + + return depth_logits, depth_embed, weighted_depth, depth_pos_embed_ip + + def interpolate_depth_embed(self, depth: torch.Tensor) -> torch.Tensor: + """Interpolate depth embeddings based on depth values. + + Args: + depth (torch.Tensor): The depth tensor. + + Returns: + torch.Tensor: The interpolated depth embeddings. 
+ """ + depth = depth.clamp(min=0, max=self.depth_max) + pos = self.interpolate_1d(depth, self.depth_pos_embed) + return pos.permute(0, 3, 1, 2) + + def interpolate_1d(self, coord: torch.Tensor, embed: nn.Embedding) -> torch.Tensor: + """Interpolate 1D embeddings based on coordinates. + + Args: + coord (torch.Tensor): The coordinate tensor. + embed (nn.Embedding): The embedding module. + + Returns: + torch.Tensor: The interpolated embeddings. + """ + floor_coord = coord.floor() + delta = (coord - floor_coord).unsqueeze(-1) + floor_coord = floor_coord.long() + ceil_coord = (floor_coord + 1).clamp(max=embed.num_embeddings - 1) + return embed(floor_coord) * (1 - delta) + embed(ceil_coord) * delta diff --git a/src/otx/algo/object_detection_3d/heads/depthaware_transformer.py b/src/otx/algo/object_detection_3d/heads/depthaware_transformer.py new file mode 100644 index 00000000000..ecfe4a5008c --- /dev/null +++ b/src/otx/algo/object_detection_3d/heads/depthaware_transformer.py @@ -0,0 +1,856 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""depth aware transformer head for 3d object detection.""" +from __future__ import annotations + +import math +from typing import Any, Callable, ClassVar + +import torch +from torch import Tensor, nn +from torch.nn.init import constant_, normal_, xavier_uniform_ + +from otx.algo.detection.heads.rtdetr_decoder import MLP, MSDeformableAttention +from otx.algo.detection.utils.utils import inverse_sigmoid +from otx.algo.object_detection_3d.utils.utils import get_clones + + +def gen_sineembed_for_position(pos_tensor: Tensor) -> Tensor: + """Generate sine embeddings for position tensor. + + Args: + pos_tensor (Tensor): Position tensor of shape (n_query, bs, num_dims). + + Returns: + Tensor: Sine embeddings for position tensor of shape (n_query, bs, embedding_dim). 
+ """ + scale = 2 * math.pi + dim_t = torch.arange(128, dtype=torch.float32, device=pos_tensor.device) + dim_t = 10000 ** (2 * (dim_t // 2) / 128) + x_embed = pos_tensor[:, :, 0] * scale + y_embed = pos_tensor[:, :, 1] * scale + pos_x = x_embed[:, :, None] / dim_t + pos_y = y_embed[:, :, None] / dim_t + pos_x = torch.stack((pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), dim=3).flatten(2) + pos_y = torch.stack((pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()), dim=3).flatten(2) + if pos_tensor.size(-1) == 2: + pos = torch.cat((pos_y, pos_x), dim=2) + elif pos_tensor.size(-1) == 4: + w_embed = pos_tensor[:, :, 2] * scale + pos_w = w_embed[:, :, None] / dim_t + pos_w = torch.stack((pos_w[:, :, 0::2].sin(), pos_w[:, :, 1::2].cos()), dim=3).flatten(2) + + h_embed = pos_tensor[:, :, 3] * scale + pos_h = h_embed[:, :, None] / dim_t + pos_h = torch.stack((pos_h[:, :, 0::2].sin(), pos_h[:, :, 1::2].cos()), dim=3).flatten(2) + + pos = torch.cat((pos_y, pos_x, pos_w, pos_h), dim=2) + elif pos_tensor.size(-1) == 6: + for i in range(2, 6): # Compute sine embeds for l, r, t, b + embed = pos_tensor[:, :, i] * scale + pos_embed = embed[:, :, None] / dim_t + pos_embed = torch.stack((pos_embed[:, :, 0::2].sin(), pos_embed[:, :, 1::2].cos()), dim=3).flatten(2) + pos = pos_embed if i == 2 else torch.cat((pos, pos_embed), dim=2) + pos = torch.cat((pos_y, pos_x, pos), dim=2) + else: + msg = f"Unknown pos_tensor shape(-1):{pos_tensor.size(-1)}" + raise ValueError(msg) + return pos + + +class DepthAwareTransformer(nn.Module): + """DepthAwareTransformer module.""" + + def __init__( + self, + d_model: int = 256, + nhead: int = 8, + num_encoder_layers: int = 6, + num_decoder_layers: int = 6, + dim_feedforward: int = 1024, + dropout: float = 0.1, + activation: Callable[..., nn.Module] = nn.ReLU, + return_intermediate_dec: bool = False, + num_feature_levels: int = 4, + dec_n_points: int = 4, + enc_n_points: int = 4, + group_num: int = 11, + ) -> None: + """Initialize the DepthAwareTransformer module. + + Args: + d_model (int): The dimension of the input and output feature vectors. + nhead (int): The number of attention heads. + num_encoder_layers (int): The number of encoder layers. + num_decoder_layers (int): The number of decoder layers. + dim_feedforward (int): The dimension of the feedforward network. + dropout (float): The dropout rate. + activation (Callable[..., nn.Module]): The activation function. + return_intermediate_dec (bool): Whether to return intermediate decoder outputs. + num_feature_levels (int): The number of feature levels. + dec_n_points (int): The number of points for the decoder attention. + enc_n_points (int): The number of points for the encoder attention. + group_num (int): The number of groups for the two-stage training. 
+ """ + super().__init__() + + self.d_model = d_model + self.nhead = nhead + self.group_num = group_num + + encoder_layer = VisualEncoderLayer( + d_model, + dim_feedforward, + dropout, + activation, + num_feature_levels, + nhead, + enc_n_points, + ) + self.encoder = VisualEncoder(encoder_layer, num_encoder_layers) + + decoder_layer = DepthAwareDecoderLayer( + d_model, + dim_feedforward, + dropout, + activation, + num_feature_levels, + nhead, + dec_n_points, + group_num=group_num, + ) + self.decoder = DepthAwareDecoder( + decoder_layer, + num_decoder_layers, + return_intermediate_dec, + d_model, + activation, + ) + + self.level_embed = nn.Parameter(torch.Tensor(num_feature_levels, d_model)) + self.reference_points = nn.Linear(d_model, 2) + + self._reset_parameters() + + def _reset_parameters(self) -> None: + """Reset parameters of the model.""" + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + for m in self.modules(): + if isinstance(m, MSDeformableAttention): + m._reset_parameters() # noqa: SLF001 + xavier_uniform_(self.reference_points.weight.data, gain=1.0) + constant_(self.reference_points.bias.data, 0.0) + normal_(self.level_embed) + + def get_proposal_pos_embed(self, proposals: Tensor) -> Tensor: + """Generate position embeddings for proposal tensor. + + Args: + proposals (Tensor): Proposal tensor of shape (N, L, 6). + + Returns: + Tensor: Position embeddings for proposal tensor of shape (N, L, embedding_dim). + """ + num_pos_feats = 128 + temperature = 10000 + scale = 2 * math.pi + + dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=proposals.device) + dim_t = temperature ** (2 * (dim_t // 2) / num_pos_feats) + # N, L, 6 + proposals = proposals.sigmoid() * scale + # N, L, 6, 128 + pos = proposals[:, :, :, None] / dim_t + # N, L, 6, 64, 2 + return torch.stack((pos[:, :, :, 0::2].sin(), pos[:, :, :, 1::2].cos()), dim=4).flatten(2) + + def gen_encoder_output_proposals( + self, + memory: Tensor, + memory_padding_mask: Tensor, + spatial_shapes: list[tuple[int, int]], + ) -> tuple[Tensor, Tensor]: + """Generate encoder output and proposals. + + Args: + memory (Tensor): Memory tensor of shape (N, S, C). + memory_padding_mask (Tensor): Memory padding mask tensor of shape (N, S). + spatial_shapes (List[Tuple[int, int]]): List of spatial shapes. + + Returns: + Tuple[Tensor, Tensor]: Encoder output tensor of shape (N, S, C) and proposals tensor of shape (N, L, 6). 
+ """ + n_, _, _ = memory.shape + proposals = [] + _cur = 0 + for lvl, (h_, w_) in enumerate(spatial_shapes): + mask_flatten_ = memory_padding_mask[:, _cur : (_cur + h_ * w_)].view(n_, h_, w_, 1) + valid_h = torch.sum(~mask_flatten_[:, :, 0, 0], 1) + valid_w = torch.sum(~mask_flatten_[:, 0, :, 0], 1) + + grid_y, grid_x = torch.meshgrid( + torch.linspace(0, h_ - 1, h_, dtype=torch.float32, device=memory.device), + torch.linspace(0, w_ - 1, w_, dtype=torch.float32, device=memory.device), + ) + grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1) + + scale = torch.cat([valid_w.unsqueeze(-1), valid_h.unsqueeze(-1)], 1).view(n_, 1, 1, 2) + grid = (grid.unsqueeze(0).expand(n_, -1, -1, -1) + 0.5) / scale + + lr = torch.ones_like(grid) * 0.05 * (2.0**lvl) + tb = torch.ones_like(grid) * 0.05 * (2.0**lvl) + wh = torch.cat((lr, tb), -1) + + proposal = torch.cat((grid, wh), -1).view(n_, -1, 6) + proposals.append(proposal) + _cur += h_ * w_ + output_proposals = torch.cat(proposals, 1) + output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True) + output_proposals = torch.log(output_proposals / (1 - output_proposals)) + output_proposals = output_proposals.masked_fill(memory_padding_mask.unsqueeze(-1), float("inf")) + output_proposals = output_proposals.masked_fill(~output_proposals_valid, float("inf")) + + output_memory = memory + output_memory = output_memory.masked_fill(memory_padding_mask.unsqueeze(-1), float(0)) + output_memory = output_memory.masked_fill(~output_proposals_valid, float(0)) + output_memory = self.enc_output_norm(self.enc_output(output_memory)) + return output_memory, output_proposals + + def get_valid_ratio(self, mask: Tensor) -> Tensor: + """Calculate the valid ratio of the mask. + + Args: + mask (Tensor): The mask tensor. + + Returns: + Tensor: The valid ratio tensor. + """ + _, h, w = mask.shape + valid_h = torch.sum(~mask[:, :, 0], 1) + valid_w = torch.sum(~mask[:, 0, :], 1) + valid_ratio_h = valid_h.float() / h + valid_ratio_w = valid_w.float() / w + return torch.stack([valid_ratio_w, valid_ratio_h], -1) + + def forward( + self, + srcs: list[Tensor], + masks: list[Tensor], + pos_embeds: list[Tensor], + query_embed: Tensor, + depth_pos_embed: Tensor, + depth_pos_embed_ip: Tensor, + attn_mask: Tensor | None = None, + ) -> tuple[Tensor, Tensor, Tensor, Tensor, Tensor | None, Tensor | None]: + """Forward pass of the DepthAwareTransformer module. + + Args: + srcs (List[Tensor]): List of source tensors. + masks (List[Tensor]): List of mask tensors. + pos_embeds (List[Tensor]): List of position embedding tensors. + query_embed (Tensor | None): Query embedding tensor. Defaults to None. + depth_pos_embed (Tensor | None): Depth position embedding tensor. Defaults to None. + depth_pos_embed_ip (Tensor | None): Depth position embedding IP tensor. Defaults to None. + attn_mask (Tensor | None): Attention mask tensor. Defaults to None. + + Returns: + Tuple[Tensor, Tensor, Tensor, Tensor, Tensor | None, Tensor | None]: Tuple containing the output tensors. 
+ """ + # prepare input for encoder + src_flatten = [] + mask_flatten = [] + lvl_pos_embed_flatten = [] + spatial_shapes_list = [] + for lvl, (src, mask, pos_embed) in enumerate(zip(srcs, masks, pos_embeds)): + bs, c, h, w = src.shape + spatial_shape = (h, w) + spatial_shapes_list.append(spatial_shape) + src_ = src.flatten(2).transpose(1, 2) + pos_embed_ = pos_embed.flatten(2).transpose(1, 2) + lvl_pos_embed = pos_embed_ + self.level_embed[lvl].view(1, 1, -1) + + mask_ = mask.flatten(1) + lvl_pos_embed_flatten.append(lvl_pos_embed) + src_flatten.append(src_) + mask_flatten.append(mask_) + + src_flatten = torch.cat(src_flatten, 1) + lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1) + mask_flatten = torch.cat(mask_flatten, 1) + spatial_shapes = torch.as_tensor(spatial_shapes_list, dtype=torch.long, device=srcs[0].device) + level_start_index = torch.cat((spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1])) + valid_ratios = torch.stack([self.get_valid_ratio(m) for m in masks], 1) + + # encoder + memory = self.encoder( + src_flatten, + spatial_shapes, + level_start_index, + valid_ratios, + lvl_pos_embed_flatten, + mask_flatten, + ) + # enc_intermediate_output, enc_intermediate_refpoints = None + # prepare input for decoder + bs, _, c = memory.shape + query_embed, tgt = torch.split(query_embed, c, dim=1) + query_embed = query_embed.unsqueeze(0).expand(bs, -1, -1) + tgt = tgt.unsqueeze(0).expand(bs, -1, -1) + reference_points = self.reference_points(query_embed).sigmoid() + init_reference_out = reference_points + + depth_pos_embed = depth_pos_embed.flatten(2).permute(2, 0, 1) + depth_pos_embed_ip = depth_pos_embed_ip.flatten(2).permute(2, 0, 1) + mask_depth = masks[1].flatten(1) + + # decoder + # ipdb.set_trace() + hs, inter_references, inter_references_dim = self.decoder( + tgt, # .transpose(1,0), for DINO + reference_points, + memory, + spatial_shapes, + level_start_index, + valid_ratios, + query_embed, # ,INFo + mask_flatten, + depth_pos_embed, + mask_depth, + bs=bs, + depth_pos_embed_ip=depth_pos_embed_ip, + pos_embeds=pos_embeds, + attn_mask=attn_mask, + ) + + inter_references_out = inter_references + inter_references_out_dim = inter_references_dim + return hs, init_reference_out, inter_references_out, inter_references_out_dim, None, None + + +class VisualEncoderLayer(nn.Module): + """VisualEncoderLayer module.""" + + def __init__( + self, + d_model: int = 256, + d_ffn: int = 1024, + dropout: float = 0.1, + activation: Callable[..., nn.Module] = nn.ReLU, + n_levels: int = 4, + n_heads: int = 8, + n_points: int = 4, + ) -> None: + """Initialize the DepthAwareDecoderLayer. + + Args: + d_model (int): The input and output dimension of the layer. Defaults to 256. + d_ffn (int): The hidden dimension of the feed-forward network. Defaults to 1024. + dropout (float): The dropout rate. Defaults to 0.1. + activation (Callable[..., nn.Module]): The activation function. Defaults to nn.ReLU. + n_levels (int): The number of feature levels. Defaults to 4. + n_heads (int): The number of attention heads. Defaults to 8. + n_points (int): The number of sampling points for the MSDeformableAttention. Defaults to 4. 
+ """ + super().__init__() + + # self attention + self.self_attn = MSDeformableAttention(d_model, n_heads, n_levels, n_points) + self.dropout1 = nn.Dropout(dropout) + self.norm1 = nn.LayerNorm(d_model) + + # ffn + self.linear1 = nn.Linear(d_model, d_ffn) + self.activation = activation() + self.dropout2 = nn.Dropout(dropout) + self.linear2 = nn.Linear(d_ffn, d_model) + self.dropout3 = nn.Dropout(dropout) + self.norm2 = nn.LayerNorm(d_model) + + @staticmethod + def with_pos_embed(tensor: Tensor, pos: Tensor | None) -> Tensor: + """Add position embedding to the input tensor. + + Args: + tensor (Tensor): The input tensor. + pos (Tensor | None): The position embedding tensor. Defaults to None. + + Returns: + Tensor: The tensor with position embedding added. + """ + return tensor if pos is None else tensor + pos + + def forward_ffn(self, src: Tensor) -> Tensor: + """Forward pass of the ffn. + + Args: + src (Tensor): The input tensor. + + Returns: + Tensor: The output tensor. + """ + src2 = self.linear2(self.dropout2(self.activation(self.linear1(src)))) + src = src + self.dropout3(src2) + return self.norm2(src) + + def forward( + self, + src: Tensor, + pos: Tensor, + reference_points: Tensor, + spatial_shapes: list[tuple[int, int]], + level_start_index: Tensor, + padding_mask: Tensor | None = None, + ) -> Tensor: + """Forward pass of the VisualEncoderLayer. + + Args: + src (Tensor): The input tensor. + pos (Tensor): The position embedding tensor. + reference_points (Tensor): The reference points tensor. + spatial_shapes (List[Tuple[int, int]]): The list of spatial shapes. + level_start_index (Tensor): The level start index tensor. + padding_mask (Optional[Tensor]): The padding mask tensor. Defaults to None. + + Returns: + Tensor: The output tensor. + """ + # self attention + src2 = self.self_attn(self.with_pos_embed(src, pos), reference_points, src, spatial_shapes, padding_mask) + src = src + self.dropout1(src2) + src = self.norm1(src) + + # ffn + return self.forward_ffn(src) + + +class VisualEncoder(nn.Module): + """VisualEncoder module.""" + + def __init__(self, encoder_layer: nn.Module, num_layers: int): + """Initialize the DepthAwareDecoder. + + Args: + encoder_layer (nn.Module): The encoder layer module. + num_layers (int): The number of layers. + """ + super().__init__() + self.layers = get_clones(encoder_layer, num_layers) + self.num_layers = num_layers + + @staticmethod + def get_reference_points( + spatial_shapes: list[tuple[int, int]], + valid_ratios: Tensor, + device: torch.device, + ) -> Tensor: + """Generate reference points for each spatial level. + + Args: + spatial_shapes (List[Tuple[int, int]]): The list of spatial shapes. + valid_ratios (Tensor): The tensor of valid ratios. + device (torch.device): The device to use. + + Returns: + Tensor: The tensor of reference points. 
+ """ + reference_points_list = [] + for lvl, (h_, w_) in enumerate(spatial_shapes): + ref_y, ref_x = torch.meshgrid( + torch.linspace(0.5, h_ - 0.5, h_, dtype=torch.float32, device=device), + torch.linspace(0.5, w_ - 0.5, w_, dtype=torch.float32, device=device), + ) + ref_y = ref_y.reshape(-1)[None] / (valid_ratios[:, None, lvl, 1] * h_) + ref_x = ref_x.reshape(-1)[None] / (valid_ratios[:, None, lvl, 0] * w_) + ref = torch.stack((ref_x, ref_y), -1) + reference_points_list.append(ref) + reference_points = torch.cat(reference_points_list, 1) + return reference_points[:, :, None] * valid_ratios[:, None] + + def forward( + self, + src: Tensor, + spatial_shapes: list[tuple[int, int]], + level_start_index: Tensor, + valid_ratios: Tensor, + pos: Tensor | None = None, + padding_mask: Tensor | None = None, + ref_token_index: int | None = None, + ref_token_coord: Tensor | None = None, + ) -> Tensor: + """Forward pass of the VisualEncoder module. + + Args: + src (Tensor): The input tensor. + spatial_shapes (List[Tuple[int, int]]): The list of spatial shapes. + level_start_index (Tensor): The level start index tensor. + valid_ratios (Tensor): The tensor of valid ratios. + pos (Tensor | None): The position embedding tensor. Defaults to None. + padding_mask (Tensor | None): The padding mask tensor. Defaults to None. + ref_token_index (int | None): The reference token index. Defaults to None. + ref_token_coord (Tensor | None): The reference token coordinates. Defaults to None. + + Returns: + Tensor: The output tensor. + """ + output = src + reference_points = self.get_reference_points(spatial_shapes, valid_ratios, device=src.device) + for _, layer in enumerate(self.layers): + output = layer(output, pos, reference_points, spatial_shapes, level_start_index, padding_mask) + + return output + + +class DepthAwareDecoderLayer(nn.Module): + """DepthAwareDecoderLayer module.""" + + def __init__( + self, + d_model: int = 256, + d_ffn: int = 1024, + dropout: float = 0.1, + activation: Callable[..., nn.Module] = nn.ReLU, + n_levels: int = 4, + n_heads: int = 8, + n_points: int = 4, + group_num: int = 1, + ) -> None: + """Initialize the DepthAwareDecoderLayer. + + Args: + d_model (int): The input and output dimension of the layer. Defaults to 256. + d_ffn (int): The hidden dimension of the feed-forward network. Defaults to 1024. + dropout (float): The dropout rate. Defaults to 0.1. + activation (Callable[..., nn.Module]): The activation function. Defaults to nn.ReLU. + n_levels (int): The number of feature levels. Defaults to 4. + n_heads (int): The number of attention heads. Defaults to 8. + n_points (int): The number of sampling points for the MSDeformableAttention. Defaults to 4. + group_num (int): The number of groups for training. Defaults to 1. 
+ """ + super().__init__() + + # cross attention + self.cross_attn = MSDeformableAttention(d_model, n_heads, n_levels, n_points) + self.dropout1 = nn.Dropout(dropout) + self.norm1 = nn.LayerNorm(d_model) + + # depth cross attention + self.cross_attn_depth = nn.MultiheadAttention(d_model, n_heads, dropout=dropout) + self.dropout_depth = nn.Dropout(dropout) + self.norm_depth = nn.LayerNorm(d_model) + + # self attention + self.self_attn = nn.MultiheadAttention(d_model, n_heads, dropout=dropout) + self.dropout2 = nn.Dropout(dropout) + self.norm2 = nn.LayerNorm(d_model) + + # ffn + self.linear1 = nn.Linear(d_model, d_ffn) + self.activation = activation() + self.dropout3 = nn.Dropout(dropout) + self.linear2 = nn.Linear(d_ffn, d_model) + self.dropout4 = nn.Dropout(dropout) + self.norm3 = nn.LayerNorm(d_model) + + self.group_num = group_num + + # Decoder Self-Attention + self.sa_qcontent_proj = nn.Linear(d_model, d_model) + self.sa_qpos_proj = nn.Linear(d_model, d_model) + self.sa_kcontent_proj = nn.Linear(d_model, d_model) + self.sa_kpos_proj = nn.Linear(d_model, d_model) + self.sa_v_proj = nn.Linear(d_model, d_model) + self.nhead = n_heads + + @staticmethod + def with_pos_embed(tensor: Tensor, pos: Tensor | None) -> Tensor: + """Add position embedding to the input tensor. + + Args: + tensor (Tensor): The input tensor. + pos (Tensor | None): The position embedding tensor. Defaults to None. + + Returns: + Tensor: The tensor with position embedding added. + """ + return tensor if pos is None else tensor + pos + + def forward_ffn(self, tgt: Tensor) -> Tensor: + """Forward pass of the ffn. + + Args: + tgt (Tensor): The input tensor. + + Returns: + Tensor: The output tensor. + """ + tgt2 = self.linear2(self.dropout3(self.activation(self.linear1(tgt)))) + tgt = tgt + self.dropout4(tgt2) + return self.norm3(tgt) + + def forward( + self, + tgt: Tensor, + query_pos: Tensor, + reference_points: Tensor, + src: Tensor, + src_spatial_shapes: list[tuple[int, int]], + level_start_index: Tensor, + src_padding_mask: Tensor, + depth_pos_embed: Tensor, + mask_depth: Tensor, + bs: int, + query_sine_embed: Tensor | None = None, + is_first: bool | None = None, + depth_pos_embed_ip: Tensor | None = None, + pos_embeds: list[Tensor] | None = None, + self_attn_mask: Tensor | None = None, + query_pos_un: Tensor | None = None, + ) -> Tensor: + """Forward pass of the DepthAwareDecoder module. + + Args: + tgt (Tensor): The input tensor. + query_pos (Tensor): The query position tensor. + reference_points (Tensor): The reference points tensor. + src (Tensor): The source tensor. + src_spatial_shapes (List[Tuple[int, int]]): The list of spatial shapes. + level_start_index (Tensor): The level start index tensor. + src_padding_mask (Tensor): The source padding mask tensor. + depth_pos_embed (Tensor): The depth position embedding tensor. + mask_depth (Tensor): The depth mask tensor. + bs (int): The batch size. + query_sine_embed (Tensor | None): The query sine embedding tensor. Defaults to None. + is_first (bool | None): Whether it is the first iteration. Defaults to None. + depth_pos_embed_ip (Tensor | None): The depth position embedding tensor for the iterative process. + Defaults to None. + pos_embeds (List[Tensor] | None): The list of position embedding tensors. Defaults to None. + self_attn_mask (Tensor | None): The self-attention mask tensor. Defaults to None. + query_pos_un (Tensor | None): The unnormalized query position tensor. Defaults to None. + + Returns: + Tensor: The output tensor. 
+ """ + # depth cross attention + tgt2 = self.cross_attn_depth( + tgt.transpose(0, 1), + depth_pos_embed, + depth_pos_embed, + key_padding_mask=mask_depth, + )[0].transpose(0, 1) + + tgt = tgt + self.dropout_depth(tgt2) + tgt = self.norm_depth(tgt) + + # self attention + q = k = self.with_pos_embed(tgt, query_pos) + + q_content = self.sa_qcontent_proj(q) + q_pos = self.sa_qpos_proj(q) + k_content = self.sa_kcontent_proj(k) + k_pos = self.sa_kpos_proj(k) + v = self.sa_v_proj(tgt) + q = q_content + q_pos + k = k_content + k_pos + + q = q.transpose(0, 1) + k = k.transpose(0, 1) + v = tgt.transpose(0, 1) + num_queries = q.shape[0] + + if self.training: + num_noise = num_queries - self.group_num * 50 + num_queries = self.group_num * 50 + q_noise = q[:num_noise].repeat(1, self.group_num, 1) + k_noise = k[:num_noise].repeat(1, self.group_num, 1) + v_noise = v[:num_noise].repeat(1, self.group_num, 1) + q = q[num_noise:] + k = k[num_noise:] + v = v[num_noise:] + q = torch.cat(q.split(num_queries // self.group_num, dim=0), dim=1) + k = torch.cat(k.split(num_queries // self.group_num, dim=0), dim=1) + v = torch.cat(v.split(num_queries // self.group_num, dim=0), dim=1) + q = torch.cat([q_noise, q], dim=0) + k = torch.cat([k_noise, k], dim=0) + v = torch.cat([v_noise, v], dim=0) + + tgt2 = self.self_attn(q, k, v)[0] + tgt2 = torch.cat(tgt2.split(bs, dim=1), dim=0).transpose(0, 1) if self.training else tgt2.transpose(0, 1) + tgt = tgt + self.dropout2(tgt2) + tgt = self.norm2(tgt) + + tgt2 = self.cross_attn( + self.with_pos_embed(tgt, query_pos), + reference_points, + src, + src_spatial_shapes, + src_padding_mask, + ) + tgt = tgt + self.dropout1(tgt2) + tgt = self.norm1(tgt) + + return self.forward_ffn(tgt) + + +class DepthAwareDecoder(nn.Module): + """DepthAwareDecoder module.""" + + def __init__( + self, + decoder_layer: nn.Module, + num_layers: int, + return_intermediate: bool, + d_model: int, + activation: Callable[..., nn.Module] = nn.ReLU, + ) -> None: + """Initialize the DepthAwareDecoder. + + Args: + decoder_layer (nn.Module): The decoder layer module. + num_layers (int): The number of layers. + return_intermediate (bool, optional): Whether to return intermediate outputs. Defaults to False. + d_model (int | None, optional): The input and output dimension of the layer. Defaults to None. + """ + super().__init__() + self.layers = get_clones(decoder_layer, num_layers) + self.num_layers = num_layers + self.return_intermediate = return_intermediate + + self.bbox_embed = None + self.dim_embed = None + self.class_embed = None + + self.query_scale = MLP(d_model, d_model, d_model, 2, activation=activation) + self.ref_point_head = MLP(d_model, d_model, 2, 2, activation=activation) + + def forward( + self, + tgt: Tensor, + reference_points: Tensor, + src: Tensor, + src_spatial_shapes: list[tuple[int, int]], + src_level_start_index: Tensor, + src_valid_ratios: Tensor, + query_pos: Tensor | None = None, + src_padding_mask: Tensor | None = None, + depth_pos_embed: Tensor | None = None, + mask_depth: Tensor | None = None, + bs: int | None = None, + depth_pos_embed_ip: Tensor | None = None, + pos_embeds: list[Tensor] | None = None, + attn_mask: Tensor | None = None, + ) -> Tensor: + """Forward pass of the DepthAwareDecoder module. + + Args: + tgt (Tensor): The input tensor. + reference_points (Tensor): The reference points tensor. + src (Tensor): The source tensor. + src_spatial_shapes (List[Tuple[int, int]]): The list of spatial shapes. + src_level_start_index (Tensor): The level start index tensor. 
+ src_valid_ratios (Tensor): The tensor of valid ratios. + query_pos (Tensor | None): The query position tensor. Defaults to None. + src_padding_mask (Tensor | None): The source padding mask tensor. Defaults to None. + depth_pos_embed (Tensor | None): The depth position embedding tensor. Defaults to None. + mask_depth (Tensor | None): The depth mask tensor. Defaults to None. + bs (int | None): The batch size. Defaults to None. + depth_pos_embed_ip (Tensor | None): The depth position embedding tensor for the iterative process. + Defaults to None. + pos_embeds (List[Tensor] | None): The list of position embedding tensors. Defaults to None. + attn_mask (Tensor | None): The self-attention mask tensor. Defaults to None. + + Returns: + Tensor: The output tensor. + """ + output = tgt + + intermediate = [] + intermediate_reference_points = [] + intermediate_reference_dims = [] + bs = src.shape[0] + + for lid, layer in enumerate(self.layers): + if reference_points.shape[-1] == 6: + reference_points_input = ( + reference_points[:, :, None] + * torch.cat([src_valid_ratios, src_valid_ratios, src_valid_ratios], -1)[:, None] + ) + else: + if reference_points.shape[-1] != 2: + msg = f"Wrong reference_points shape[-1]:{reference_points.shape[-1]}" + raise ValueError(msg) + + reference_points_input = reference_points[:, :, None] * src_valid_ratios[:, None] + + ###conditional + output = layer( + output, + query_pos, + reference_points_input, + src, + src_spatial_shapes, + src_level_start_index, + src_padding_mask, + depth_pos_embed, + mask_depth, + bs, + query_sine_embed=None, + is_first=(lid == 0), + depth_pos_embed_ip=depth_pos_embed_ip, + pos_embeds=pos_embeds, + self_attn_mask=attn_mask, + query_pos_un=None, + ) + + # implementation for iterative bounding box refinement + if self.bbox_embed is not None: + tmp = self.bbox_embed[lid](output) + if reference_points.shape[-1] == 6: + new_reference_points = tmp + inverse_sigmoid(reference_points) + new_reference_points = new_reference_points.sigmoid() + else: + new_reference_points = tmp + new_reference_points[..., :2] = tmp[..., :2] + inverse_sigmoid(reference_points) + new_reference_points = new_reference_points.sigmoid() + reference_points = new_reference_points.detach() + + reference_dims: Tensor + if self.dim_embed is not None: + reference_dims = self.dim_embed[lid](output) + + if self.return_intermediate: + intermediate.append(output) + intermediate_reference_points.append(reference_points) + intermediate_reference_dims.append(reference_dims) + + if self.return_intermediate: + return torch.stack(intermediate), torch.stack(intermediate_reference_points), torch.stack( + intermediate_reference_dims, + ) + + return output, reference_points + + +class DepthAwareTransformerBuilder: + """DepthAwareTransformerBuilder.""" + + CFG: ClassVar[dict[str, Any]] = { + "monodetr_50": { + "d_model": 256, + "dropout": 0.1, + "nhead": 8, + "dim_feedforward": 256, + "num_encoder_layers": 3, + "num_decoder_layers": 3, + "return_intermediate_dec": True, + "num_feature_levels": 4, + "dec_n_points": 4, + "enc_n_points": 4, + }, + } + + def __new__(cls, model_name: str) -> DepthAwareTransformer: + """Create the DepthAwareTransformer.""" + return DepthAwareTransformer(**cls.CFG[model_name]) diff --git a/src/otx/algo/object_detection_3d/losses/__init__.py b/src/otx/algo/object_detection_3d/losses/__init__.py new file mode 100644 index 00000000000..fb407a5a3ad --- /dev/null +++ b/src/otx/algo/object_detection_3d/losses/__init__.py @@ -0,0 +1,8 @@ +# Copyright (C) 2024 Intel 
Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""Loss functions for 3d object detection.""" + +from .monodetr_loss import MonoDETRCriterion + +__all__ = ["MonoDETRCriterion"] diff --git a/src/otx/algo/object_detection_3d/losses/ddn_loss.py b/src/otx/algo/object_detection_3d/losses/ddn_loss.py new file mode 100644 index 00000000000..e3a4238be03 --- /dev/null +++ b/src/otx/algo/object_detection_3d/losses/ddn_loss.py @@ -0,0 +1,251 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""ddn loss for MonoDETR model.""" +from __future__ import annotations + +import math + +import torch +from torch import nn + +from otx.algo.common.losses.focal_loss import FocalLoss + + +def compute_fg_mask( + gt_boxes2d: torch.Tensor, + shape: tuple[int, int], + num_gt_per_img: int, + downsample_factor: int = 1, + device: torch.device | None = None, +) -> torch.Tensor: + """Compute foreground mask for images. + + Args: + gt_boxes2d [torch.Tensor(B, N, 4)]: 2D box labels + shape [Tuple[int, int]]: Foreground mask desired shape + downsample_factor [int]: Downsample factor for image + device [torch.device]: Foreground mask desired device + + Returns: + fg_mask [torch.Tensor(shape)]: Foreground mask + """ + if device is None: + device = torch.device("cpu") + fg_mask = torch.zeros(shape, dtype=torch.bool, device=device) + + # Set box corners + gt_boxes2d /= downsample_factor + gt_boxes2d[:, :2] = torch.floor(gt_boxes2d[:, :2]) + gt_boxes2d[:, 2:] = torch.ceil(gt_boxes2d[:, 2:]) + gt_boxes2d = gt_boxes2d.long() + + # Set all values within each box to True + gt_boxes2d = gt_boxes2d.split(num_gt_per_img, dim=0) + b = len(gt_boxes2d) + for i in range(b): + for n in range(gt_boxes2d[i].shape[0]): + u1, v1, u2, v2 = gt_boxes2d[i][n] + fg_mask[i, v1:v2, u1:u2] = True + + return fg_mask + + +class Balancer(nn.Module): + """Fixed foreground/background loss balancer.""" + + def __init__(self, fg_weight: float, bg_weight: float, downsample_factor: int = 1): + """Initialize fixed foreground/background loss balancer. + + Args: + fg_weight [float]: Foreground loss weight + bg_weight [float]: Background loss weight + downsample_factor [int]: Depth map downsample factor + """ + super().__init__() + self.fg_weight = fg_weight + self.bg_weight = bg_weight + self.downsample_factor = downsample_factor + + def forward( + self, + loss: torch.Tensor, + gt_boxes2d: torch.Tensor, + num_gt_per_img: int, + ) -> tuple[torch.Tensor, dict[float, float]]: + """Forward pass. 
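+
+ Pixels inside any ground-truth 2D box are weighted by fg_weight and all other pixels
+ by bg_weight; both contributions are normalized by the total pixel count and summed.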
+ + Args: + loss [torch.Tensor(B, H, W)]: Pixel-wise loss + gt_boxes2d [torch.Tensor (B, N, 4)]: 2D box labels for foreground/background balancing + + Returns: + loss [torch.Tensor(1)]: Total loss after foreground/background balancing + tb_dict [dict[float]]: All losses to log in tensorboard + """ + # Compute masks + fg_mask = compute_fg_mask( + gt_boxes2d=gt_boxes2d, + shape=loss.shape, + num_gt_per_img=num_gt_per_img, + downsample_factor=self.downsample_factor, + device=loss.device, + ) + bg_mask = ~fg_mask + + # Compute balancing weights + weights = self.fg_weight * fg_mask + self.bg_weight * bg_mask + num_pixels = fg_mask.sum() + bg_mask.sum() + + # Compute losses + loss *= weights + fg_loss = loss[fg_mask].sum() / num_pixels + bg_loss = loss[bg_mask].sum() / num_pixels + + # return total loss + return fg_loss + bg_loss + + +class DDNLoss(nn.Module): + """DDNLoss module for computing the loss for MonoDETR model.""" + + def __init__( + self, + alpha: float = 0.25, + gamma: float = 2.0, + fg_weight: float = 13, + bg_weight: float = 1, + downsample_factor: int = 1, + ) -> None: + """Initializes DDNLoss module. + + Args: + weight [float]: Loss function weight + alpha [float]: Alpha value for Focal Loss + gamma [float]: Gamma value for Focal Loss + disc_cfg [dict]: Depth discretiziation configuration + fg_weight [float]: Foreground loss weight + bg_weight [float]: Background loss weight + downsample_factor [int]: Depth map downsample factor + """ + super().__init__() + self.balancer = Balancer(downsample_factor=downsample_factor, fg_weight=fg_weight, bg_weight=bg_weight) + + # Set loss function + self.alpha = alpha + self.gamma = gamma + self.loss_func = FocalLoss(alpha=self.alpha, gamma=self.gamma, reduction="none") + + def build_target_depth_from_3dcenter( + self, + depth_logits: torch.Tensor, + gt_boxes2d: torch.Tensor, + gt_center_depth: torch.Tensor, + num_gt_per_img: int, + ) -> torch.Tensor: + """Builds target depth map from 3D center depth. + + Args: + depth_logits: torch.Tensor(B, D+1, H, W)]: Predicted depth logits + gt_boxes2d [torch.Tensor (B, N, 4)]: 2D box labels for foreground/background balancing + gt_center_depth [torch.Tensor(B, N)]: 3D center depth + num_gt_per_img: [int]: Number of ground truth boxes per image + """ + b, _, h, w = depth_logits.shape + depth_maps = torch.zeros((b, h, w), device=depth_logits.device, dtype=depth_logits.dtype) + + # Set box corners + gt_boxes2d[:, :2] = torch.floor(gt_boxes2d[:, :2]) + gt_boxes2d[:, 2:] = torch.ceil(gt_boxes2d[:, 2:]) + gt_boxes2d = gt_boxes2d.long() + + # Set all values within each box to True + gt_boxes2d = gt_boxes2d.split(num_gt_per_img, dim=0) + gt_center_depth = gt_center_depth.split(num_gt_per_img, dim=0) + b = len(gt_boxes2d) + for i in range(b): + center_depth_per_batch = gt_center_depth[i] + center_depth_per_batch, sorted_idx = torch.sort(center_depth_per_batch, dim=0, descending=True) + gt_boxes_per_batch = gt_boxes2d[i][sorted_idx] + for n in range(gt_boxes_per_batch.shape[0]): + u1, v1, u2, v2 = gt_boxes_per_batch[n] + depth_maps[i, v1:v2, u1:u2] = center_depth_per_batch[n] + + return depth_maps + + def bin_depths( + self, + depth_map: torch.Tensor, + mode: str = "LID", + depth_min: float = 1e-3, + depth_max: float = 60, + num_bins: int = 80, + target: bool = False, + ) -> torch.Tensor: + """Converts depth map into bin indices. 
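+
+ For the default LID mode, bin boundaries grow linearly in width:
+ depth_i = depth_min + bin_size * i * (i + 1) / 2 with
+ bin_size = 2 * (depth_max - depth_min) / (num_bins * (num_bins + 1)),
+ and the implementation below recovers the index i by inverting this quadratic.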
+ + Args: + depth_map [torch.Tensor(H, W)]: Depth Map + mode [string]: Discretiziation mode (See https://arxiv.org/pdf/2005.13423.pdf for more details) + UD: Uniform discretiziation + LID: Linear increasing discretiziation + SID: Spacing increasing discretiziation + depth_min [float]: Minimum depth value + depth_max [float]: Maximum depth value + num_bins [int]: Number of depth bins + target [bool]: Whether the depth bins indices will be used for a target tensor in loss comparison + + Returns: + indices [torch.Tensor(H, W)]: Depth bin indices + """ + if mode == "UD": + bin_size = (depth_max - depth_min) / num_bins + indices = (depth_map - depth_min) / bin_size + elif mode == "LID": + bin_size = 2 * (depth_max - depth_min) / (num_bins * (1 + num_bins)) + indices = -0.5 + 0.5 * torch.sqrt(1 + 8 * (depth_map - depth_min) / bin_size) + elif mode == "SID": + indices = ( + num_bins + * (torch.log(1 + depth_map) - math.log(1 + depth_min)) + / (math.log(1 + depth_max) - math.log(1 + depth_min)) + ) + else: + raise NotImplementedError + + if target: + # Remove indicies outside of bounds + mask = (indices < 0) | (indices > num_bins) | (~torch.isfinite(indices)) + indices[mask] = num_bins + + # Convert to integer + indices = indices.type(torch.int64) + + return indices + + def forward( + self, + depth_logits: torch.Tensor, + gt_boxes2d: torch.Tensor, + num_gt_per_img: int, + gt_center_depth: torch.Tensor, + ) -> torch.Tensor: + """Gets depth_map loss. + + Args: + depth_logits: torch.Tensor(B, D+1, H, W)]: Predicted depth logits + gt_boxes2d [torch.Tensor (B, N, 4)]: 2D box labels for foreground/background balancing + num_gt_per_img: [int]: Number of ground truth boxes per image + gt_center_depth: [torch.Tensor(B, N)]: 3D center depth + + Returns: + loss [torch.Tensor(1)]: Depth classification network loss + """ + # Bin depth map to create target + depth_maps = self.build_target_depth_from_3dcenter(depth_logits, gt_boxes2d, gt_center_depth, num_gt_per_img) + depth_target = self.bin_depths(depth_maps, target=True) + # Compute loss + loss = self.loss_func(depth_logits, depth_target) + # Compute foreground/background balancing + + return self.balancer(loss=loss, gt_boxes2d=gt_boxes2d, num_gt_per_img=num_gt_per_img) diff --git a/src/otx/algo/object_detection_3d/losses/monodetr_loss.py b/src/otx/algo/object_detection_3d/losses/monodetr_loss.py new file mode 100644 index 00000000000..ebc98d45a51 --- /dev/null +++ b/src/otx/algo/object_detection_3d/losses/monodetr_loss.py @@ -0,0 +1,247 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""main loss for MonoDETR model.""" +from __future__ import annotations + +from typing import TYPE_CHECKING, Callable + +import torch +from torch import nn +from torch.nn import functional +from torchvision.ops import box_convert + +from otx.algo.common.losses.focal_loss import py_sigmoid_focal_loss +from otx.algo.common.losses.iou_loss import giou_loss +from otx.algo.object_detection_3d.matchers.matcher_3d import HungarianMatcher3D +from otx.algo.object_detection_3d.utils.utils import box_cxcylrtb_to_xyxy + +from .ddn_loss import DDNLoss + +if TYPE_CHECKING: + from torch import Tensor + + +class MonoDETRCriterion(nn.Module): + """This class computes the loss for MonoDETR.""" + + def __init__(self, num_classes: int, weight_dict: dict, focal_alpha: float, group_num: int = 11) -> None: + """MonoDETRCriterion. 
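+
+ Any loss listed in loss_map but missing from weight_dict is given a default weight of 1,
+ so callers only need to specify the terms they want to re-scale.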
+ + Args: + num_classes: number of object categories, omitting the special no-object category + matcher: module able to compute a matching between targets and proposals + weight_dict: dict containing as key the names of the losses and as values their relative weight. + focal_alpha: alpha in Focal Loss + group_num: number of groups for data parallelism + """ + super().__init__() + self.num_classes = num_classes + self.matcher = HungarianMatcher3D(cost_class=2, cost_3dcenter=10, cost_bbox=5, cost_giou=2) + self.weight_dict = weight_dict + for name in self.loss_map: + if name not in self.weight_dict: + self.weight_dict[name] = 1 + self.focal_alpha = focal_alpha + self.ddn_loss = DDNLoss() # for depth map + self.group_num = group_num + + def loss_labels(self, outputs: dict, targets: list, indices: list, num_boxes: int) -> dict[str, Tensor]: + """Classification loss.""" + src_logits = outputs["scores"] + + idx = self._get_src_permutation_idx(indices) + target_classes_o = torch.cat([t["labels"][J] for t, (_, J) in zip(targets, indices)]) + target_classes = torch.full(src_logits.shape[:2], self.num_classes, dtype=torch.int64, device=src_logits.device) + + target_classes[idx] = target_classes_o.squeeze().long() + + target_classes_onehot = torch.zeros( + [src_logits.shape[0], src_logits.shape[1], src_logits.shape[2] + 1], + dtype=src_logits.dtype, + layout=src_logits.layout, + device=src_logits.device, + ) + target_classes_onehot.scatter_(2, target_classes.unsqueeze(-1), 1) + + target_classes_onehot = target_classes_onehot[:, :, :-1] + loss_ce = py_sigmoid_focal_loss( + pred=src_logits, + target=target_classes_onehot, + avg_factor=num_boxes, + alpha=self.focal_alpha, + reduction="mean", + ) + + return {"loss_ce": loss_ce} + + def loss_3dcenter(self, outputs: dict, targets: list, indices: list, num_boxes: int) -> dict[str, Tensor]: + """Compute the loss for the 3D center prediction.""" + idx = self._get_src_permutation_idx(indices) + src_3dcenter = outputs["boxes_3d"][:, :, 0:2][idx] + target_3dcenter = torch.cat([t["boxes_3d"][:, 0:2][i] for t, (_, i) in zip(targets, indices)], dim=0) + + loss_3dcenter = functional.l1_loss(src_3dcenter, target_3dcenter, reduction="none") + return {"loss_center": loss_3dcenter.sum() / num_boxes} + + def loss_boxes(self, outputs: dict, targets: list, indices: list, num_boxes: int) -> dict[str, Tensor]: + """Compute l1 loss.""" + idx = self._get_src_permutation_idx(indices) + src_2dboxes = outputs["boxes_3d"][:, :, 2:6][idx] + target_2dboxes = torch.cat([t["boxes_3d"][:, 2:6][i] for t, (_, i) in zip(targets, indices)], dim=0) + + # l1 + loss_bbox = functional.l1_loss(src_2dboxes, target_2dboxes, reduction="none") + return {"loss_bbox": loss_bbox.sum() / num_boxes} + + def loss_giou(self, outputs: dict, targets: list, indices: list, num_boxes: int) -> dict[str, Tensor]: + """Compute the GIoU loss.""" + # giou + idx = self._get_src_permutation_idx(indices) + src_boxes = outputs["boxes_3d"][idx] + target_boxes = torch.cat([t["boxes_3d"][i] for t, (_, i) in zip(targets, indices)], dim=0) + loss_giou = giou_loss(box_cxcylrtb_to_xyxy(src_boxes), box_cxcylrtb_to_xyxy(target_boxes)) + return {"loss_giou": loss_giou} + + def loss_depths(self, outputs: dict, targets: list, indices: list, num_boxes: int) -> dict[str, Tensor]: + """Compute the loss for the depth prediction.""" + idx = self._get_src_permutation_idx(indices) + + src_depths = outputs["depth"][idx] + target_depths = torch.cat([t["depth"][i] for t, (_, i) in zip(targets, indices)], dim=0).squeeze() + + depth_input, 
depth_log_variance = src_depths[:, 0], src_depths[:, 1] + depth_loss = 1.4142 * torch.exp(-depth_log_variance) * torch.abs(depth_input - target_depths) + torch.abs( + depth_log_variance, + ) + return {"loss_depth": depth_loss.sum() / num_boxes} + + def loss_dims(self, outputs: dict, targets: list, indices: list, num_boxes: int) -> dict[str, Tensor]: + """Compute the loss for the dimension prediction.""" + idx = self._get_src_permutation_idx(indices) + src_dims = outputs["size_3d"][idx] + target_dims = torch.cat([t["size_3d"][i] for t, (_, i) in zip(targets, indices)], dim=0) + + dimension = target_dims.clone().detach() + dim_loss = torch.abs(src_dims - target_dims) + dim_loss /= dimension + with torch.no_grad(): + compensation_weight = functional.l1_loss(src_dims, target_dims) / dim_loss.mean() + dim_loss *= compensation_weight + return {"loss_dim": dim_loss.sum() / num_boxes} + + def loss_angles(self, outputs: dict, targets: list, indices: list, num_boxes: int) -> dict[str, Tensor]: + """Compute the loss for the angle prediction.""" + idx = self._get_src_permutation_idx(indices) + heading_input = outputs["heading_angle"][idx] + target_heading_angle = torch.cat([t["heading_angle"][i] for t, (_, i) in zip(targets, indices)], dim=0) + heading_target_cls = target_heading_angle[:, 0].view(-1).long() + heading_target_res = target_heading_angle[:, 1].view(-1) + + heading_input = heading_input.view(-1, 24) + + # classification loss + heading_input_cls = heading_input[:, 0:12] + cls_loss = functional.cross_entropy(heading_input_cls, heading_target_cls, reduction="none") + + # regression loss + heading_input_res = heading_input[:, 12:24] + cls_onehot = ( + torch.zeros(heading_target_cls.shape[0], 12) + .to(device=heading_input.device) + .scatter_(dim=1, index=heading_target_cls.view(-1, 1), value=1) + ) + heading_input_res = torch.sum(heading_input_res * cls_onehot, 1) + reg_loss = functional.l1_loss(heading_input_res, heading_target_res, reduction="none") + + angle_loss = cls_loss + reg_loss + return {"loss_angle": angle_loss.sum() / num_boxes} + + def loss_depth_map(self, outputs: dict, targets: list, indices: list, num_boxes: int) -> dict[str, Tensor]: + """Depth map loss.""" + depth_map_logits = outputs["pred_depth_map_logits"] + + num_gt_per_img = [len(t["boxes"]) for t in targets] + gt_boxes2d = torch.cat([t["boxes"] for t in targets], dim=0) * torch.tensor( + [80, 24, 80, 24], + device=depth_map_logits.device, + ) + gt_boxes2d = box_convert(gt_boxes2d, "cxcywh", "xyxy") + gt_center_depth = torch.cat([t["depth"] for t in targets], dim=0).squeeze(dim=1) + return {"loss_depth_map": self.ddn_loss(depth_map_logits, gt_boxes2d, num_gt_per_img, gt_center_depth)} + + def _get_src_permutation_idx( + self, + indices: list[tuple[torch.Tensor, torch.Tensor]], + ) -> tuple[torch.Tensor, torch.Tensor]: + # permute predictions following indices + batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)]) + src_idx = torch.cat([src for (src, _) in indices]) + return batch_idx, src_idx + + def _get_tgt_permutation_idx( + self, + indices: list[tuple[torch.Tensor, torch.Tensor]], + ) -> tuple[torch.Tensor, torch.Tensor]: + # permute targets following indices + batch_idx = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)]) + tgt_idx = torch.cat([tgt for (_, tgt) in indices]) + return batch_idx, tgt_idx + + @property + def loss_map(self) -> dict[str, Callable]: + """Return the loss map.""" + return { + "loss_ce": self.loss_labels, + "loss_bbox": 
self.loss_boxes, + "loss_giou": self.loss_giou, + "loss_depth": self.loss_depths, + "loss_dim": self.loss_dims, + "loss_angle": self.loss_angles, + "loss_center": self.loss_3dcenter, + "loss_depth_map": self.loss_depth_map, + } + + def forward( + self, + outputs: dict[str, torch.Tensor], + targets: list[dict[str, torch.Tensor]], + ) -> dict[str, torch.Tensor]: + """This performs the loss computation. + + Args: + outputs: dict of tensors, see the output specification of the model for the format + targets: list of dicts, such that len(targets) == batch_size. + The expected keys in each dict depends on the losses applied, see each loss' doc + """ + outputs_without_aux = {k: v for k, v in outputs.items() if k != "aux_outputs"} + group_num = self.group_num if self.training else 1 + + # Retrieve the matching between the outputs of the last layer and the targets + indices = self.matcher(outputs_without_aux, targets, group_num=group_num) + + # Compute the average number of target boxes across all nodes, for normalization purposes + num_boxes_int = sum([len(t["labels"]) for t in targets]) * group_num + num_boxes = torch.as_tensor([num_boxes_int], dtype=torch.float, device=next(iter(outputs.values())).device) + num_boxes = torch.clamp(num_boxes, min=1) + + # Compute all the requested losses + losses = {} + for loss in self.loss_map.values(): + losses.update(loss(outputs, targets, indices, num_boxes)) + + losses = {k: losses[k] * self.weight_dict[k] for k in losses} + + # In case of auxiliary losses, we repeat this process with the output of each intermediate layer. + if "aux_outputs" in outputs: + for i, aux_outputs in enumerate(outputs["aux_outputs"]): + indices = self.matcher(aux_outputs, targets, group_num=group_num) + for name, loss in self.loss_map.items(): + if name == "loss_depth_map": + # Intermediate masks losses are too costly to compute, we ignore them. + continue + l_dict = loss(aux_outputs, targets, indices, num_boxes.item()) + l_dict = {k + f"_aux_{i}": v * self.weight_dict[k] for k, v in l_dict.items()} + losses.update(l_dict) + + return losses diff --git a/src/otx/algo/object_detection_3d/matchers/__init__.py b/src/otx/algo/object_detection_3d/matchers/__init__.py new file mode 100644 index 00000000000..4c217a82f7e --- /dev/null +++ b/src/otx/algo/object_detection_3d/matchers/__init__.py @@ -0,0 +1,8 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""Matchers modules for 3d object detection.""" + +from .matcher_3d import HungarianMatcher3D + +__all__ = ["HungarianMatcher3D"] diff --git a/src/otx/algo/object_detection_3d/matchers/matcher_3d.py b/src/otx/algo/object_detection_3d/matchers/matcher_3d.py new file mode 100644 index 00000000000..2e6e7ac8ddf --- /dev/null +++ b/src/otx/algo/object_detection_3d/matchers/matcher_3d.py @@ -0,0 +1,119 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""HungarianMatcher3D module for 3d object detection.""" + +import numpy as np +import torch +from scipy.optimize import linear_sum_assignment +from torch import nn + +from otx.algo.common.utils.bbox_overlaps import bbox_overlaps +from otx.algo.object_detection_3d.utils.utils import box_cxcylrtb_to_xyxy + + +class HungarianMatcher3D(nn.Module): + """This class computes an assignment between the targets and the predictions of the network.""" + + def __init__( + self, + cost_class: float = 1.0, + cost_3dcenter: float = 1.0, + cost_bbox: float = 1.0, + cost_giou: float = 1.0, + ): + """Creates the matcher. 
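+
+ The final assignment cost is cost_bbox * L1(2D box) + cost_3dcenter * L1(3D center)
+ + cost_class * focal-style classification cost + cost_giou * (-GIoU), and the matching
+ is solved with scipy.optimize.linear_sum_assignment independently for each query group.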
+ + Args: + cost_class (float): This is the relative weight of the classification error in the matching cost. + cost_3dcenter (float): This is the relative weight of the L1 error of the 3d center in the matching cost. + cost_bbox (float): This is the relative weight of the L1 error of the bbox coordinates in the matching cost. + cost_giou (float): This is the relative weight of the giou loss of the bbox in the matching cost. + """ + super().__init__() + self.cost_class = cost_class + self.cost_3dcenter = cost_3dcenter + self.cost_bbox = cost_bbox + self.cost_giou = cost_giou + + @torch.no_grad() + def forward(self, outputs: dict, targets: list, group_num: int = 11) -> list: + """Performs the matching. + + Args: + outputs: This is a dict that contains at least these entries: + "scores": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits + "boxes_3d": Tensor of dim [batch_size, num_queries, 4] with the predicted 3d box coordinates + targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing: + "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth + objects in the target) containing the class labels + "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates + Returns: + A list of size batch_size, containing tuples of (index_i, index_j) where: + - index_i is the indices of the selected predictions (in order) + - index_j is the indices of the corresponding selected targets (in order) + For each batch element, it holds: + len(index_i) = len(index_j) = min(num_queries, num_target_boxes) + """ + bs, num_queries = outputs["boxes_3d"].shape[:2] + + # We flatten to compute the cost matrices in a batch + + out_prob = outputs["scores"].flatten(0, 1).sigmoid() # [batch_size * num_queries, num_classes] + # Also concat the target labels and boxes + tgt_ids = torch.cat([v["labels"] for v in targets]).long() + + # Compute the classification cost. 
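+ # Focal-style classification cost: for each query/target pair the cost is
+ # alpha * (1 - p)^gamma * (-log p) - (1 - alpha) * p^gamma * (-log(1 - p)),
+ # evaluated at the probability p the query assigns to the target's class.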
+ alpha = 0.25 + gamma = 2.0 + neg_cost_class = (1 - alpha) * (out_prob**gamma) * (-(1 - out_prob + 1e-8).log()) + pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log()) + cost_class = pos_cost_class[:, tgt_ids] - neg_cost_class[:, tgt_ids] + + out_3dcenter = outputs["boxes_3d"][:, :, 0:2].flatten(0, 1) # [batch_size * num_queries, 4] + tgt_3dcenter = torch.cat([v["boxes_3d"][:, 0:2] for v in targets]) + + # Compute the 3dcenter cost between boxes + cost_3dcenter = torch.cdist(out_3dcenter, tgt_3dcenter, p=1) + + out_2dbbox = outputs["boxes_3d"][:, :, 2:6].flatten(0, 1) # [batch_size * num_queries, 4] + tgt_2dbbox = torch.cat([v["boxes_3d"][:, 2:6] for v in targets]) + # Compute the L1 cost between boxes + cost_bbox = torch.cdist(out_2dbbox, tgt_2dbbox, p=1) + + # Compute the giou cost betwen boxes + out_bbox = outputs["boxes_3d"].flatten(0, 1) # [batch_size * num_queries, 4] + tgt_bbox = torch.cat([v["boxes_3d"] for v in targets]) + cost_giou = -bbox_overlaps( + box_cxcylrtb_to_xyxy(out_bbox), + box_cxcylrtb_to_xyxy(tgt_bbox), + mode="giou", + ) + # Final cost matrix + c = ( + self.cost_bbox * cost_bbox + + self.cost_3dcenter * cost_3dcenter + + self.cost_class * cost_class + + self.cost_giou * cost_giou + ) + c = c.view(bs, num_queries, -1).cpu() + + sizes = [len(v["boxes"]) for v in targets] + # indices = [linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))] + indices = [] + g_num_queries = num_queries // group_num + c_list = c.split(g_num_queries, dim=1) + for g_i in range(group_num): + c_g = c_list[g_i] + indices_g = [linear_sum_assignment(c[i]) for i, c in enumerate(c_g.split(sizes, -1))] + if g_i == 0: + indices = indices_g + else: + indices = [ + ( + np.concatenate([indice1[0], indice2[0] + g_num_queries * g_i]), + np.concatenate([indice1[1], indice2[1]]), + ) + for indice1, indice2 in zip(indices, indices_g) + ] + return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices] diff --git a/src/otx/algo/object_detection_3d/monodetr3d.py b/src/otx/algo/object_detection_3d/monodetr3d.py new file mode 100644 index 00000000000..2ea42e52f95 --- /dev/null +++ b/src/otx/algo/object_detection_3d/monodetr3d.py @@ -0,0 +1,249 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""MonoDetr model implementations.""" + +from __future__ import annotations + +from typing import Any + +import numpy as np +import torch +from torch import Tensor +from torchvision.ops import box_convert + +from otx.algo.object_detection_3d.backbones.monodetr_resnet import BackboneBuilder +from otx.algo.object_detection_3d.detectors.monodetr import MonoDETR +from otx.algo.object_detection_3d.heads.depth_predictor import DepthPredictor +from otx.algo.object_detection_3d.heads.depthaware_transformer import DepthAwareTransformerBuilder +from otx.algo.object_detection_3d.losses import MonoDETRCriterion +from otx.algo.object_detection_3d.utils.utils import box_cxcylrtb_to_xyxy +from otx.core.data.entity.base import OTXBatchLossEntity +from otx.core.data.entity.object_detection_3d import Det3DBatchDataEntity, Det3DBatchPredEntity +from otx.core.exporter.base import OTXModelExporter +from otx.core.exporter.detection_3d import OTXObjectDetection3DExporter +from otx.core.model.detection_3d import OTX3DDetectionModel + + +class MonoDETR3D(OTX3DDetectionModel): + """OTX Detection model class for MonoDETR3D.""" + + mean: tuple[float, float, float] = (0.485, 0.456, 0.406) + std: tuple[float, float, float] = 
(0.229, 0.224, 0.225) + input_size: tuple[int, int] = (384, 1280) # HxW + load_from: str | None = None + + def _build_model(self, num_classes: int) -> MonoDETR: + # backbone + backbone = BackboneBuilder(self.model_name) + # transformer + depthaware_transformer = DepthAwareTransformerBuilder(self.model_name) + # depth prediction module + depth_predictor = DepthPredictor(depth_num_bins=80, depth_min=1e-3, depth_max=60.0, hidden_dim=256) + # criterion + loss_weight_dict = { + "loss_ce": 2, + "loss_bbox": 5, + "loss_giou": 2, + "loss_center": 10, + } + criterion = MonoDETRCriterion(num_classes=num_classes, focal_alpha=0.25, weight_dict=loss_weight_dict) + + return MonoDETR( + backbone, + depthaware_transformer, + depth_predictor, + num_classes=num_classes, + criterion=criterion, + num_queries=50, + aux_loss=True, + num_feature_levels=4, + with_box_refine=True, + init_box=False, + ) + + def _customize_inputs( + self, + entity: Det3DBatchDataEntity, + ) -> dict[str, Any]: + # prepare bboxes for the model + targets_list = [] + img_sizes = torch.from_numpy(np.array([img_info.ori_shape for img_info in entity.imgs_info])).to( + device=entity.images.device, + ) + key_list = ["labels", "boxes", "depth", "size_3d", "heading_angle", "boxes_3d"] + for bz in range(len(entity.imgs_info)): + target_dict = {} + for key in key_list: + target_dict[key] = getattr(entity, key)[bz] + targets_list.append(target_dict) + + return { + "images": entity.images, + "calibs": torch.cat([p2.unsqueeze(0) for p2 in entity.calib_matrix], dim=0), + "targets": targets_list, + "img_sizes": img_sizes, + "mode": "loss" if self.training else "predict", + } + + def _customize_outputs( + self, + outputs: dict[str, torch.Tensor], + inputs: Det3DBatchDataEntity, + ) -> Det3DBatchPredEntity | OTXBatchLossEntity: + if self.training: + if not isinstance(outputs, dict): + raise TypeError(outputs) + + losses = OTXBatchLossEntity() + for k, v in outputs.items(): + if isinstance(v, list): + losses[k] = sum(v) + elif isinstance(v, Tensor): + losses[k] = v + else: + msg = "Loss output should be list or torch.tensor but got {type(v)}" + raise TypeError(msg) + return losses + + labels, scores, size_3d, heading_angle, boxes_3d, depth = self.extract_dets_from_outputs(outputs) + # bbox 2d decoding + boxes_2d = box_cxcylrtb_to_xyxy(boxes_3d) + xywh_2d = box_convert(boxes_2d, "xyxy", "cxcywh") + # size 2d decoding + size_2d = xywh_2d[:, :, 2:4] + + return Det3DBatchPredEntity( + batch_size=inputs.batch_size, + images=inputs.images, + imgs_info=inputs.imgs_info, + calib_matrix=inputs.calib_matrix, + boxes=boxes_2d, + labels=labels, + boxes_3d=boxes_3d, + size_2d=size_2d, + size_3d=size_3d, + depth=depth, + heading_angle=heading_angle, + scores=scores, + original_kitti_format=[None], + ) + + def configure_optimizers(self) -> tuple[list[torch.optim.Optimizer], list[dict[str, Any]]]: + """Configure an optimizer and learning-rate schedulers. + + Configure an optimizer and learning-rate schedulers + from the given optimizer and scheduler or scheduler list callable in the constructor. + Generally, there is two lr schedulers. One is for a linear warmup scheduler and + the other is the main scheduler working after the warmup period. + + Returns: + Two list. The former is a list that contains an optimizer + The latter is a list of lr scheduler configs which has a dictionary format. 
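+
+ Note:
+ The optimizer is built from the two parameter groups produced by _apply_no_bias_decay:
+ bias parameters with weight_decay=0 and all remaining weights with weight_decay=1e-4.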
+ """ + param_groups = self._apply_no_bias_decay() + optimizer = self.optimizer_callable(param_groups) + schedulers = self.scheduler_callable(optimizer) + + def ensure_list(item: Any) -> list: # noqa: ANN401 + return item if isinstance(item, list) else [item] + + lr_scheduler_configs = [] + for scheduler in ensure_list(schedulers): + lr_scheduler_config = {"scheduler": scheduler} + if hasattr(scheduler, "interval"): + lr_scheduler_config["interval"] = scheduler.interval + if hasattr(scheduler, "monitor"): + lr_scheduler_config["monitor"] = scheduler.monitor + lr_scheduler_configs.append(lr_scheduler_config) + + return [optimizer], lr_scheduler_configs + + def _apply_no_bias_decay(self) -> list[dict[str, Any]]: + """Apply no bias decay to bias parameters.""" + weights, biases = [], [] + for name, param in self.named_parameters(): + if "bias" in name: + biases += [param] + else: + weights += [param] + + return [{"params": biases, "weight_decay": 0}, {"params": weights, "weight_decay": 0.0001}] + + def forward_for_tracing( + self, + images: torch.Tensor, + calib_matrix: torch.Tensor, + img_sizes: torch.Tensor, + ) -> dict[str, torch.Tensor]: + """Model forward function used for the model tracing during model exportation.""" + return self.model(images=images, calibs=calib_matrix, img_sizes=img_sizes, mode="export") + + @staticmethod + def extract_dets_from_outputs(outputs: dict[str, torch.Tensor], topk: int = 50) -> tuple[torch.Tensor, ...]: + """Extract detection results from model outputs.""" + # b, q, c + out_logits = outputs["scores"] + out_bbox = outputs["boxes_3d"] + + prob = out_logits.sigmoid() + topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), topk, dim=1) + + # final scores + scores = topk_values + # final indexes + topk_boxes = (topk_indexes // out_logits.shape[2]).unsqueeze(-1) + # final labels + labels = topk_indexes % out_logits.shape[2] + + heading = outputs["heading_angle"] + size_3d = outputs["size_3d"] + depth = outputs["depth"] + # decode boxes + boxes_3d = torch.gather(out_bbox, 1, topk_boxes.repeat(1, 1, 6)) # b, q', 4 + # heading angle decoding + heading = torch.gather(heading, 1, topk_boxes.repeat(1, 1, 24)) + # depth decoding + depth = torch.gather(depth, 1, topk_boxes.repeat(1, 1, 2)) + # 3d dims decoding + size_3d = torch.gather(size_3d, 1, topk_boxes.repeat(1, 1, 3)) + # 2d boxes of the corners decoding + + return labels, scores, size_3d, heading, boxes_3d, depth + + @property + def _exporter(self) -> OTXModelExporter: + """Creates OTXModelExporter object that can export the model.""" + if self.input_size is None: + msg = f"Input size attribute is not set for {self.__class__}" + raise ValueError(msg) + + return OTXObjectDetection3DExporter( + task_level_export_parameters=self._export_parameters, + input_size=(1, 3, *self.input_size), + mean=self.mean, + std=self.std, + resize_mode="standard", + swap_rgb=False, + via_onnx=False, + onnx_export_configuration={ + "input_names": ["images", "calib_matrix", "img_sizes"], + "dynamic_axes": { + "images": {0: "batch"}, + "boxes_3d": {0: "batch", 1: "num_dets"}, + "scores": {0: "batch", 1: "num_dets"}, + "heading_angle": {0: "batch", 1: "num_dets"}, + "depth": {0: "batch", 1: "num_dets"}, + "size_3d": {0: "batch", 1: "num_dets"}, + }, + "autograd_inlining": False, + "opset_version": 16, + }, + input_names=["images", "calib_matrix", "img_sizes"], + output_names=["scores", "boxes_3d", "size_3d", "heading_angle", "depth"], + ) + + @property + def _optimization_config(self) -> dict[str, Any]: + """PTQ 
config for MonoDETR.""" + return {"model_type": "transformer"} diff --git a/src/otx/algo/object_detection_3d/utils/__init__.py b/src/otx/algo/object_detection_3d/utils/__init__.py new file mode 100644 index 00000000000..c951fff3de8 --- /dev/null +++ b/src/otx/algo/object_detection_3d/utils/__init__.py @@ -0,0 +1,4 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""utils module for object detection 3D models.""" diff --git a/src/otx/algo/object_detection_3d/utils/utils.py b/src/otx/algo/object_detection_3d/utils/utils.py new file mode 100644 index 00000000000..6f9c009d697 --- /dev/null +++ b/src/otx/algo/object_detection_3d/utils/utils.py @@ -0,0 +1,66 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""utils for object detection 3D models.""" +from __future__ import annotations + +import copy + +import torch +from torch import Tensor, nn + + +# TODO(Kirill): try to remove this class +class NestedTensor: + """Nested tensor class for object detection 3D models.""" + + def __init__(self, tensors: Tensor, mask: Tensor) -> None: + """Initialize a NestedTensor object. + + Args: + tensors (Tensor): The tensors representing the nested structure. + mask (Tensor): The mask indicating the valid elements in the tensors. + """ + self.tensors = tensors + self.mask = mask + + def to(self, device: torch.device) -> NestedTensor: + """Move the NestedTensor object to the specified device. + + Args: + device: The device to move the tensors to. + + Returns: + NestedTensor: The NestedTensor object with tensors moved to the specified device. + """ + cast_tensor = self.tensors.to(device) + cast_mask = self.mask.to(device) if self.mask is not None else None + return NestedTensor(cast_tensor, cast_mask) + + def decompose(self) -> tuple[Tensor, Tensor]: + """Decompose the NestedTensor object into its constituent tensors and masks.""" + return self.tensors, self.mask + + def __repr__(self) -> str: + """Return a string representation of the NestedTensor object.""" + return str(self.tensors) + + +def box_cxcylrtb_to_xyxy(x: Tensor) -> Tensor: + """Transform bbox from cxcylrtb to xyxy representation.""" + x_c, y_c, k, r, t, b = x.unbind(-1) + bb = [(x_c - k), (y_c - t), (x_c + r), (y_c + b)] + return torch.stack(bb, dim=-1) + + +def get_clones(module: nn.Module, n: int) -> nn.ModuleList: + """Create a list of cloned modules. + + Args: + module (nn.Module): The module to be cloned. + N (int): The number of clones to create. + + Returns: + nn.ModuleList: The list of cloned modules. 
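+
+ Example:
+ >>> proj = nn.Linear(8, 8)
+ >>> blocks = get_clones(proj, 3) # three deep copies with independent (non-shared) parameters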
+ """ + return nn.ModuleList([copy.deepcopy(module) for _ in range(n)]) diff --git a/src/otx/core/data/dataset/object_detection_3d.py b/src/otx/core/data/dataset/object_detection_3d.py new file mode 100644 index 00000000000..7e7f294c58b --- /dev/null +++ b/src/otx/core/data/dataset/object_detection_3d.py @@ -0,0 +1,307 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""Module for OTX3DObjectDetectionDataset.""" + +# mypy: ignore-errors + +from __future__ import annotations + +from copy import deepcopy +from functools import partial +from typing import TYPE_CHECKING, Any, Callable, List, Union + +import numpy as np +import torch +from datumaro import Image +from PIL import Image as PILImage +from torchvision import tv_tensors + +from otx.core.data.dataset.utils.kitti_utils import Calibration, affine_transform, angle2class, get_affine_transform +from otx.core.data.entity.base import ImageInfo +from otx.core.data.entity.object_detection_3d import Det3DBatchDataEntity, Det3DDataEntity +from otx.core.data.mem_cache import NULL_MEM_CACHE_HANDLER, MemCacheHandlerBase +from otx.core.data.transform_libs.torchvision import Compose +from otx.core.types.image import ImageColorChannel + +from .base import OTXDataset + +if TYPE_CHECKING: + from datumaro import Bbox, DatasetSubset + + +Transforms = Union[Compose, Callable, List[Callable], dict[str, Compose | Callable | List[Callable]]] + + +class OTX3DObjectDetectionDataset(OTXDataset[Det3DDataEntity]): + """OTXDataset class for detection task.""" + + def __init__( + self, + dm_subset: DatasetSubset, + transforms: Transforms, + mem_cache_handler: MemCacheHandlerBase = NULL_MEM_CACHE_HANDLER, + mem_cache_img_max_size: tuple[int, int] | None = None, + max_refetch: int = 1000, + image_color_channel: ImageColorChannel = ImageColorChannel.RGB, + stack_images: bool = True, + to_tv_image: bool = True, + max_objects: int = 50, + depth_threshold: int = 65, + resolution: tuple[int, int] = (1280, 384), # (W, H) + ) -> None: + super().__init__( + dm_subset, + transforms, + mem_cache_handler, + mem_cache_img_max_size, + max_refetch, + image_color_channel, + stack_images, + to_tv_image, + ) + self.max_objects = max_objects + self.depth_threshold = depth_threshold + self.resolution = np.array(resolution) # TODO(Kirill): make it configurable + self.subset_type = list(self.dm_subset.get_subset_info())[-1].split(":")[0] + + def _get_item_impl(self, index: int) -> Det3DDataEntity | None: + entity = self.dm_subset[index] + image = entity.media_as(Image) + image = self._get_img_data_and_shape(image)[0] + calib = Calibration(entity.attributes["calib_path"]) + original_kitti_format = None # don't use for training + if self.subset_type != "train": + # TODO (Kirill): remove this or duplication of the inputs + annotations_copy = deepcopy(entity.annotations) + original_kitti_format = [obj.attributes for obj in annotations_copy] + # decode original kitti format for metric calculation + for i, anno_dict in enumerate(original_kitti_format): + anno_dict["name"] = self.label_info.label_names[annotations_copy[i].label] + anno_dict["bbox"] = annotations_copy[i].points + dimension = anno_dict["dimensions"] + anno_dict["dimensions"] = [dimension[2], dimension[0], dimension[1]] + original_kitti_format = self._reformate_for_kitti_metric(original_kitti_format) + # decode labels for training + inputs, targets, ori_img_shape = self._decode_item( + PILImage.fromarray(image), + entity.annotations, + calib, + ) + # normilize image + inputs = 
self._apply_transforms(torch.as_tensor(inputs, dtype=torch.float32)) + return Det3DDataEntity( + image=inputs, + img_info=ImageInfo( + img_idx=index, + img_shape=inputs.shape[1:], + ori_shape=ori_img_shape, # TODO(Kirill): curently we use WxH here, make it HxW + image_color_channel=self.image_color_channel, + ignored_labels=[], + ), + boxes=tv_tensors.BoundingBoxes( + targets["boxes"], + format=tv_tensors.BoundingBoxFormat.XYXY, + canvas_size=inputs.shape[1:], + dtype=torch.float32, + ), + labels=torch.as_tensor(targets["labels"], dtype=torch.long), + calib_matrix=torch.as_tensor(calib.P2, dtype=torch.float32), + boxes_3d=torch.as_tensor(targets["boxes_3d"], dtype=torch.float32), + size_2d=torch.as_tensor(targets["size_2d"], dtype=torch.float32), + size_3d=torch.as_tensor(targets["size_3d"], dtype=torch.float32), + depth=torch.as_tensor(targets["depth"], dtype=torch.float32), + heading_angle=torch.as_tensor( + np.concatenate([targets["heading_bin"], targets["heading_res"]], axis=1), + dtype=torch.float32, + ), + original_kitti_format=original_kitti_format, + ) + + @property + def collate_fn(self) -> Callable: + """Collection function to collect DetDataEntity into DetBatchDataEntity in data loader.""" + return partial(Det3DBatchDataEntity.collate_fn, stack_images=self.stack_images) + + def _decode_item(self, img: PILImage, annotations: list[Bbox], calib: Calibration) -> tuple: # noqa: C901 + """Decode item for training.""" + # data augmentation for image + img_size = np.array(img.size) + bbox2d = np.array([ann.points for ann in annotations]) + center = img_size / 2 + crop_size, crop_scale = img_size, 1 + random_flip_flag = False + # TODO(Kirill): add data augmentation for 3d, remove them from here. + if self.subset_type == "train": + if np.random.random() < 0.5: + random_flip_flag = True + img = img.transpose(PILImage.FLIP_LEFT_RIGHT) + + if np.random.random() < 0.5: + scale = 0.05 + shift = 0.05 + crop_scale = np.clip(np.random.randn() * scale + 1, 1 - scale, 1 + scale) + crop_size = img_size * crop_scale + center[0] += img_size[0] * np.clip(np.random.randn() * shift, -2 * shift, 2 * shift) + center[1] += img_size[1] * np.clip(np.random.randn() * shift, -2 * shift, 2 * shift) + + # add affine transformation for 2d images. 
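+ # PIL's AFFINE transform expects the output-to-input mapping, so trans_inv is passed to
+ # img.transform below, while the forward matrix trans is reused later to warp the 2D
+ # boxes and projected 3D centers into the same network resolution.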
+ trans, trans_inv = get_affine_transform(center, crop_size, 0, self.resolution, inv=1) + img = img.transform( + tuple(self.resolution.tolist()), + method=PILImage.AFFINE, + data=tuple(trans_inv.reshape(-1).tolist()), + resample=PILImage.BILINEAR, + ) + img = np.array(img).astype(np.float32) + img = img.transpose(2, 0, 1) # C * H * W -> (384 * 1280) + # ============================ get labels ============================== + # data augmentation for labels + annotations_list: list[dict[str, Any]] = [ann.attributes for ann in annotations] + for i, obj in enumerate(annotations_list): + obj["label"] = annotations[i].label + obj["location"] = np.array(obj["location"]) + + if random_flip_flag: + for i in range(bbox2d.shape[0]): + [x1, _, x2, _] = bbox2d[i] + bbox2d[i][0], bbox2d[i][2] = img_size[0] - x2, img_size[0] - x1 + annotations_list[i]["alpha"] = np.pi - annotations_list[i]["alpha"] + annotations_list[i]["rotation_y"] = np.pi - annotations_list[i]["rotation_y"] + if annotations_list[i]["alpha"] > np.pi: + annotations_list[i]["alpha"] -= 2 * np.pi # check range + if annotations_list[i]["alpha"] < -np.pi: + annotations_list[i]["alpha"] += 2 * np.pi + if annotations_list[i]["rotation_y"] > np.pi: + annotations_list[i]["rotation_y"] -= 2 * np.pi + if annotations_list[i]["rotation_y"] < -np.pi: + annotations_list[i]["rotation_y"] += 2 * np.pi + + # labels encoding + mask_2d = np.zeros((self.max_objects), dtype=bool) + labels = np.zeros((self.max_objects), dtype=np.int8) + depth = np.zeros((self.max_objects, 1), dtype=np.float32) + heading_bin = np.zeros((self.max_objects, 1), dtype=np.int64) + heading_res = np.zeros((self.max_objects, 1), dtype=np.float32) + size_2d = np.zeros((self.max_objects, 2), dtype=np.float32) + size_3d = np.zeros((self.max_objects, 3), dtype=np.float32) + src_size_3d = np.zeros((self.max_objects, 3), dtype=np.float32) + boxes = np.zeros((self.max_objects, 4), dtype=np.float32) + boxes_3d = np.zeros((self.max_objects, 6), dtype=np.float32) + + object_num = len(annotations) if len(annotations) < self.max_objects else self.max_objects + for i in range(object_num): + cur_obj = annotations_list[i] + # ignore the samples beyond the threshold [hard encoding] + if cur_obj["location"][-1] > self.depth_threshold and cur_obj["location"][-1] < 2: + continue + + # process 2d bbox & get 2d center + bbox_2d = bbox2d[i].copy() + + # add affine transformation for 2d boxes. 
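+ # Warp both box corners with the same forward matrix applied to the image so the 2D box
+ # stays aligned with the resampled image (coordinates are now in network-resolution pixels).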
+ bbox_2d[:2] = affine_transform(bbox_2d[:2], trans) + bbox_2d[2:] = affine_transform(bbox_2d[2:], trans) + + # process 3d center + center_2d = np.array( + [(bbox_2d[0] + bbox_2d[2]) / 2, (bbox_2d[1] + bbox_2d[3]) / 2], + dtype=np.float32, + ) # W * H + corner_2d = bbox_2d.copy() + + center_3d = np.array( + cur_obj["location"] + + [ + 0, + -cur_obj["dimensions"][0] / 2, + 0, + ], + ) # real 3D center in 3D space + center_3d = center_3d.reshape(-1, 3) # shape adjustment (N, 3) + center_3d, _ = calib.rect_to_img(center_3d) # project 3D center to image plane + center_3d = center_3d[0] # shape adjustment + if random_flip_flag: # random flip for center3d + center_3d[0] = img_size[0] - center_3d[0] + center_3d = affine_transform(center_3d.reshape(-1), trans) + + # filter 3d center out of img + proj_inside_img = True + + if center_3d[0] < 0 or center_3d[0] >= self.resolution[0]: + proj_inside_img = False + if center_3d[1] < 0 or center_3d[1] >= self.resolution[1]: + proj_inside_img = False + + if not proj_inside_img: + continue + + # class + labels[i] = cur_obj["label"] + + # encoding 2d/3d boxes + w, h = bbox_2d[2] - bbox_2d[0], bbox_2d[3] - bbox_2d[1] + size_2d[i] = 1.0 * w, 1.0 * h + + center_2d_norm = center_2d / self.resolution + size_2d_norm = size_2d[i] / self.resolution + + corner_2d_norm = corner_2d + corner_2d_norm[0:2] = corner_2d[0:2] / self.resolution + corner_2d_norm[2:4] = corner_2d[2:4] / self.resolution + center_3d_norm = center_3d / self.resolution + + k, r = center_3d_norm[0] - corner_2d_norm[0], corner_2d_norm[2] - center_3d_norm[0] + t, b = center_3d_norm[1] - corner_2d_norm[1], corner_2d_norm[3] - center_3d_norm[1] + + if k < 0 or r < 0 or t < 0 or b < 0: + continue + + boxes[i] = center_2d_norm[0], center_2d_norm[1], size_2d_norm[0], size_2d_norm[1] + boxes_3d[i] = center_3d_norm[0], center_3d_norm[1], k, r, t, b + + # encoding depth + depth[i] = cur_obj["location"][-1] * crop_scale + + # encoding heading angle + heading_angle = calib.ry2alpha(cur_obj["rotation_y"], (bbox2d[i][0] + bbox2d[i][2]) / 2) + if heading_angle > np.pi: + heading_angle -= 2 * np.pi # check range + if heading_angle < -np.pi: + heading_angle += 2 * np.pi + heading_bin[i], heading_res[i] = angle2class(heading_angle) + + # encoding size_3d + src_size_3d[i] = np.array([cur_obj["dimensions"]], dtype=np.float32) + size_3d[i] = src_size_3d[i] + + # filter out the samples with truncated or occluded + if cur_obj["truncated"] <= 0.5 and cur_obj["occluded"] <= 2: + mask_2d[i] = 1 + + # collect return data + targets_for_train = { + "labels": labels[mask_2d], + "boxes": boxes[mask_2d], + "boxes_3d": boxes_3d[mask_2d], + "depth": depth[mask_2d], + "size_2d": size_2d[mask_2d], + "size_3d": size_3d[mask_2d], + "heading_bin": heading_bin[mask_2d], + "heading_res": heading_res[mask_2d], + } + + return img, targets_for_train, img_size + + def _reformate_for_kitti_metric(self, annotations: dict[str, Any]) -> dict[str, np.array]: + """Reformat the annotation for KITTI metric.""" + return { + "name": np.array([obj["name"] for obj in annotations]), + "alpha": np.array([obj["alpha"] for obj in annotations]), + "bbox": np.array([obj["bbox"] for obj in annotations]).reshape(-1, 4), + "dimensions": np.array([obj["dimensions"] for obj in annotations]).reshape(-1, 3), + "location": np.array([obj["location"] for obj in annotations]).reshape(-1, 3), + "rotation_y": np.array([obj["rotation_y"] for obj in annotations]), + "occluded": np.array([obj["occluded"] for obj in annotations]), + "truncated": np.array([obj["truncated"] for 
obj in annotations]), + } diff --git a/src/otx/core/data/dataset/segmentation.py b/src/otx/core/data/dataset/segmentation.py index 363a15e84cc..ee23be6090e 100644 --- a/src/otx/core/data/dataset/segmentation.py +++ b/src/otx/core/data/dataset/segmentation.py @@ -14,7 +14,6 @@ from datumaro.components.annotation import Ellipse, Image, Mask, Polygon from torchvision import tv_tensors -from otx.core.data.dataset.base import Transforms from otx.core.data.entity.base import ImageInfo from otx.core.data.entity.segmentation import SegBatchDataEntity, SegDataEntity from otx.core.data.mem_cache import NULL_MEM_CACHE_HANDLER, MemCacheHandlerBase @@ -27,6 +26,8 @@ from datumaro import Dataset as DmDataset from datumaro import DatasetItem + from otx.core.data.dataset.base import Transforms + # NOTE: It is copied from https://github.com/openvinotoolkit/datumaro/pull/1409 # It will be replaced in the future. diff --git a/src/otx/core/data/dataset/utils/__init__.py b/src/otx/core/data/dataset/utils/__init__.py new file mode 100644 index 00000000000..0c75fd7a904 --- /dev/null +++ b/src/otx/core/data/dataset/utils/__init__.py @@ -0,0 +1,4 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""Module defines utils for OTXDatasets.""" diff --git a/src/otx/core/data/dataset/utils/kitti_utils.py b/src/otx/core/data/dataset/utils/kitti_utils.py new file mode 100644 index 00000000000..1ee16c41733 --- /dev/null +++ b/src/otx/core/data/dataset/utils/kitti_utils.py @@ -0,0 +1,299 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""Module defines utils for KITTI Dataset.""" + +# flake8: noqa +# mypy: ignore-errors + +import cv2 +import numpy as np + + +def get_calib_from_file(calib_file): + with open(calib_file) as f: + lines = f.readlines() + + obj = lines[2].strip().split(" ")[1:] + P2 = np.array(obj, dtype=np.float32) + obj = lines[3].strip().split(" ")[1:] + P3 = np.array(obj, dtype=np.float32) + obj = lines[4].strip().split(" ")[1:] + R0 = np.array(obj, dtype=np.float32) + obj = lines[5].strip().split(" ")[1:] + Tr_velo_to_cam = np.array(obj, dtype=np.float32) + + return { + "P2": P2.reshape(3, 4), + "P3": P3.reshape(3, 4), + "R0": R0.reshape(3, 3), + "Tr_velo2cam": Tr_velo_to_cam.reshape(3, 4), + } + + +class Calibration: + def __init__(self, calib_file): + if isinstance(calib_file, str): + calib = get_calib_from_file(calib_file) + else: + calib = calib_file + + self.P2 = calib["P2"] # 3 x 4 + self.R0 = calib["R0"] # 3 x 3 + self.V2C = calib["Tr_velo2cam"] # 3 x 4 + self.C2V = self.inverse_rigid_trans(self.V2C) + + # Camera intrinsics and extrinsics + self.cu = self.P2[0, 2] + self.cv = self.P2[1, 2] + self.fu = self.P2[0, 0] + self.fv = self.P2[1, 1] + self.tx = self.P2[0, 3] / (-self.fu) + self.ty = self.P2[1, 3] / (-self.fv) + + def cart_to_hom(self, pts): + """:param pts: (N, 3 or 2) + :return pts_hom: (N, 4 or 3) + """ + pts_hom = np.hstack((pts, np.ones((pts.shape[0], 1), dtype=np.float32))) + return pts_hom + + def lidar_to_rect(self, pts_lidar): + """:param pts_lidar: (N, 3) + :return pts_rect: (N, 3) + """ + pts_lidar_hom = self.cart_to_hom(pts_lidar) + pts_rect = np.dot(pts_lidar_hom, np.dot(self.V2C.T, self.R0.T)) + # pts_rect = reduce(np.dot, (pts_lidar_hom, self.V2C.T, self.R0.T)) + return pts_rect + + def rect_to_lidar(self, pts_rect): + pts_ref = np.transpose(np.dot(np.linalg.inv(self.R0), np.transpose(pts_rect))) + pts_ref = self.cart_to_hom(pts_ref) # nx4 + return np.dot(pts_ref, np.transpose(self.C2V)) + + def 
rect_to_img(self, pts_rect): + """:param pts_rect: (N, 3) + :return pts_img: (N, 2) + """ + pts_rect_hom = self.cart_to_hom(pts_rect) + pts_2d_hom = np.dot(pts_rect_hom, self.P2.T) + pts_img = (pts_2d_hom[:, 0:2].T / pts_rect_hom[:, 2]).T # (N, 2) + pts_rect_depth = pts_2d_hom[:, 2] - self.P2.T[3, 2] # depth in rect camera coord + return pts_img, pts_rect_depth + + def lidar_to_img(self, pts_lidar): + """:param pts_lidar: (N, 3) + :return pts_img: (N, 2) + """ + pts_rect = self.lidar_to_rect(pts_lidar) + pts_img, pts_depth = self.rect_to_img(pts_rect) + return pts_img, pts_depth + + def img_to_rect(self, u, v, depth_rect): + """:param u: (N) + :param v: (N) + :param depth_rect: (N) + :return: + """ + x = ((u - self.cu) * depth_rect) / self.fu + self.tx + y = ((v - self.cv) * depth_rect) / self.fv + self.ty + pts_rect = np.concatenate((x.reshape(-1, 1), y.reshape(-1, 1), depth_rect.reshape(-1, 1)), axis=1) + return pts_rect + + def depthmap_to_rect(self, depth_map): + """:param depth_map: (H, W), depth_map + :return: + """ + x_range = np.arange(0, depth_map.shape[1]) + y_range = np.arange(0, depth_map.shape[0]) + x_idxs, y_idxs = np.meshgrid(x_range, y_range) + x_idxs, y_idxs = x_idxs.reshape(-1), y_idxs.reshape(-1) + depth = depth_map[y_idxs, x_idxs] + pts_rect = self.img_to_rect(x_idxs, y_idxs, depth) + return pts_rect, x_idxs, y_idxs + + def corners3d_to_img_boxes(self, corners3d): + """:param corners3d: (N, 8, 3) corners in rect coordinate + :return: boxes: (None, 4) [x1, y1, x2, y2] in rgb coordinate + :return: boxes_corner: (None, 8) [xi, yi] in rgb coordinate + """ + sample_num = corners3d.shape[0] + corners3d_hom = np.concatenate((corners3d, np.ones((sample_num, 8, 1))), axis=2) # (N, 8, 4) + + img_pts = np.matmul(corners3d_hom, self.P2.T) # (N, 8, 3) + + x, y = img_pts[:, :, 0] / img_pts[:, :, 2], img_pts[:, :, 1] / img_pts[:, :, 2] + x1, y1 = np.min(x, axis=1), np.min(y, axis=1) + x2, y2 = np.max(x, axis=1), np.max(y, axis=1) + + boxes = np.concatenate((x1.reshape(-1, 1), y1.reshape(-1, 1), x2.reshape(-1, 1), y2.reshape(-1, 1)), axis=1) + boxes_corner = np.concatenate((x.reshape(-1, 8, 1), y.reshape(-1, 8, 1)), axis=2) + + return boxes, boxes_corner + + def camera_dis_to_rect(self, u, v, d): + """Can only process valid u, v, d, which means u, v can not beyond the image shape, reprojection error 0.02 + :param u: (N) + :param v: (N) + :param d: (N), the distance between camera and 3d points, d^2 = x^2 + y^2 + z^2 + :return: + """ + assert self.fu == self.fv, "%.8f != %.8f" % (self.fu, self.fv) + fd = np.sqrt((u - self.cu) ** 2 + (v - self.cv) ** 2 + self.fu**2) + x = ((u - self.cu) * d) / fd + self.tx + y = ((v - self.cv) * d) / fd + self.ty + z = np.sqrt(d**2 - x**2 - y**2) + pts_rect = np.concatenate((x.reshape(-1, 1), y.reshape(-1, 1), z.reshape(-1, 1)), axis=1) + return pts_rect + + def inverse_rigid_trans(self, Tr): + """Inverse a rigid body transform matrix (3x4 as [R|t]) + [R'|-R't; 0|1] + """ + inv_Tr = np.zeros_like(Tr) # 3x4 + inv_Tr[0:3, 0:3] = np.transpose(Tr[0:3, 0:3]) + inv_Tr[0:3, 3] = np.dot(-np.transpose(Tr[0:3, 0:3]), Tr[0:3, 3]) + return inv_Tr + + def alpha2ry(self, alpha, u): + """Get rotation_y by alpha + theta - 180 + alpha : Observation angle of object, ranging [-pi..pi] + x : Object center x to the camera center (x-W/2), in pixels + rotation_y : Rotation ry around Y-axis in camera coordinates [-pi..pi] + """ + ry = alpha + np.arctan2(u - self.cu, self.fu) + + if ry > np.pi: + ry -= 2 * np.pi + if ry < -np.pi: + ry += 2 * np.pi + + return ry + + def 
ry2alpha(self, ry, u): + alpha = ry - np.arctan2(u - self.cu, self.fu) + + if alpha > np.pi: + alpha -= 2 * np.pi + if alpha < -np.pi: + alpha += 2 * np.pi + + return alpha + + def flip(self, img_size): + wsize = 4 + hsize = 2 + p2ds = ( + np.concatenate( + [ + np.expand_dims(np.tile(np.expand_dims(np.linspace(0, img_size[0], wsize), 0), [hsize, 1]), -1), + np.expand_dims(np.tile(np.expand_dims(np.linspace(0, img_size[1], hsize), 1), [1, wsize]), -1), + np.linspace(2, 78, wsize * hsize).reshape(hsize, wsize, 1), + ], + -1, + ) + ).reshape(-1, 3) + p3ds = self.img_to_rect(p2ds[:, 0:1], p2ds[:, 1:2], p2ds[:, 2:3]) + p3ds[:, 0] *= -1 + p2ds[:, 0] = img_size[0] - p2ds[:, 0] + + # self.P2[0,3] *= -1 + cos_matrix = np.zeros([wsize * hsize, 2, 7]) + cos_matrix[:, 0, 0] = p3ds[:, 0] + cos_matrix[:, 0, 1] = cos_matrix[:, 1, 2] = p3ds[:, 2] + cos_matrix[:, 1, 0] = p3ds[:, 1] + cos_matrix[:, 0, 3] = cos_matrix[:, 1, 4] = 1 + cos_matrix[:, :, -2] = -p2ds[:, :2] + cos_matrix[:, :, -1] = -p2ds[:, :2] * p3ds[:, 2:3] + new_calib = np.linalg.svd(cos_matrix.reshape(-1, 7))[-1][-1] + new_calib /= new_calib[-1] + + new_calib_matrix = np.zeros([4, 3]).astype(np.float32) + new_calib_matrix[0, 0] = new_calib_matrix[1, 1] = new_calib[0] + new_calib_matrix[2, 0:2] = new_calib[1:3] + new_calib_matrix[3, :] = new_calib[3:6] + new_calib_matrix[-1, -1] = self.P2[-1, -1] + self.P2 = new_calib_matrix.T + self.cu = self.P2[0, 2] + self.cv = self.P2[1, 2] + self.fu = self.P2[0, 0] + self.fv = self.P2[1, 1] + self.tx = self.P2[0, 3] / (-self.fu) + self.ty = self.P2[1, 3] / (-self.fv) + + +def get_dir(src_point, rot_rad): + sn, cs = np.sin(rot_rad), np.cos(rot_rad) + + src_result = [0, 0] + src_result[0] = src_point[0] * cs - src_point[1] * sn + src_result[1] = src_point[0] * sn + src_point[1] * cs + + return src_result + + +def get_3rd_point(a, b): + direct = a - b + return b + np.array([-direct[1], direct[0]], dtype=np.float32) + + +def get_affine_transform(center, scale, rot, output_size, shift=np.array([0, 0], dtype=np.float32), inv=0): + if not isinstance(scale, np.ndarray) and not isinstance(scale, list): + scale = np.array([scale, scale], dtype=np.float32) + + scale_tmp = scale + src_w = scale_tmp[0] + dst_w = output_size[0] + dst_h = output_size[1] + + rot_rad = np.pi * rot / 180 + src_dir = get_dir([0, src_w * -0.5], rot_rad) + dst_dir = np.array([0, dst_w * -0.5], np.float32) + + src = np.zeros((3, 2), dtype=np.float32) + dst = np.zeros((3, 2), dtype=np.float32) + src[0, :] = center + scale_tmp * shift + src[1, :] = center + src_dir + scale_tmp * shift + dst[0, :] = [dst_w * 0.5, dst_h * 0.5] + dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5], np.float32) + dst_dir + + src[2:, :] = get_3rd_point(src[0, :], src[1, :]) + dst[2:, :] = get_3rd_point(dst[0, :], dst[1, :]) + + if inv: + trans = cv2.getAffineTransform(np.float32(src), np.float32(dst)) + trans_inv = cv2.getAffineTransform(np.float32(dst), np.float32(src)) + return trans, trans_inv + else: + trans = cv2.getAffineTransform(np.float32(src), np.float32(dst)) + return trans + + +def affine_transform(pt, t): + new_pt = np.array([pt[0], pt[1], 1.0], dtype=np.float32).T + new_pt = np.dot(t, new_pt) + return new_pt[:2] + + +def angle2class(angle): + """Convert continuous angle to discrete class and residual.""" + num_heading_bin = 12 + angle = angle % (2 * np.pi) + assert angle >= 0 and angle <= 2 * np.pi + angle_per_class = 2 * np.pi / float(num_heading_bin) + shifted_angle = (angle + angle_per_class / 2) % (2 * np.pi) + class_id = int(shifted_angle / 
angle_per_class) + residual_angle = shifted_angle - (class_id * angle_per_class + angle_per_class / 2) + return class_id, residual_angle + + +def class2angle(cls: int, residual: float, to_label_format: bool = False) -> float: + """Inverse function to angle2class.""" + num_heading_bin = 12 + angle_per_class = 2 * np.pi / float(num_heading_bin) + angle_center = cls * angle_per_class + angle = angle_center + residual + if to_label_format and angle > np.pi: + angle = angle - 2 * np.pi + return angle diff --git a/src/otx/core/data/entity/object_detection_3d.py b/src/otx/core/data/entity/object_detection_3d.py new file mode 100644 index 00000000000..564ea283a60 --- /dev/null +++ b/src/otx/core/data/entity/object_detection_3d.py @@ -0,0 +1,148 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""Module for OTX detection data entities.""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any + +from torchvision import tv_tensors + +from otx.core.data.entity.base import ( + OTXBatchDataEntity, + OTXBatchPredEntity, + OTXDataEntity, + OTXPredEntity, +) +from otx.core.data.entity.utils import register_pytree_node +from otx.core.types.task import OTXTaskType + +if TYPE_CHECKING: + from torch import LongTensor, Tensor + + +@register_pytree_node +@dataclass +class Det3DDataEntity(OTXDataEntity): + """Data entity for detection task. + + :param bboxes: Bbox annotations as top-left-bottom-right + (x1, y1, x2, y2) format with absolute coordinate values + :param labels: Bbox labels as integer indices + """ + + @property + def task(self) -> OTXTaskType: + """OTX Task type definition.""" + return OTXTaskType.OBJECT_DETECTION_3D + + boxes: tv_tensors.BoundingBoxes + calib_matrix: Tensor + boxes_3d: Tensor + size_2d: Tensor + size_3d: Tensor + depth: Tensor + heading_angle: Tensor + labels: LongTensor + original_kitti_format: list[dict[str, Any]] | None + + +@dataclass +class Det3DPredEntity(OTXPredEntity, Det3DDataEntity): + """Data entity to represent the detection model output prediction.""" + + +@dataclass +class Det3DBatchDataEntity(OTXBatchDataEntity[Det3DDataEntity]): + """Data entity for detection task. + + :param bboxes: A list of bbox annotations as top-left-bottom-right + (x1, y1, x2, y2) format with absolute coordinate values + :param labels: A list of bbox labels as integer indices + """ # TODO(Kirill): UPDATE! + + images: Tensor + boxes: list[tv_tensors.BoundingBoxes] + calib_matrix: list[Tensor] + boxes_3d: list[Tensor] + size_2d: list[Tensor] + size_3d: list[Tensor] + depth: list[Tensor] + heading_angle: list[Tensor] + labels: list[LongTensor] + original_kitti_format: list[list[dict[str, Any]] | None] + + @property + def task(self) -> OTXTaskType: + """OTX Task type definition.""" + return OTXTaskType.OBJECT_DETECTION_3D + + @classmethod + def collate_fn( + cls, + entities: list[Det3DDataEntity], + stack_images: bool = True, + ) -> Det3DBatchDataEntity: + """Collection function to collect `DetDataEntity` into `DetBatchDataEntity` in data loader. + + Args: + entities: List of `DetDataEntity`. + stack_images: If True, return 4D B x C x H x W image tensor. + Otherwise return a list of 3D C x H x W image tensor. 
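+
+        Note:
+            Only the images are stacked into a single batch tensor; boxes,
+            labels, calibration matrices and the remaining 3D targets are
+            kept as per-sample lists because the number of objects differs
+            between images.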
+ + Returns: + Collated `DetBatchDataEntity` + """ + batch_data = super().collate_fn(entities, stack_images=stack_images) + batch_input_shape = tuple(batch_data.images[0].size()[-2:]) + for info in batch_data.imgs_info: + info.batch_input_shape = batch_input_shape + return Det3DBatchDataEntity( + batch_size=batch_data.batch_size, + images=batch_data.images, + imgs_info=batch_data.imgs_info, + boxes=[entity.boxes for entity in entities], + labels=[entity.labels for entity in entities], + calib_matrix=[entity.calib_matrix for entity in entities], + boxes_3d=[entity.boxes_3d for entity in entities], + size_2d=[entity.size_2d for entity in entities], + size_3d=[entity.size_3d for entity in entities], + depth=[entity.depth for entity in entities], + heading_angle=[entity.heading_angle for entity in entities], + original_kitti_format=[entity.original_kitti_format for entity in entities], + ) + + def pin_memory(self) -> Det3DBatchDataEntity: + """Pin memory for member tensor variables.""" + return ( + super() + .pin_memory() + .wrap( + boxes=[tv_tensors.wrap(bbox.pin_memory(), like=bbox) for bbox in self.boxes], + labels=[label.pin_memory() for label in self.labels], + calib_matrix=[calib_matrix.pin_memory() for calib_matrix in self.calib_matrix], + boxes_3d=[boxes_3d.pin_memory() for boxes_3d in self.boxes_3d], + size_2d=[size_2d.pin_memory() for size_2d in self.size_2d], + size_3d=[size_3d.pin_memory() for size_3d in self.size_3d], + depth=[depth.pin_memory() for depth in self.depth], + heading_angle=[heading_angle.pin_memory() for heading_angle in self.heading_angle], + original_kitti_format=self.original_kitti_format, + ) + ) + + +@dataclass +class Det3DBatchPredEntity(OTXBatchPredEntity, Det3DBatchDataEntity): + """Data entity to represent model output predictions for detection task.""" + + boxes: tv_tensors.BoundingBoxes + scores: Tensor + calib_matrix: Tensor + boxes_3d: Tensor + size_2d: Tensor + size_3d: Tensor + depth: Tensor + heading_angle: Tensor + labels: Tensor diff --git a/src/otx/core/data/factory.py b/src/otx/core/data/factory.py index 5d424673fee..92f836c5605 100644 --- a/src/otx/core/data/factory.py +++ b/src/otx/core/data/factory.py @@ -156,4 +156,9 @@ def create( # noqa: PLR0911 return OTXKeypointDetectionDataset(**common_kwargs) + if task == OTXTaskType.OBJECT_DETECTION_3D: + from .dataset.object_detection_3d import OTX3DObjectDetectionDataset + + return OTX3DObjectDetectionDataset(**common_kwargs) + raise NotImplementedError(task) diff --git a/src/otx/core/data/module.py b/src/otx/core/data/module.py index 259fa6f6447..f9b7cac8fd4 100644 --- a/src/otx/core/data/module.py +++ b/src/otx/core/data/module.py @@ -184,6 +184,7 @@ def __init__( # noqa: PLR0913 ) label_infos: list[LabelInfo] = [] + for name, dm_subset in dataset.subsets().items(): if name not in config_mapping: log.warning(f"{name} is not available. 
Skip it") @@ -209,7 +210,6 @@ def __init__( # noqa: PLR0913 tile_config=self.tile_config, ) self.subsets[name] = dataset - label_infos += [self.subsets[name].label_info] log.info(f"Add name: {name}, self.subsets: {self.subsets}") diff --git a/src/otx/core/data/pre_filtering.py b/src/otx/core/data/pre_filtering.py index f78d8fe1db2..b3898a78f04 100644 --- a/src/otx/core/data/pre_filtering.py +++ b/src/otx/core/data/pre_filtering.py @@ -72,7 +72,11 @@ def is_valid_annot(item: DatasetItem, annotation: Annotation) -> bool: # noqa: return True -def remove_unused_labels(dataset: DmDataset, data_format: str, ignore_index: int | None) -> DmDataset: +def remove_unused_labels( + dataset: DmDataset, + data_format: str, + ignore_index: int | None, +) -> DmDataset: """Remove unused labels in Datumaro dataset.""" original_categories: list[str] = dataset.get_label_cat_names() used_labels: list[int] = list({ann.label for item in dataset for ann in item.annotations}) @@ -99,4 +103,5 @@ def remove_unused_labels(dataset: DmDataset, data_format: str, ignore_index: int mapping = {original_categories[idx]: original_categories[idx] for idx in used_labels} msg = "There are unused labels in dataset, they will be filtered out before training." warnings.warn(msg, stacklevel=2) + return dataset.transform("remap_labels", mapping=mapping, default="delete") diff --git a/src/otx/core/data/utils/utils.py b/src/otx/core/data/utils/utils.py index 1beb7fec87d..0bdb4a48baa 100644 --- a/src/otx/core/data/utils/utils.py +++ b/src/otx/core/data/utils/utils.py @@ -10,7 +10,8 @@ import cv2 import numpy as np -from datumaro.components.annotation import AnnotationType, Bbox, Polygon, _Shape +from datumaro.components.annotation import AnnotationType, Bbox, Polygon +from datumaro.components.annotation import Shape as _Shape from otx.core.types import OTXTaskType diff --git a/src/otx/core/exporter/base.py b/src/otx/core/exporter/base.py index 8a9cbd8d1de..85d77fe4799 100644 --- a/src/otx/core/exporter/base.py +++ b/src/otx/core/exporter/base.py @@ -57,6 +57,7 @@ def __init__( pad_value: int = 0, swap_rgb: bool = False, output_names: list[str] | None = None, + input_names: list[str] | None = None, ) -> None: self.input_size = input_size self.mean = mean @@ -66,6 +67,7 @@ def __init__( self.swap_rgb = swap_rgb self.task_level_export_parameters = task_level_export_parameters self.output_names = output_names + self.input_names = input_names @property def metadata(self) -> dict[tuple[str, str], str]: @@ -319,6 +321,40 @@ def _postprocess_openvino_model(self, exported_model: openvino.Model) -> openvin ) raise RuntimeError(msg) + if self.input_names is not None: + if len(exported_model.inputs) >= len(self.input_names): + if len(exported_model.inputs) != len(self.input_names): + msg = ( + "Number of model inputs is greater than the number" + " of input names to assign. Please check input_names" + " argument of the exporter's constructor." + ) + log.warning(msg) + + for i, name in enumerate(self.input_names): + traced_names = exported_model.inputs[i].get_names() + name_found = False + for traced_name in traced_names: + if name in traced_name: + name_found = True + break + name_found = name_found and bool(len(traced_names)) + + if not name_found: + msg = ( + f"{name} is not matched with the converted model's traced input names: {traced_names}." + " Please check input_names argument of the exporter's constructor." 
+ ) + log.warning(msg) + + exported_model.inputs[i].tensor.set_names({name}) + else: + msg = ( + "Model has less inputs than the number of input names provided: " + f"{len(exported_model.inputs)} vs {len(self.input_names)}" + ) + raise RuntimeError(msg) + if self.metadata is not None: export_metadata = self._extend_model_metadata(self.metadata) exported_model = self._embed_openvino_ir_metadata(exported_model, export_metadata) diff --git a/src/otx/core/exporter/detection_3d.py b/src/otx/core/exporter/detection_3d.py new file mode 100644 index 00000000000..17b1377436a --- /dev/null +++ b/src/otx/core/exporter/detection_3d.py @@ -0,0 +1,100 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""Class definition for object detection 3D model exporter used in OTX.""" + +from __future__ import annotations + +import logging as log +from pathlib import Path +from typing import TYPE_CHECKING + +import onnx +import openvino +import torch + +from otx.core.exporter.native import OTXNativeModelExporter +from otx.core.types.precision import OTXPrecisionType + +if TYPE_CHECKING: + from otx.core.model.base import OTXModel + + +class OTXObjectDetection3DExporter(OTXNativeModelExporter): + """Class definition for object detection 3D model exporter used in OTX.""" + + def to_openvino( + self, + model: OTXModel, + output_dir: Path, + base_model_name: str = "exported_model", + precision: OTXPrecisionType = OTXPrecisionType.FP32, + ) -> Path: + """Export to OpenVINO Intermediate Representation format. + + In this implementation the export is done only via standard OV/ONNX tools. + """ + device = next(model.parameters()).device + dummy_tensor = torch.rand(self.input_size).to(device) + dummy_calib_matrix = torch.rand(1, 3, 4).to(device) + dummy_image_sizes = torch.tensor([self.input_size[::-1][:2]]).to(device) + + exported_model = openvino.convert_model( + model, + example_input={"images": dummy_tensor, "calib_matrix": dummy_calib_matrix, "img_sizes": dummy_image_sizes}, + input=( + openvino.runtime.PartialShape(self.input_size), + openvino.runtime.PartialShape([1, 3, 4]), + openvino.runtime.PartialShape([1, 2]), + ), + ) + exported_model = self._postprocess_openvino_model(exported_model) + + save_path = output_dir / (base_model_name + ".xml") + openvino.save_model(exported_model, save_path, compress_to_fp16=(precision == OTXPrecisionType.FP16)) + log.info("Converting to OpenVINO is done.") + + return Path(save_path) + + def to_onnx( + self, + model: OTXModel, + output_dir: Path, + base_model_name: str = "exported_model", + precision: OTXPrecisionType = OTXPrecisionType.FP32, + embed_metadata: bool = True, + ) -> Path: + """Export the given PyTorch model to ONNX format and save it to the specified output directory. + + Args: + model (OTXModel): The PyTorch model to be exported. + output_dir (Path): The directory where the ONNX model will be saved. + base_model_name (str, optional): The base name for the exported model. Defaults to "exported_model". + precision (OTXPrecisionType, optional): The precision type for the exported model. + Defaults to OTXPrecisionType.FP32. + embed_metadata (bool, optional): Whether to embed metadata in the ONNX model. Defaults to True. + + Returns: + Path: The path to the saved ONNX model. 
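+
+        Note:
+            The model is traced with dummy inputs: a random image tensor of
+            the exporter's input size, a random 1x3x4 calibration matrix and
+            the corresponding image sizes, mirroring the three inputs used
+            for the OpenVINO export.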
+ """ + dummy_tensor = torch.rand(self.input_size) + dummy_calib_matrix = torch.rand(1, 3, 4) + dummy_image_sizes = torch.tensor([self.input_size[::-1][:2]]) + dummy_inputs = {"images": dummy_tensor, "calib_matrix": dummy_calib_matrix, "img_sizes": dummy_image_sizes} + + save_path = str(output_dir / (base_model_name + ".onnx")) + + torch.onnx.export( + model, + args=tuple(dummy_inputs.values()), + f=save_path, + **self.onnx_export_configuration, + ) + + onnx_model = onnx.load(save_path) + onnx_model = self._postprocess_onnx_model(onnx_model, embed_metadata, precision) + + onnx.save(onnx_model, save_path) + log.info("Converting to ONNX is done.") + + return Path(save_path) diff --git a/src/otx/core/exporter/native.py b/src/otx/core/exporter/native.py index 11f90b9451d..5f901ede2b5 100644 --- a/src/otx/core/exporter/native.py +++ b/src/otx/core/exporter/native.py @@ -37,6 +37,7 @@ def __init__( via_onnx: bool = False, onnx_export_configuration: dict[str, Any] | None = None, output_names: list[str] | None = None, + input_names: list[str] | None = None, ) -> None: super().__init__( task_level_export_parameters=task_level_export_parameters, @@ -47,6 +48,7 @@ def __init__( pad_value=pad_value, swap_rgb=swap_rgb, output_names=output_names, + input_names=input_names, ) self.via_onnx = via_onnx self.onnx_export_configuration = onnx_export_configuration if onnx_export_configuration is not None else {} diff --git a/src/otx/core/metrics/average_precision_3d.py b/src/otx/core/metrics/average_precision_3d.py new file mode 100644 index 00000000000..7b8530ba684 --- /dev/null +++ b/src/otx/core/metrics/average_precision_3d.py @@ -0,0 +1,67 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""Module for OTX metric used for 3D object detection tasks.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +from torch import Tensor +from torchmetrics import Metric + +from otx.core.metrics.kitti_3d_eval import get_coco_eval_result + +if TYPE_CHECKING: + import numpy as np + + from otx.core.types.label import LabelInfo + + +class KittiMetric(Metric): + """Computes the 2D/3D average precision (coco style) for object detection 3d task. + + Args: + label_info (int): Dataclass including label information. + """ + + def __init__( + self, + label_info: LabelInfo, + ): + super().__init__() + + self.label_info: LabelInfo = label_info + self.reset() + + def reset(self) -> None: + """Reset for every validation and test epoch. + + Please be careful that some variables should not be reset for each epoch. + """ + super().reset() + self.preds: list[dict[str, np.array]] = [] + self.targets: list[dict[str, np.array]] = [] + + def update(self, preds: list[dict[str, Tensor]], target: list[dict[str, Tensor]]) -> None: + """Update total predictions and targets from given batch predicitons and targets.""" + self.preds.extend(preds) + self.targets.extend(target) + + def compute(self) -> dict: + """Compute metrics for 3d object detection.""" + current_classes = self.label_info.label_names + map_bbox, map_3d = get_coco_eval_result( + self.targets, + self.preds, + current_classes=[curcls.lower() for curcls in current_classes], + ) + # use moderate difficulty as final score. Average across all calsses. 
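+        # map_3d and map_bbox have shape [num_class, num_difficulty];
+        # column 1 is the moderate split, which is selected below and then
+        # averaged over classes.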
+ return {"mAP_bbox_3d": Tensor([map_3d[:, 1].mean()]), "mAP_bbox_2d": Tensor([map_bbox[:, 1].mean()])} + + +def _kitti_metric_measure_callable(label_info: LabelInfo) -> KittiMetric: + return KittiMetric(label_info=label_info) + + +KittiMetricCallable = _kitti_metric_measure_callable diff --git a/src/otx/core/metrics/kitti_3d_eval/__init__.py b/src/otx/core/metrics/kitti_3d_eval/__init__.py new file mode 100644 index 00000000000..236c84981f2 --- /dev/null +++ b/src/otx/core/metrics/kitti_3d_eval/__init__.py @@ -0,0 +1,8 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""Module for kitti 3d evaluation.""" + +from .eval import get_coco_eval_result + +__all__ = ["get_coco_eval_result"] diff --git a/src/otx/core/metrics/kitti_3d_eval/eval.py b/src/otx/core/metrics/kitti_3d_eval/eval.py new file mode 100644 index 00000000000..951cc96538d --- /dev/null +++ b/src/otx/core/metrics/kitti_3d_eval/eval.py @@ -0,0 +1,811 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""KITTI 3D eval for OTX.""" + +# flake8: noqa +# mypy: ignore-errors + +from __future__ import annotations + +import io as sysio +from typing import Any + +import numba +import numpy as np +import torch + +if torch.cuda.is_available(): + from .rotate_gpu_iou import rotate_iou_eval_gpu as rotate_iou_eval +else: + from .rotate_iou import rotate_iou_eval_cpu as rotate_iou_eval + + +@numba.jit(nopython=True) +def get_thresholds( + scores: np.ndarray, # 1D array of confidence scores + num_gt: int, # Number of ground truth objects + num_sample_pts: int = 41, # Number of sample points used to compute recall thresholds +) -> np.ndarray: # 1D array of recall thresholds + """Compute recall thresholds for a given score array. + + Args: + scores (np.ndarray): 1D array of confidence scores. + num_gt (int): Number of ground truth objects. + num_sample_pts (int, optional): Number of sample points used to + compute recall thresholds. Defaults to 41. + + Returns: + np.ndarray: 1D array of recall thresholds. + """ + scores.sort() + scores = scores[::-1] + current_recall = 0 + thresholds = [] + for i, score in enumerate(scores): + l_recall = (i + 1) / num_gt + if i < (len(scores) - 1): + r_recall = (i + 2) / num_gt + else: + r_recall = l_recall + if ((r_recall - current_recall) < (current_recall - l_recall)) and (i < (len(scores) - 1)): + continue + # recall = l_recall + thresholds.append(score) + current_recall += 1 / (num_sample_pts - 1.0) + return thresholds + + +def clean_data( + gt_anno: dict, # ground truth annotations + dt_anno: dict, # detection results + current_class: str, # the current class name + difficulty: int, # the difficulty level +) -> tuple: # (num_valid_gt, ignored_gt, ignored_dt, dc_bboxes) + """Filter out the objects that are not in the current class. + + Args: + gt_anno (dict): Ground truth annotations. + dt_anno (dict): Detection results. + current_class (str): The current class name. + difficulty (int): The difficulty level. + + Returns: + tuple: The number of valid objects, ignored_gt, ignored_dt, and dc_bboxes. 
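+
+    Note:
+        In ignored_gt / ignored_dt, 0 marks boxes that take part in the
+        evaluation, 1 marks boxes that are ignored (neighbouring classes,
+        ground truth beyond the occlusion/truncation/height limits of the
+        difficulty, or detections below the minimum height), and -1 marks
+        boxes of unrelated classes.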
+ """ + MIN_HEIGHT = [40, 25, 25] + MAX_OCCLUSION = [0, 1, 2] + MAX_TRUNCATION = [0.15, 0.3, 0.5] + dc_bboxes, ignored_gt, ignored_dt = [], [], [] + num_gt = len(gt_anno["name"]) + num_dt = len(dt_anno["name"]) + num_valid_gt = 0 + for i in range(num_gt): + bbox = gt_anno["bbox"][i] + gt_name = gt_anno["name"][i].lower() + height = bbox[3] - bbox[1] + valid_class = -1 + if gt_name == current_class: + valid_class = 1 + elif current_class == "Pedestrian".lower() and "Person_sitting".lower() == gt_name: + valid_class = 0 + elif current_class == "Car".lower() and "Van".lower() == gt_name: + valid_class = 0 + else: + valid_class = -1 + ignore = False + if ( + (gt_anno["occluded"][i] > MAX_OCCLUSION[difficulty]) + or (gt_anno["truncated"][i] > MAX_TRUNCATION[difficulty]) + or (height <= MIN_HEIGHT[difficulty]) + ): + # if gt_anno["difficulty"][i] > difficulty or gt_anno["difficulty"][i] == -1: + ignore = True + if valid_class == 1 and not ignore: + ignored_gt.append(0) + num_valid_gt += 1 + elif valid_class == 0 or (ignore and (valid_class == 1)): + ignored_gt.append(1) + else: + ignored_gt.append(-1) + # for i in range(num_gt): + if gt_anno["name"][i] == "DontCare": + dc_bboxes.append(gt_anno["bbox"][i]) + for i in range(num_dt): + if dt_anno["name"][i].lower() == current_class: + valid_class = 1 + else: + valid_class = -1 + height = abs(dt_anno["bbox"][i, 3] - dt_anno["bbox"][i, 1]) + if height < MIN_HEIGHT[difficulty]: + ignored_dt.append(1) + elif valid_class == 1: + ignored_dt.append(0) + else: + ignored_dt.append(-1) + + return num_valid_gt, ignored_gt, ignored_dt, dc_bboxes + + +@numba.jit(nopython=True) +def image_box_overlap( + boxes: np.ndarray, # shape: (N, 4) + query_boxes: np.ndarray, # shape: (K, 4) + criterion: int = -1, # default overlap criterion, -1: intersection over union, 0: intersection over box area, 1: intersection over query box area +) -> np.ndarray: # shape: (N, K) + """Args: + boxes (np.ndarray): shape: (N, 4), 2D boxes, (x1, y1, x2, y2) + query_boxes (np.ndarray): shape: (K, 4), 2D boxes, (x1, y1, x2, y2) + criterion (int, optional): overlap criterion, -1: intersection over union, 0: intersection over box area, 1: intersection over query box area. Defaults to -1. + + Returns: + np.ndarray: shape: (N, K), overlap between boxes and query_boxes + """ + N = boxes.shape[0] + K = query_boxes.shape[0] + overlaps = np.zeros((N, K), dtype=boxes.dtype) + for k in range(K): + qbox_area = (query_boxes[k, 2] - query_boxes[k, 0]) * (query_boxes[k, 3] - query_boxes[k, 1]) + for n in range(N): + iw = min(boxes[n, 2], query_boxes[k, 2]) - max(boxes[n, 0], query_boxes[k, 0]) + if iw > 0: + ih = min(boxes[n, 3], query_boxes[k, 3]) - max(boxes[n, 1], query_boxes[k, 1]) + if ih > 0: + if criterion == -1: + ua = (boxes[n, 2] - boxes[n, 0]) * (boxes[n, 3] - boxes[n, 1]) + qbox_area - iw * ih + elif criterion == 0: + ua = (boxes[n, 2] - boxes[n, 0]) * (boxes[n, 3] - boxes[n, 1]) + elif criterion == 1: + ua = qbox_area + else: + ua = 1.0 + overlaps[n, k] = iw * ih / ua + return overlaps + + +@numba.jit(nopython=True) +def d3_box_overlap_kernel( + boxes: np.ndarray, # shape: (N, 7) + qboxes: np.ndarray, # shape: (K, 7) + rinc: np.ndarray, # shape: (N, K) + criterion: int = -1, # default overlap criterion +) -> None: + """Args: + boxes: Array of shape (N, 7) representing N 3D boxes. + qboxes: Array of shape (K, 7) representing K 3D boxes. + rinc: Array of shape (N, K) representing the overlap between boxes + and qboxes. + criterion: Overlap criterion. Defaults to -1. 
If -1, uses the + intersection-over-union (IoU) criterion. If 0, uses the + intersection-over-area1 criterion. If 1, uses the + intersection-over-area2 criterion. + + Returns: + None + """ + # ONLY support overlap in CAMERA, not lidar. + N, K = boxes.shape[0], qboxes.shape[0] + for i in range(N): + for j in range(K): + if rinc[i, j] > 0: + # iw = (min(boxes[i, 1] + boxes[i, 4], qboxes[j, 1] + + # qboxes[j, 4]) - max(boxes[i, 1], qboxes[j, 1])) + iw = min(boxes[i, 1], qboxes[j, 1]) - max(boxes[i, 1] - boxes[i, 4], qboxes[j, 1] - qboxes[j, 4]) + + if iw > 0: + area1 = boxes[i, 3] * boxes[i, 4] * boxes[i, 5] + area2 = qboxes[j, 3] * qboxes[j, 4] * qboxes[j, 5] + inc = iw * rinc[i, j] + if criterion == -1: + ua = area1 + area2 - inc + elif criterion == 0: + ua = area1 + elif criterion == 1: + ua = area2 + else: + ua = inc + rinc[i, j] = inc / ua + else: + rinc[i, j] = 0.0 + + +@numba.jit(nopython=True) +def compute_statistics_jit( + overlaps: np.ndarray, # shape: (total_dt_num, total_gt_num) + gt_datas: np.ndarray, # shape: (total_gt_num, 7) + dt_datas: np.ndarray, # shape: (total_dt_num, 7) + ignored_gt: list[int], # shape: (total_gt_num) + ignored_det: list[int], # shape: (total_dt_num) + dc_bboxes: np.ndarray, # shape: (total_dc_num, 4) + metric: int, + min_overlap: float, + thresh: float = 0, + compute_fp: bool = False, + compute_aos: bool = False, +) -> tuple[int, int, int, float, np.ndarray]: + """This function computes statistics of an evaluation. + + Args: + overlaps (np.ndarray): Overlap between dt and gt bboxes. + gt_datas (np.ndarray): Ground truth data. + dt_datas (np.ndarray): Detection data. + ignored_gt (List[int]): Ignore ground truth indices. + ignored_det (List[int]): Ignore detection indices. + dc_bboxes (np.ndarray): Don't care bboxes. + metric (int): Evaluation metric. + min_overlap (float): Minimum overlap between dt and gt bboxes. + thresh (float): Detection score threshold. Defaults to 0. + compute_fp (bool): Whether to compute false positives. Defaults to False. + compute_aos (bool): Whether to compute average orientation similarity. Defaults to False. 
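+
+    Note:
+        `metric` follows the same encoding as the rest of this module:
+        0 for 2D bbox, 1 for BEV, 2 for 3D boxes; the entry points in this
+        module only call it with 0 and 2.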
+ + Returns: + Tuple[int, int, int, float, np.ndarray]: tp, fp, fn, similarity, thresholds + """ + det_size = dt_datas.shape[0] + gt_size = gt_datas.shape[0] + dt_scores = dt_datas[:, -1] + dt_alphas = dt_datas[:, 4] + gt_alphas = gt_datas[:, 4] + dt_bboxes = dt_datas[:, :4] + + assigned_detection = [False] * det_size + ignored_threshold = [False] * det_size + if compute_fp: + for i in range(det_size): + if dt_scores[i] < thresh: + ignored_threshold[i] = True + NO_DETECTION = -10000000 + tp, fp, fn, similarity = 0, 0, 0, 0 + # thresholds = [0.0] + # delta = [0.0] + thresholds = np.zeros((gt_size,)) + thresh_idx = 0 + delta = np.zeros((gt_size,)) + delta_idx = 0 + for i in range(gt_size): + if ignored_gt[i] == -1: + continue + det_idx = -1 + valid_detection = NO_DETECTION + max_overlap = 0 + assigned_ignored_det = False + + for j in range(det_size): + if ignored_det[j] == -1: + continue + if assigned_detection[j]: + continue + if ignored_threshold[j]: + continue + overlap = overlaps[j, i] + dt_score = dt_scores[j] + if not compute_fp and (overlap > min_overlap) and dt_score > valid_detection: + det_idx = j + valid_detection = dt_score + elif ( + compute_fp + and (overlap > min_overlap) + and (overlap > max_overlap or assigned_ignored_det) + and ignored_det[j] == 0 + ): + max_overlap = overlap + det_idx = j + valid_detection = 1 + assigned_ignored_det = False + elif compute_fp and (overlap > min_overlap) and (valid_detection == NO_DETECTION) and ignored_det[j] == 1: + det_idx = j + valid_detection = 1 + assigned_ignored_det = True + + if (valid_detection == NO_DETECTION) and ignored_gt[i] == 0: + fn += 1 + elif (valid_detection != NO_DETECTION) and (ignored_gt[i] == 1 or ignored_det[det_idx] == 1): + assigned_detection[det_idx] = True + elif valid_detection != NO_DETECTION: + tp += 1 + # thresholds.append(dt_scores[det_idx]) + thresholds[thresh_idx] = dt_scores[det_idx] + thresh_idx += 1 + if compute_aos: + # delta.append(gt_alphas[i] - dt_alphas[det_idx]) + delta[delta_idx] = gt_alphas[i] - dt_alphas[det_idx] + delta_idx += 1 + + assigned_detection[det_idx] = True + if compute_fp: + for i in range(det_size): + if not (assigned_detection[i] or ignored_det[i] == -1 or ignored_det[i] == 1 or ignored_threshold[i]): + fp += 1 + nstuff = 0 + if metric == 0: + overlaps_dt_dc = image_box_overlap(dt_bboxes, dc_bboxes, 0) + for i in range(dc_bboxes.shape[0]): + for j in range(det_size): + if assigned_detection[j]: + continue + if ignored_det[j] == -1 or ignored_det[j] == 1: + continue + if ignored_threshold[j]: + continue + if overlaps_dt_dc[j, i] > min_overlap: + assigned_detection[j] = True + nstuff += 1 + fp -= nstuff + if compute_aos: + tmp = np.zeros((fp + delta_idx,)) + # tmp = [0] * fp + for i in range(delta_idx): + tmp[i + fp] = (1.0 + np.cos(delta[i])) / 2.0 + # tmp.append((1.0 + np.cos(delta[i])) / 2.0) + # assert len(tmp) == fp + tp + # assert len(delta) == tp + if tp > 0 or fp > 0: + similarity = np.sum(tmp) + else: + similarity = -1 + return tp, fp, fn, similarity, thresholds[:thresh_idx] + + +@numba.jit(nopython=True) +def get_split_parts(num: int, num_part: int) -> list[int]: + """Split a number into parts. + + Args: + num (int): The number to split. + num_part (int): The number of parts to split into. + + Returns: + List[int]: A list of the parts. 
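+
+    Example:
+        get_split_parts(10, 3) returns [3, 3, 3, 1];
+        get_split_parts(9, 3) returns [3, 3, 3].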
+ """ + same_part = num // num_part + remain_num = num % num_part + if same_part == 0: + return [num] + + if remain_num == 0: + return [same_part] * num_part + else: + return [same_part] * num_part + [remain_num] + + +@numba.jit(nopython=True) +def fused_compute_statistics( + overlaps: np.ndarray, # shape: (total_dt_num, total_gt_num) + pr: np.ndarray, # shape: (num_thresholds, 4) + gt_nums: np.ndarray, # shape: (num_samples) + dt_nums: np.ndarray, # shape: (num_samples) + dc_nums: np.ndarray, # shape: (num_samples) + gt_datas: np.ndarray, # shape: (total_gt_num, 7) + dt_datas: np.ndarray, # shape: (total_dt_num, 7) + dontcares: np.ndarray, # shape: (total_dc_num, 4) + ignored_gts: np.ndarray, # shape: (total_gt_num) + ignored_dets: np.ndarray, # shape: (total_dt_num) + metric: int, + min_overlap: float, + thresholds: np.ndarray, # shape: (num_thresholds) + compute_aos: bool = False, +) -> None: + """Fast compute statistics. Must be used in CAMERA coordinate system. + + Args: + overlaps: 2D array of shape (total_dt_num, total_gt_num) + [dt_num, gt_num] is the overlap between dt_num-th detection + and gt_num-th ground truth + pr: 2D array of shape (num_thresholds, 4) + [t, 0] is the number of true positives at threshold t + [t, 1] is the number of false positives at threshold t + [t, 2] is the number of false negatives at threshold t + [t, 3] is the similarity at threshold t + gt_nums: 1D array of shape (num_samples) + gt_nums[i] is the number of ground truths in i-th sample + dt_nums: 1D array of shape (num_samples) + dt_nums[i] is the number of detections in i-th sample + dc_nums: 1D array of shape (num_samples) + dc_nums[i] is the number of dontcare areas in i-th sample + gt_datas: 2D array of shape (total_gt_num, 7) + gt_datas[i] is the i-th ground truth box + dt_datas: 2D array of shape (total_dt_num, 7) + dt_datas[i] is the i-th detection box + dontcares: 2D array of shape (total_dc_num, 4) + dontcares[i] is the i-th dontcare area + ignored_gts: 1D array of shape (total_gt_num) + ignored_gts[i] is 1 if the i-th ground truth is ignored, 0 otherwise + ignored_dets: 1D array of shape (total_dt_num) + ignored_dets[i] is 1 if the i-th detection is ignored, 0 otherwise + metric: Eval type. 0: bbox, 1: bev, 2: 3d + min_overlap: Min overlap + thresholds: 1D array of shape (num_thresholds) + thresholds[i] is the i-th threshold + compute_aos: Whether to compute aos + """ + gt_num = 0 + dt_num = 0 + dc_num = 0 + for i in range(gt_nums.shape[0]): + for t, thresh in enumerate(thresholds): + overlap = overlaps[dt_num : dt_num + dt_nums[i], gt_num : gt_num + gt_nums[i]] + gt_data = gt_datas[gt_num : gt_num + gt_nums[i]] + dt_data = dt_datas[dt_num : dt_num + dt_nums[i]] + ignored_gt = ignored_gts[gt_num : gt_num + gt_nums[i]] + ignored_det = ignored_dets[dt_num : dt_num + dt_nums[i]] + dontcare = dontcares[dc_num : dc_num + dc_nums[i]] + tp, fp, fn, similarity, _ = compute_statistics_jit( + overlap, + gt_data, + dt_data, + ignored_gt, + ignored_det, + dontcare, + metric, + min_overlap=min_overlap, + thresh=thresh, + compute_fp=True, + compute_aos=compute_aos, + ) + pr[t, 0] += tp + pr[t, 1] += fp + pr[t, 2] += fn + if similarity != -1: + pr[t, 3] += similarity + gt_num += gt_nums[i] + dt_num += dt_nums[i] + dc_num += dc_nums[i] + + +def calculate_iou_partly( + gt_annos: list[dict[str, Any]], + dt_annos: list[dict[str, Any]], + metric: int, + num_parts: int = 50, +) -> tuple[list[np.ndarray], list[np.ndarray], np.ndarray, np.ndarray]: + """Fast iou algorithm. 
This function can be used independently to + do result analysis. Must be used in CAMERA coordinate system. + + Args: + gt_annos: List of dict, must from get_label_annos() in kitti_common.py + dt_annos: List of dict, must from get_label_annos() in kitti_common.py + metric: Eval type. 0: bbox, 1: bev, 2: 3d + num_parts: Int, a parameter for fast calculate algorithm + + Returns: + Tuple of + overlaps: List of numpy arrays, shape (num_gt, num_dt) + parted_overlaps: List of numpy arrays, shape (num_gt, num_dt) + total_gt_num: Numpy array, shape (num_images,) + total_dt_num: Numpy array, shape (num_images,) + """ + + def d3_box_overlap(boxes, qboxes, criterion=-1): + rinc = rotate_iou_eval(boxes[:, [0, 2, 3, 5, 6]], qboxes[:, [0, 2, 3, 5, 6]], 2) + d3_box_overlap_kernel(boxes, qboxes, rinc, criterion) + return rinc + + assert len(gt_annos) == len(dt_annos) + total_dt_num = np.stack([len(a["name"]) for a in dt_annos], 0) + total_gt_num = np.stack([len(a["name"]) for a in gt_annos], 0) + num_examples = len(gt_annos) + split_parts = get_split_parts(num_examples, num_parts) + parted_overlaps = [] + example_idx = 0 + + for num_part in split_parts: + gt_annos_part = gt_annos[example_idx : example_idx + num_part] + dt_annos_part = dt_annos[example_idx : example_idx + num_part] + if metric == 0: + gt_boxes = np.concatenate([a["bbox"] for a in gt_annos_part], 0) + dt_boxes = np.concatenate([a["bbox"] for a in dt_annos_part], 0) + overlap_part = image_box_overlap(gt_boxes, dt_boxes) + elif metric == 2: + loc = np.concatenate([a["location"] for a in gt_annos_part], 0) + dims = np.concatenate([a["dimensions"] for a in gt_annos_part], 0) + rots = np.concatenate([a["rotation_y"] for a in gt_annos_part], 0) + gt_boxes = np.concatenate([loc, dims, rots[..., np.newaxis]], axis=1) + loc = np.concatenate([a["location"] for a in dt_annos_part], 0) + dims = np.concatenate([a["dimensions"] for a in dt_annos_part], 0) + rots = np.concatenate([a["rotation_y"] for a in dt_annos_part], 0) + dt_boxes = np.concatenate([loc, dims, rots[..., np.newaxis]], axis=1) + overlap_part = d3_box_overlap(gt_boxes, dt_boxes).astype(np.float64) + else: + raise ValueError("unknown metric") + parted_overlaps.append(overlap_part) + example_idx += num_part + overlaps = [] + example_idx = 0 + for j, num_part in enumerate(split_parts): + gt_annos_part = gt_annos[example_idx : example_idx + num_part] + dt_annos_part = dt_annos[example_idx : example_idx + num_part] + gt_num_idx, dt_num_idx = 0, 0 + for i in range(num_part): + gt_box_num = total_gt_num[example_idx + i] + dt_box_num = total_dt_num[example_idx + i] + overlaps.append( + parted_overlaps[j][gt_num_idx : gt_num_idx + gt_box_num, dt_num_idx : dt_num_idx + dt_box_num], + ) + gt_num_idx += gt_box_num + dt_num_idx += dt_box_num + example_idx += num_part + + return overlaps, parted_overlaps, total_gt_num, total_dt_num + + +def _prepare_data( + gt_annos: list[dict[str, Any]], + dt_annos: list[dict[str, Any]], + current_class: str, + difficulty: int, +) -> tuple[list[np.ndarray], list[np.ndarray], list[np.ndarray], list[np.ndarray], list[np.ndarray], np.ndarray, int]: + """Prepare data for evaluation. + + Args: + gt_annos (List[Dict[str, Any]]): Ground truth annotations. + dt_annos (List[Dict[str, Any]]): Detection annotations. + current_class (str): Current class name. + difficulty (int): Difficulty level. 
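+            Follows the convention used by eval_class: 0 = easy,
+            1 = normal (moderate), 2 = hard.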
+ + Returns: + Tuple[List[np.ndarray], List[np.ndarray], List[np.ndarray], List[np.ndarray], List[np.ndarray], np.ndarray, int]: + gt_datas_list, dt_datas_list, ignored_gts, ignored_dets, dontcares, total_dc_num, total_num_valid_gt + """ + gt_datas_list = [] + dt_datas_list = [] + total_dc_num = [] + ignored_gts, ignored_dets, dontcares = [], [], [] + total_num_valid_gt = 0 + for i in range(len(gt_annos)): + rets = clean_data(gt_annos[i], dt_annos[i], current_class, difficulty) + num_valid_gt, ignored_gt, ignored_det, dc_bboxes = rets + ignored_gts.append(np.array(ignored_gt, dtype=np.int64)) + ignored_dets.append(np.array(ignored_det, dtype=np.int64)) + if len(dc_bboxes) == 0: + dc_bboxes = np.zeros((0, 4)).astype(np.float64) + else: + dc_bboxes = np.stack(dc_bboxes, 0).astype(np.float64) + total_dc_num.append(dc_bboxes.shape[0]) + dontcares.append(dc_bboxes) + total_num_valid_gt += num_valid_gt + gt_datas = np.concatenate([gt_annos[i]["bbox"], gt_annos[i]["alpha"][..., np.newaxis]], 1) + dt_datas = np.concatenate( + [ + dt_annos[i]["bbox"], + dt_annos[i]["alpha"][..., np.newaxis], + dt_annos[i]["score"][..., np.newaxis], + ], + 1, + ) + gt_datas_list.append(gt_datas) + dt_datas_list.append(dt_datas) + total_dc_num = np.stack(total_dc_num, axis=0) + return (gt_datas_list, dt_datas_list, ignored_gts, ignored_dets, dontcares, total_dc_num, total_num_valid_gt) + + +def eval_class( + gt_annos: list[dict[str, Any]], + dt_annos: list[dict[str, Any]], + current_classes: list[str], + difficultys: list[int], + metric: int, + min_overlaps: np.ndarray, + compute_aos: bool = False, + num_parts: int = 50, +) -> dict[str, np.ndarray]: + """Kitti eval. support 2d/bev/3d/aos eval. support 0.5:0.05:0.95 coco AP. + + Args: + gt_annos: dict, must from get_label_annos() in kitti_common.py + dt_annos: dict, must from get_label_annos() in kitti_common.py + current_classes: list of label names + difficultys: list of int. eval difficulty, 0: easy, 1: normal, 2: hard + metric: eval type. 0: bbox, 1: bev, 2: 3d + min_overlaps: float, min overlap. format: [num_overlap, metric, class]. + num_parts: int. 
a parameter for fast calculate algorithm + + Returns: + dict of recall, precision and aos + """ + assert len(gt_annos) == len(dt_annos) + num_examples = len(gt_annos) + split_parts = get_split_parts(num_examples, num_parts) + + rets = calculate_iou_partly(dt_annos, gt_annos, metric, num_parts) + overlaps, parted_overlaps, total_dt_num, total_gt_num = rets + N_SAMPLE_PTS = 41 + num_minoverlap = len(min_overlaps) + num_class = len(current_classes) + num_difficulty = len(difficultys) + precision = np.zeros([num_class, num_difficulty, num_minoverlap, N_SAMPLE_PTS]) + recall = np.zeros([num_class, num_difficulty, num_minoverlap, N_SAMPLE_PTS]) + aos = np.zeros([num_class, num_difficulty, num_minoverlap, N_SAMPLE_PTS]) + for m, current_class in enumerate(current_classes): + for l, difficulty in enumerate(difficultys): + rets = _prepare_data(gt_annos, dt_annos, current_class, difficulty) + ( + gt_datas_list, + dt_datas_list, + ignored_gts, + ignored_dets, + dontcares, + total_dc_num, + total_num_valid_gt, + ) = rets + for k, min_overlap in enumerate(min_overlaps[:, metric, m]): + thresholdss = [] + for i in range(len(gt_annos)): + rets = compute_statistics_jit( + overlaps[i], + gt_datas_list[i], + dt_datas_list[i], + ignored_gts[i], + ignored_dets[i], + dontcares[i], + metric, + min_overlap=min_overlap, + thresh=0.0, + compute_fp=False, + ) + tp, fp, fn, similarity, thresholds = rets + thresholdss += thresholds.tolist() + thresholdss = np.array(thresholdss) + thresholds = get_thresholds(thresholdss, total_num_valid_gt) + thresholds = np.array(thresholds) + pr = np.zeros([len(thresholds), 4]) + idx = 0 + for j, num_part in enumerate(split_parts): + gt_datas_part = np.concatenate(gt_datas_list[idx : idx + num_part], 0) + dt_datas_part = np.concatenate(dt_datas_list[idx : idx + num_part], 0) + dc_datas_part = np.concatenate(dontcares[idx : idx + num_part], 0) + ignored_dets_part = np.concatenate(ignored_dets[idx : idx + num_part], 0) + ignored_gts_part = np.concatenate(ignored_gts[idx : idx + num_part], 0) + fused_compute_statistics( + parted_overlaps[j], + pr, + total_gt_num[idx : idx + num_part], + total_dt_num[idx : idx + num_part], + total_dc_num[idx : idx + num_part], + gt_datas_part, + dt_datas_part, + dc_datas_part, + ignored_gts_part, + ignored_dets_part, + metric, + min_overlap=min_overlap, + thresholds=thresholds, + compute_aos=compute_aos, + ) + idx += num_part + for i in range(len(thresholds)): + recall[m, l, k, i] = pr[i, 0] / (pr[i, 0] + pr[i, 2]) + precision[m, l, k, i] = pr[i, 0] / (pr[i, 0] + pr[i, 1]) + if compute_aos: + aos[m, l, k, i] = pr[i, 3] / (pr[i, 0] + pr[i, 1]) + for i in range(len(thresholds)): + precision[m, l, k, i] = np.max(precision[m, l, k, i:], axis=-1) + recall[m, l, k, i] = np.max(recall[m, l, k, i:], axis=-1) + if compute_aos: + aos[m, l, k, i] = np.max(aos[m, l, k, i:], axis=-1) + ret_dict = { + "recall": recall, + "precision": precision, + "orientation": aos, + } + return ret_dict + + +def print_str(value, *arg, sstream=None): + if sstream is None: + sstream = sysio.StringIO() + sstream.truncate(0) + sstream.seek(0) + print(value, *arg, file=sstream) + return sstream.getvalue() + + +def do_eval_cut_version( + gt_annos: list[dict[str, Any]], # type hint + dt_annos: list[dict[str, Any]], # type hint + current_classes: list[str], # type hint + min_overlaps: np.ndarray, # type hint + compute_aos: bool = False, # type hint +) -> tuple[float, float]: # type hint + """Evaluates detections with COCO style AP. 
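+
+    The AP is the 11-point interpolated average over the 41 recall sample
+    points (every 4th point), computed for the 2D bbox overlap (metric 0)
+    and the 3D box overlap (metric 2) over the three difficulty levels.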
+ + Args: + gt_annos (List[dict]): Ground truth annotations. + dt_annos (List[dict]): Detection results. + current_classes (List[str]): Classes to evaluate. + min_overlaps (np.ndarray): Overlap ranges. + compute_aos (bool): Whether to compute aos. + + Returns: + Tuple[float, float]: Bounding box and 3D bounding box AP. + """ + + def _get_mAP(prec: np.ndarray) -> np.ndarray: + sums = 0 + for i in range(0, prec.shape[-1], 4): + sums = sums + prec[..., i] + return sums / 11 * 100 + + # min_overlaps: [num_minoverlap, metric, num_class] + difficultys = [0, 1, 2] + ret = eval_class(gt_annos, dt_annos, current_classes, difficultys, 0, min_overlaps, compute_aos) + # ret: [num_class, num_diff, num_minoverlap, num_sample_points] + # get 2D bbox mAP + mAP_bbox = _get_mAP(ret["precision"]) + + # get 3D bbox mAP + ret = eval_class(gt_annos, dt_annos, current_classes, difficultys, 2, min_overlaps) + mAP_3d = _get_mAP(ret["precision"]) + + return mAP_bbox, mAP_3d + + +def get_coco_eval_result( + gt_annos: list[dict], + dt_annos: list[dict], + current_classes: list[str], +) -> tuple[np.ndarray, np.ndarray]: + """Evaluates detections with COCO style AP. + + Args: + gt_annos (List[dict]): Ground truth annotations. + dt_annos (List[dict]): Detection results. + current_classes (List[str]): Classes to evaluate. + + Returns: + Tuple[np.ndarray, np.ndarray]: Bounding box and 3D bounding box AP. + """ + + def do_coco_style_eval( + gt_annos: list[dict], + dt_annos: list[dict], + current_classes: list[str], + overlap_ranges: np.ndarray, + compute_aos: bool, + ) -> tuple[np.ndarray, np.ndarray]: + """Evaluates detections with COCO style AP. + + Args: + gt_annos (List[dict]): Ground truth annotations. + dt_annos (List[dict]): Detection results. + current_classes (List[str]): Classes to evaluate. + overlap_ranges (np.ndarray): Overlap ranges. + compute_aos (bool): Whether to compute aos. + + Returns: + Tuple[np.ndarray, np.ndarray]: Bounding box and 3D bounding box AP. 
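+
+            The AP of each class is averaged over 10 IoU thresholds taken
+            from `overlap_ranges` (0.5 to 0.95 in the caller), which is what
+            makes the evaluation "COCO style".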
+ """ + min_overlaps = np.zeros([10, *overlap_ranges.shape[1:]]) + + for i in range(overlap_ranges.shape[1]): + for j in range(overlap_ranges.shape[2]): + min_overlaps[:, i, j] = np.linspace(*overlap_ranges[:, i, j][:2], 10) + + mAP_bbox, mAP_3d = do_eval_cut_version(gt_annos, dt_annos, current_classes, min_overlaps, compute_aos) + + return mAP_bbox.mean(-1), mAP_3d.mean(-1) + + iou_range = [0.5, 0.95, 10] + if not isinstance(current_classes, (list, tuple)): + current_classes = [current_classes] + + overlap_ranges = np.zeros([3, 3, len(current_classes)]) + for i, curcls in enumerate(current_classes): + # IoU from 0.5 to 0.95 + overlap_ranges[:, :, i] = np.array(iou_range)[:, np.newaxis] + result = "" + # check whether alpha is valid + compute_aos = False + mAPbbox, mAP3d = do_coco_style_eval(gt_annos, dt_annos, current_classes, overlap_ranges, compute_aos) + + for j, curcls in enumerate(current_classes): + # mAP threshold array: [num_minoverlap, metric, class] + # mAP result: [num_class, num_diff, num_minoverlap] + o_range = np.array(iou_range)[[0, 2, 1]] + o_range[1] = (o_range[2] - o_range[0]) / (o_range[1] - 1) + result += print_str(f"{curcls} " "coco AP@{:.2f}:{:.2f}:{:.2f}:".format(*o_range)) + result += print_str(f"bbox AP:{mAPbbox[j, 0]:.2f}, {mAPbbox[j, 1]:.2f}, {mAPbbox[j, 2]:.2f}") + result += print_str(f"3d AP:{mAP3d[j, 0]:.2f}, {mAP3d[j, 1]:.2f}, {mAP3d[j, 2]:.2f}") + + print("\n COCO style evaluation results: \n", result) + + return mAPbbox, mAP3d diff --git a/src/otx/core/metrics/kitti_3d_eval/rotate_gpu_iou.py b/src/otx/core/metrics/kitti_3d_eval/rotate_gpu_iou.py new file mode 100644 index 00000000000..3aff2c4197a --- /dev/null +++ b/src/otx/core/metrics/kitti_3d_eval/rotate_gpu_iou.py @@ -0,0 +1,495 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""Rotate IoU for KITTI3D metric, gpu version.""" + +import math + +import numba +import numpy as np +from numba import cuda + + +@numba.jit(nopython=True) +def div_up(m: int, n: int) -> int: + """Divide m by n and round up to the nearest integer. + + Args: + m (int): Numerator. + n (int): Denominator. + + Returns: + int: Result of the division rounded up to the nearest integer. + """ + return m // n + (m % n > 0) + + +@cuda.jit("(float32[:], float32[:], float32[:])", device=True, inline=True) +def trangle_area(a: cuda.local.array, b: cuda.local.array, c: cuda.local.array) -> float: + """Calculate the area of a triangle given its three vertices. + + Args: + a (cuda.local.array): First vertex of the triangle. + b (cuda.local.array): Second vertex of the triangle. + c (cuda.local.array): Third vertex of the triangle. + + Returns: + float: Area of the triangle. + """ + return ((a[0] - c[0]) * (b[1] - c[1]) - (a[1] - c[1]) * (b[0] - c[0])) / 2.0 + + +@cuda.jit("(float32[:], int32)", device=True, inline=True) +def area(int_pts: cuda.local.array, num_of_inter: int) -> float: + """Calculate the area of a polygon using the given intersection points. + + Args: + int_pts (cuda.local.array): Array of intersection points, shape (num_of_inter * 2,). + num_of_inter (int): Number of intersection points. + + Returns: + float: The calculated area of the polygon. 
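+
+    Note:
+        The area is accumulated as a fan of triangles anchored at the first
+        vertex, so the intersection points must already be sorted into a
+        consistent convex order (see sort_vertex_in_convex_polygon).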
+ """ + area_val = 0.0 + for i in range(num_of_inter - 2): + area_val += abs(trangle_area(int_pts[:2], int_pts[2 * i + 2 : 2 * i + 4], int_pts[2 * i + 4 : 2 * i + 6])) + return area_val + + +@cuda.jit("(float32[:], int32)", device=True, inline=True) +def sort_vertex_in_convex_polygon(int_pts: cuda.local.array, num_of_inter: int) -> None: + """Sort the vertices of a convex polygon in counterclockwise order. + + Args: + int_pts (cuda.local.array): Array of intersection points. + num_of_inter (int): Number of intersection points. + """ + if num_of_inter > 0: + center = cuda.local.array((2,), dtype=numba.float32) + center[:] = 0.0 + for i in range(num_of_inter): + center[0] += int_pts[2 * i] + center[1] += int_pts[2 * i + 1] + center[0] /= num_of_inter + center[1] /= num_of_inter + v = cuda.local.array((2,), dtype=numba.float32) + vs = cuda.local.array((16,), dtype=numba.float32) + for i in range(num_of_inter): + v[0] = int_pts[2 * i] - center[0] + v[1] = int_pts[2 * i + 1] - center[1] + d = math.sqrt(v[0] * v[0] + v[1] * v[1]) + v[0] = v[0] / d + v[1] = v[1] / d + if v[1] < 0: + v[0] = -2 - v[0] + vs[i] = v[0] + j = 0 + temp = 0 + for i in range(1, num_of_inter): + if vs[i - 1] > vs[i]: + temp = vs[i] + tx = int_pts[2 * i] + ty = int_pts[2 * i + 1] + j = i + while j > 0 and vs[j - 1] > temp: + vs[j] = vs[j - 1] + int_pts[j * 2] = int_pts[j * 2 - 2] + int_pts[j * 2 + 1] = int_pts[j * 2 - 1] + j -= 1 + + vs[j] = temp + int_pts[j * 2] = tx + int_pts[j * 2 + 1] = ty + + +@cuda.jit("(float32[:], float32[:], int32, int32, float32[:])", device=True, inline=True) +def line_segment_intersection( + pts1: cuda.local.array, # array of points representing the first line segment + pts2: cuda.local.array, # array of points representing the second line segment + i: int, # index of the first line segment + j: int, # index of the second line segment + temp_pts: cuda.local.array, # array to store the intersection point +) -> bool: + """Check if two line segments intersect and find the intersection point. + + Args: + pts1 (cuda.local.array): Array of points representing the first line segment. + pts2 (cuda.local.array): Array of points representing the second line segment. + i (int): Index of the first line segment. + j (int): Index of the second line segment. + temp_pts (cuda.local.array): Array to store the intersection point. + + Returns: + bool: True if the line segments intersect, False otherwise. 
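+
+    Note:
+        The "segments" are the i-th edge of the first box and the j-th edge
+        of the second box, i.e. the corner pairs (i, (i + 1) % 4) and
+        (j, (j + 1) % 4) of the flattened corner arrays.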
+ """ + a = cuda.local.array((2,), dtype=numba.float32) + b = cuda.local.array((2,), dtype=numba.float32) + c = cuda.local.array((2,), dtype=numba.float32) + d = cuda.local.array((2,), dtype=numba.float32) + + a[0] = pts1[2 * i] + a[1] = pts1[2 * i + 1] + + b[0] = pts1[2 * ((i + 1) % 4)] + b[1] = pts1[2 * ((i + 1) % 4) + 1] + + c[0] = pts2[2 * j] + c[1] = pts2[2 * j + 1] + + d[0] = pts2[2 * ((j + 1) % 4)] + d[1] = pts2[2 * ((j + 1) % 4) + 1] + ba0 = b[0] - a[0] + ba1 = b[1] - a[1] + da0 = d[0] - a[0] + ca0 = c[0] - a[0] + da1 = d[1] - a[1] + ca1 = c[1] - a[1] + acd = da1 * ca0 > ca1 * da0 + bcd = (d[1] - b[1]) * (c[0] - b[0]) > (c[1] - b[1]) * (d[0] - b[0]) + if acd != bcd: + abc = ca1 * ba0 > ba1 * ca0 + abd = da1 * ba0 > ba1 * da0 + if abc != abd: + dc0 = d[0] - c[0] + dc1 = d[1] - c[1] + abba = a[0] * b[1] - b[0] * a[1] + cddc = c[0] * d[1] - d[0] * c[1] + dh = ba1 * dc0 - ba0 * dc1 + dx = abba * dc0 - ba0 * cddc + dy = abba * dc1 - ba1 * cddc + temp_pts[0] = dx / dh + temp_pts[1] = dy / dh + return True + return False + + +@cuda.jit("(float32[:], float32[:], int32, int32, float32[:])", device=True, inline=True) +def line_segment_intersection_v1( + pts1: cuda.local.array, # array of points representing the first line segment + pts2: cuda.local.array, # array of points representing the second line segment + i: int, # index of the first line segment + j: int, # index of the second line segment + temp_pts: cuda.local.array, # array to store the intersection point +) -> bool: + """Check if two line segments intersect and find the intersection point using an alternative method. + + Args: + pts1(cuda.local.array): array of points representing the first line segment + pts2(cuda.local.array): cuda.local.array, array of points representing the second line segment + i(int): int, index of the first line segment + j(int): int, index of the second line segment + temp_pts(cuda.local.array): array to store the intersection point + + Returns: + bool: True if the line segments intersect, False otherwise + """ + a = cuda.local.array((2,), dtype=numba.float32) + b = cuda.local.array((2,), dtype=numba.float32) + c = cuda.local.array((2,), dtype=numba.float32) + d = cuda.local.array((2,), dtype=numba.float32) + + a[0] = pts1[2 * i] + a[1] = pts1[2 * i + 1] + + b[0] = pts1[2 * ((i + 1) % 4)] + b[1] = pts1[2 * ((i + 1) % 4) + 1] + + c[0] = pts2[2 * j] + c[1] = pts2[2 * j + 1] + + d[0] = pts2[2 * ((j + 1) % 4)] + d[1] = pts2[2 * ((j + 1) % 4) + 1] + + area_abc = trangle_area(a, b, c) + area_abd = trangle_area(a, b, d) + + if area_abc * area_abd >= 0: + return False + + area_cda = trangle_area(c, d, a) + area_cdb = area_cda + area_abc - area_abd + + if area_cda * area_cdb >= 0: + return False + t = area_cda / (area_abd - area_abc) + + dx = t * (b[0] - a[0]) + dy = t * (b[1] - a[1]) + temp_pts[0] = a[0] + dx + temp_pts[1] = a[1] + dy + return True + + +@cuda.jit("(float32, float32, float32[:])", device=True, inline=True) +def point_in_quadrilateral( + pt_x: float, # x coordinate of the point + pt_y: float, # y coordinate of the point + corners: cuda.local.array, # corners of the quadrilateral, shape (8,) +) -> bool: + """Check if a point is inside a quadrilateral. 
+ + Args: + pt_x (float): x coordinate of the point + pt_y (float): y coordinate of the point + corners (cuda.local.array): shape (8,), corners of the quadrilateral + + Returns: + bool: True if the point is inside the quadrilateral, False otherwise + """ + ab0 = corners[2] - corners[0] + ab1 = corners[3] - corners[1] + + ad0 = corners[6] - corners[0] + ad1 = corners[7] - corners[1] + + ap0 = pt_x - corners[0] + ap1 = pt_y - corners[1] + + abab = ab0 * ab0 + ab1 * ab1 + abap = ab0 * ap0 + ab1 * ap1 + adad = ad0 * ad0 + ad1 * ad1 + adap = ad0 * ap0 + ad1 * ap1 + + return abab >= abap and abap >= 0 and adad >= adap and adap >= 0 + + +@cuda.jit("(float32[:], float32[:], float32[:])", device=True, inline=True) +def quadrilateral_intersection( + pts1: cuda.local.array, # shape: (8,) + pts2: cuda.local.array, # shape: (8,) + int_pts: cuda.local.array, # shape: (16,) +) -> int: + """Compute the intersection points between two quadrilaterals. + + Args: + pts1(cuda.local.array): Array of points representing the first quadrilateral, shape (8,). + pts2(cuda.local.array): Array of points representing the second quadrilateral, shape (8,). + int_pts(cuda.local.array): Array to store the intersection points, shape (16,). + + Returns: + int: Number of intersection points. + """ + num_of_inter = 0 + for i in range(4): + if point_in_quadrilateral(pts1[2 * i], pts1[2 * i + 1], pts2): + int_pts[num_of_inter * 2] = pts1[2 * i] + int_pts[num_of_inter * 2 + 1] = pts1[2 * i + 1] + num_of_inter += 1 + if point_in_quadrilateral(pts2[2 * i], pts2[2 * i + 1], pts1): + int_pts[num_of_inter * 2] = pts2[2 * i] + int_pts[num_of_inter * 2 + 1] = pts2[2 * i + 1] + num_of_inter += 1 + temp_pts = cuda.local.array((2,), dtype=numba.float32) + for i in range(4): + for j in range(4): + has_pts = line_segment_intersection(pts1, pts2, i, j, temp_pts) + if has_pts: + int_pts[num_of_inter * 2] = temp_pts[0] + int_pts[num_of_inter * 2 + 1] = temp_pts[1] + num_of_inter += 1 + + return num_of_inter + + +@cuda.jit("(float32[:], float32[:])", device=True, inline=True) +def rbbox_to_corners( + corners: cuda.local.array, # shape: (8,) + rbbox: cuda.local.array, # shape: (5,) +) -> None: + """Convert a rotated bounding box to its corner points. + + Args: + corners (cuda.local.array): Array to store the corner points, shape (8,). + rbbox (cuda.local.array): Array representing the rotated bounding box, shape (5,). + The rotated bounding box is represented by (center_x, center_y, width, height, angle). + + Returns: + None + """ + # generate clockwise corners and rotate it clockwise + angle = rbbox[4] + a_cos = math.cos(angle) + a_sin = math.sin(angle) + center_x = rbbox[0] + center_y = rbbox[1] + x_d = rbbox[2] + y_d = rbbox[3] + corners_x = cuda.local.array((4,), dtype=numba.float32) + corners_y = cuda.local.array((4,), dtype=numba.float32) + corners_x[0] = -x_d / 2 + corners_x[1] = -x_d / 2 + corners_x[2] = x_d / 2 + corners_x[3] = x_d / 2 + corners_y[0] = -y_d / 2 + corners_y[1] = y_d / 2 + corners_y[2] = y_d / 2 + corners_y[3] = -y_d / 2 + for i in range(4): + corners[2 * i] = a_cos * corners_x[i] + a_sin * corners_y[i] + center_x + corners[2 * i + 1] = -a_sin * corners_x[i] + a_cos * corners_y[i] + center_y + + +@cuda.jit("(float32[:], float32[:])", device=True, inline=True) +def inter( + rbbox1: cuda.local.array, # shape: (5,) + rbbox2: cuda.local.array, # shape: (5,) +) -> float: # The intersection area of the two rotated bounding boxes. + """Calculate the intersection area of two rotated bounding boxes. 
+ + Args: + rbbox1 (ndarray): Array representing the first rotated bounding box. + The rotated bounding box is represented by (center_x, center_y, width, height, angle). + rbbox2 (ndarray): Array representing the second rotated bounding box. + The rotated bounding box is represented by (center_x, center_y, width, height, angle). + + Returns: + float: The intersection area of the two rotated bounding boxes. + """ + corners1 = cuda.local.array((8,), dtype=numba.float32) + corners2 = cuda.local.array((8,), dtype=numba.float32) + intersection_corners = cuda.local.array((16,), dtype=numba.float32) + + rbbox_to_corners(corners1, rbbox1) + rbbox_to_corners(corners2, rbbox2) + + num_intersection = quadrilateral_intersection(corners1, corners2, intersection_corners) + sort_vertex_in_convex_polygon(intersection_corners, num_intersection) + # print(intersection_corners.reshape([-1, 2])[:num_intersection]) + + return area(intersection_corners, num_intersection) + + +@cuda.jit("(float32[:], float32[:], int32)", device=True, inline=True) +def dev_rotate_iou_eval( + rbox1: cuda.shared.array, # shape: (5,) + rbox2: cuda.shared.array, # shape: (5,) + criterion: int = -1, # IoU criterion to use. Defaults to -1. +) -> float: # The IoU of the two rotated bounding boxes. + """Calculate the IoU of two rotated bounding boxes. + + Args: + rbox1 (cuda.shared.array): Array representing the first rotated bounding box. + The rotated bounding box is represented by (center_x, center_y, width, height, angle). + rbox2 (cuda.shared.array): Array representing the second rotated bounding box. + The rotated bounding box is represented by (center_x, center_y, width, height, angle). + criterion (int): The method to calculate the IoU. + -1: Calculate the IoU. + 0: Calculate the IoU with first box as the reference. + 1: Calculate the IoU with second box as the reference. + + Returns: + float: The IoU of the two rotated bounding boxes. + """ + area1 = rbox1[2] * rbox1[3] + area2 = rbox2[2] * rbox2[3] + area_inter = inter(rbox1, rbox2) + if criterion == -1: + return area_inter / (area1 + area2 - area_inter) + if criterion == 0: + return area_inter / area1 + if criterion == 1: + return area_inter / area2 + return area_inter + + +@cuda.jit("(int64, int64, float32[:], float32[:], float32[:], int32)", fastmath=False) +def rotate_iou_kernel_eval( + n: int, + k: int, + dev_boxes: cuda.shared.array, + dev_query_boxes: cuda.shared.array, + dev_iou: cuda.shared.array, + criterion: int = -1, +) -> None: + """Calculate the IoU of two rotated bounding boxes. + + Args: + N (int): Number of boxes. + K (int): Number of query boxes. + dev_boxes (cuda.shared.array): Array representing the boxes. + dev_query_boxes (cuda.shared.array): Array representing the query boxes. + dev_iou (cuda.shared.array): Array to store the IoU values. + criterion (int): The method to calculate the IoU. + -1: Calculate the IoU. + 0: Calculate the IoU with the first box as the reference. + 1: Calculate the IoU with the second box as the reference. 
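+
+    Note:
+        Each thread block caches up to 64 boxes and 64 query boxes in shared memory and
+        fills one 64x64 tile of the flattened (n, k) IoU matrix, so the kernel is expected
+        to be launched on a 2D grid of (div_up(n, 64), div_up(k, 64)) blocks with 64 threads
+        each, as done by rotate_iou_eval_gpu below. A small sizing sketch (illustrative
+        numbers only):
+
+            blockspergrid = (div_up(100, 64), div_up(130, 64))  # (2, 3) for n=100, k=130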
+ + """ + threads_per_block = 8 * 8 + row_start = cuda.blockIdx.x + col_start = cuda.blockIdx.y + tx = cuda.threadIdx.x + row_size = min(n - row_start * threads_per_block, threads_per_block) + col_size = min(k - col_start * threads_per_block, threads_per_block) + block_boxes = cuda.shared.array(shape=(64 * 5,), dtype=numba.float32) + block_qboxes = cuda.shared.array(shape=(64 * 5,), dtype=numba.float32) + + dev_query_box_idx = threads_per_block * col_start + tx + dev_box_idx = threads_per_block * row_start + tx + if tx < col_size: + block_qboxes[tx * 5 + 0] = dev_query_boxes[dev_query_box_idx * 5 + 0] + block_qboxes[tx * 5 + 1] = dev_query_boxes[dev_query_box_idx * 5 + 1] + block_qboxes[tx * 5 + 2] = dev_query_boxes[dev_query_box_idx * 5 + 2] + block_qboxes[tx * 5 + 3] = dev_query_boxes[dev_query_box_idx * 5 + 3] + block_qboxes[tx * 5 + 4] = dev_query_boxes[dev_query_box_idx * 5 + 4] + if tx < row_size: + block_boxes[tx * 5 + 0] = dev_boxes[dev_box_idx * 5 + 0] + block_boxes[tx * 5 + 1] = dev_boxes[dev_box_idx * 5 + 1] + block_boxes[tx * 5 + 2] = dev_boxes[dev_box_idx * 5 + 2] + block_boxes[tx * 5 + 3] = dev_boxes[dev_box_idx * 5 + 3] + block_boxes[tx * 5 + 4] = dev_boxes[dev_box_idx * 5 + 4] + cuda.syncthreads() + if tx < row_size: + for i in range(col_size): + offset = row_start * threads_per_block * k + col_start * threads_per_block + tx * k + i + dev_iou[offset] = dev_rotate_iou_eval( + block_qboxes[i * 5 : i * 5 + 5], + block_boxes[tx * 5 : tx * 5 + 5], + criterion, + ) + + +def rotate_iou_eval_gpu( + boxes: np.ndarray, # shape: (n, 5) + query_boxes: np.ndarray, # shape: (k, 5) + criterion: int = -1, # IoU criterion to use. Defaults to -1. + device_id: int = 0, +) -> np.ndarray: # shape: (n, k) + """Compute the rotated box IoU between two sets of boxes on CPU. + + Args: + boxes (ndarray): Array of shape (n, 5) representing n rotated boxes. + Each box is represented by (center_x, center_y, width, height, angle). + query_boxes (ndarray): Array of shape (k, 5) representing k query rotated boxes. + Each query box is represented by (center_x, center_y, width, height, angle). + criterion (int, optional): IoU criterion to use. Defaults to -1. + + Returns: + ndarray: Array of shape (n, k) representing the IoU between each pair of boxes. 
+ """ + boxes = boxes.astype(np.float32) + query_boxes = query_boxes.astype(np.float32) + n = boxes.shape[0] + k = query_boxes.shape[0] + iou = np.zeros((n, k), dtype=np.float32) + if n == 0 or k == 0: + return iou + threads_per_block = 8 * 8 + cuda.select_device(device_id) + blockspergrid = (div_up(n, threads_per_block), div_up(k, threads_per_block)) + + stream = cuda.stream() + with stream.auto_synchronize(): + boxes_dev = cuda.to_device(boxes.reshape([-1]), stream) + query_boxes_dev = cuda.to_device(query_boxes.reshape([-1]), stream) + iou_dev = cuda.to_device(iou.reshape([-1]), stream) + rotate_iou_kernel_eval[blockspergrid, threads_per_block, stream]( + n, + k, + boxes_dev, + query_boxes_dev, + iou_dev, + criterion, + ) + iou_dev.copy_to_host(iou.reshape([-1]), stream=stream) + return iou.astype(boxes.dtype) diff --git a/src/otx/core/metrics/kitti_3d_eval/rotate_iou.py b/src/otx/core/metrics/kitti_3d_eval/rotate_iou.py new file mode 100644 index 00000000000..3458b6e1261 --- /dev/null +++ b/src/otx/core/metrics/kitti_3d_eval/rotate_iou.py @@ -0,0 +1,429 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""Rotate IoU for KITTI3D metric, cpu version.""" + +import math + +import numba +import numpy as np + + +@numba.jit(nopython=True) +def div_up(m: int, n: int) -> int: + """Divide m by n and round up to the nearest integer. + + Args: + m (int): Numerator. + n (int): Denominator. + + Returns: + int: Result of the division rounded up to the nearest integer. + """ + return m // n + (m % n > 0) + + +@numba.jit(nopython=True, inline="always") +def trangle_area(a: np.ndarray, b: np.ndarray, c: np.ndarray) -> float: + """Calculate the area of a triangle given its three vertices. + + Args: + a (ndarray): First vertex of the triangle. + b (ndarray): Second vertex of the triangle. + c (ndarray): Third vertex of the triangle. + + Returns: + float: Area of the triangle. + """ + return ((a[0] - c[0]) * (b[1] - c[1]) - (a[1] - c[1]) * (b[0] - c[0])) / 2.0 + + +@numba.jit(nopython=True, inline="always") +def area(int_pts: np.ndarray, num_of_inter: int) -> float: + """Calculate the area of a polygon using the given intersection points. + + Args: + int_pts (ndarray): Array of intersection points, shape (num_of_inter * 2,). + num_of_inter (int): Number of intersection points. + + Returns: + float: The calculated area of the polygon. + """ + area_val: float = 0.0 + for i in range(num_of_inter - 2): + area_val += abs( + trangle_area( + int_pts[:2], + int_pts[2 * i + 2 : 2 * i + 4], + int_pts[2 * i + 4 : 2 * i + 6], + ), + ) + return area_val + + +@numba.jit(nopython=True, inline="always") +def sort_vertex_in_convex_polygon(int_pts: np.ndarray, num_of_inter: int) -> None: + """Sort the vertices of a convex polygon in counterclockwise order. + + Args: + int_pts: Array of intersection points. + num_of_inter: Number of intersection points. 
+ """ + if num_of_inter > 0: + center = np.empty((2,), dtype=np.float32) + center[:] = 0.0 + for i in range(num_of_inter): + center[0] += int_pts[2 * i] + center[1] += int_pts[2 * i + 1] + center[0] /= num_of_inter + center[1] /= num_of_inter + v = np.empty((2,), dtype=np.float32) + vs = np.empty((16,), dtype=np.float32) + for i in range(num_of_inter): + v[0] = int_pts[2 * i] - center[0] + v[1] = int_pts[2 * i + 1] - center[1] + d = math.sqrt(v[0] * v[0] + v[1] * v[1]) + v[0] = v[0] / d + v[1] = v[1] / d + if v[1] < 0: + v[0] = -2 - v[0] + vs[i] = v[0] + j = 0 + temp = 0 + for i in range(1, num_of_inter): + if vs[i - 1] > vs[i]: + temp = vs[i] + tx = int_pts[2 * i] + ty = int_pts[2 * i + 1] + j = i + while j > 0 and vs[j - 1] > temp: + vs[j] = vs[j - 1] + int_pts[j * 2] = int_pts[j * 2 - 2] + int_pts[j * 2 + 1] = int_pts[j * 2 - 1] + j -= 1 + + vs[j] = temp + int_pts[j * 2] = tx + int_pts[j * 2 + 1] = ty + + +@numba.jit(nopython=True, inline="always") +def line_segment_intersection( + pts1: np.ndarray, # array of points representing the first line segment + pts2: np.ndarray, # array of points representing the second line segment + i: int, # index of the first line segment + j: int, # index of the second line segment + temp_pts: np.ndarray, # array to store the intersection point +) -> bool: + """Check if two line segments intersect and find the intersection point. + + Args: + pts1 (ndarray): Array of points representing the first line segment. + pts2 (ndarray): Array of points representing the second line segment. + i (int): Index of the first line segment. + j (int): Index of the second line segment. + temp_pts (ndarray): Array to store the intersection point. + + Returns: + bool: True if the line segments intersect, False otherwise. + """ + a = np.empty((2,), dtype=np.float32) + b = np.empty((2,), dtype=np.float32) + c = np.empty((2,), dtype=np.float32) + d = np.empty((2,), dtype=np.float32) + + a[0] = pts1[2 * i] + a[1] = pts1[2 * i + 1] + + b[0] = pts1[2 * ((i + 1) % 4)] + b[1] = pts1[2 * ((i + 1) % 4) + 1] + + c[0] = pts2[2 * j] + c[1] = pts2[2 * j + 1] + + d[0] = pts2[2 * ((j + 1) % 4)] + d[1] = pts2[2 * ((j + 1) % 4) + 1] + + ba0 = b[0] - a[0] + ba1 = b[1] - a[1] + da0 = d[0] - a[0] + ca0 = c[0] - a[0] + da1 = d[1] - a[1] + ca1 = c[1] - a[1] + + acd = da1 * ca0 > ca1 * da0 + bcd = (d[1] - b[1]) * (c[0] - b[0]) > (c[1] - b[1]) * (d[0] - b[0]) + if acd != bcd: + abc = ca1 * ba0 > ba1 * ca0 + abd = da1 * ba0 > ba1 * da0 + if abc != abd: + dc0 = d[0] - c[0] + dc1 = d[1] - c[1] + abba = a[0] * b[1] - b[0] * a[1] + cddc = c[0] * d[1] - d[0] * c[1] + dh = ba1 * dc0 - ba0 * dc1 + dx = abba * dc0 - ba0 * cddc + dy = abba * dc1 - ba1 * cddc + temp_pts[0] = dx / dh + temp_pts[1] = dy / dh + return True + return False + + +@numba.jit(nopython=True, inline="always") +def line_segment_intersection_v1( + pts1: np.ndarray, # array of points representing the first line segment + pts2: np.ndarray, # array of points representing the second line segment + i: int, # index of the first line segment + j: int, # index of the second line segment + temp_pts: np.ndarray, # array to store the intersection point +) -> bool: + """Check if two line segments intersect and find the intersection point using an alternative method. 
+ + Args: + pts1: ndarray, array of points representing the first line segment + pts2: ndarray, array of points representing the second line segment + i: int, index of the first line segment + j: int, index of the second line segment + temp_pts: ndarray, array to store the intersection point + + Returns: + bool: True if the line segments intersect, False otherwise + """ + a = np.empty((2,), dtype=np.float32) + b = np.empty((2,), dtype=np.float32) + c = np.empty((2,), dtype=np.float32) + d = np.empty((2,), dtype=np.float32) + + a[0] = pts1[2 * i] + a[1] = pts1[2 * i + 1] + + b[0] = pts1[2 * ((i + 1) % 4)] + b[1] = pts1[2 * ((i + 1) % 4) + 1] + + c[0] = pts2[2 * j] + c[1] = pts2[2 * j + 1] + + d[0] = pts2[2 * ((j + 1) % 4)] + d[1] = pts2[2 * ((j + 1) % 4) + 1] + + area_abc = trangle_area(a, b, c) + area_abd = trangle_area(a, b, d) + + if area_abc * area_abd >= 0: + return False + + area_cda = trangle_area(c, d, a) + area_cdb = area_cda + area_abc - area_abd + + if area_cda * area_cdb >= 0: + return False + t = area_cda / (area_abd - area_abc) + + dx = t * (b[0] - a[0]) + dy = t * (b[1] - a[1]) + temp_pts[0] = a[0] + dx + temp_pts[1] = a[1] + dy + return True + + +@numba.jit(nopython=True, inline="always") +def point_in_quadrilateral( + pt_x: float, # x coordinate of the point + pt_y: float, # y coordinate of the point + corners: np.ndarray, # corners of the quadrilateral, shape (8,) +) -> bool: + """Check if a point is inside a quadrilateral. + + Args: + pt_x: float, x coordinate of the point + pt_y: float, y coordinate of the point + corners: ndarray, shape (8,), corners of the quadrilateral + + Returns: + bool: True if the point is inside the quadrilateral, False otherwise + """ + ab0 = corners[2] - corners[0] + ab1 = corners[3] - corners[1] + + ad0 = corners[6] - corners[0] + ad1 = corners[7] - corners[1] + + ap0 = pt_x - corners[0] + ap1 = pt_y - corners[1] + + abab = ab0 * ab0 + ab1 * ab1 + abap = ab0 * ap0 + ab1 * ap1 + adad = ad0 * ad0 + ad1 * ad1 + adap = ad0 * ap0 + ad1 * ap1 + + return abab >= abap and abap >= 0 and adad >= adap and adap >= 0 + + +@numba.jit(nopython=True, inline="always") +def quadrilateral_intersection( + pts1: np.ndarray, # shape: (8,) + pts2: np.ndarray, # shape: (8,) + int_pts: np.ndarray, # shape: (16,) +) -> int: + """Compute the intersection points between two quadrilaterals. + + Args: + pts1: Array of points representing the first quadrilateral, shape (8,). + pts2: Array of points representing the second quadrilateral, shape (8,). + int_pts: Array to store the intersection points, shape (16,). + + Returns: + int: Number of intersection points. + """ + num_of_inter = 0 + for i in range(4): + if point_in_quadrilateral(pts1[2 * i], pts1[2 * i + 1], pts2): + int_pts[num_of_inter * 2] = pts1[2 * i] + int_pts[num_of_inter * 2 + 1] = pts1[2 * i + 1] + num_of_inter += 1 + if point_in_quadrilateral(pts2[2 * i], pts2[2 * i + 1], pts1): + int_pts[num_of_inter * 2] = pts2[2 * i] + int_pts[num_of_inter * 2 + 1] = pts2[2 * i + 1] + num_of_inter += 1 + temp_pts = np.empty((2,), dtype=np.float32) + for i in range(4): + for j in range(4): + has_pts = line_segment_intersection(pts1, pts2, i, j, temp_pts) + if has_pts: + int_pts[num_of_inter * 2] = temp_pts[0] + int_pts[num_of_inter * 2 + 1] = temp_pts[1] + num_of_inter += 1 + + return num_of_inter + + +@numba.jit(nopython=True, inline="always") +def rbbox_to_corners( + corners: np.ndarray, # shape: (8,) + rbbox: np.ndarray, # shape: (5,) +) -> None: + """Convert a rotated bounding box to its corner points. 
+ + Args: + corners (ndarray): Array to store the corner points, shape (8,). + rbbox (ndarray): Array representing the rotated bounding box, shape (5,). + The rotated bounding box is represented by (center_x, center_y, width, height, angle). + + Returns: + None + """ + # generate clockwise corners and rotate it clockwise + angle = rbbox[4] + a_cos = math.cos(angle) + a_sin = math.sin(angle) + center_x = rbbox[0] + center_y = rbbox[1] + x_d = rbbox[2] + y_d = rbbox[3] + corners_x = np.empty((4,), dtype=np.float32) + corners_y = np.empty((4,), dtype=np.float32) + corners_x[0] = -x_d / 2 + corners_x[1] = -x_d / 2 + corners_x[2] = x_d / 2 + corners_x[3] = x_d / 2 + corners_y[0] = -y_d / 2 + corners_y[1] = y_d / 2 + corners_y[2] = y_d / 2 + corners_y[3] = -y_d / 2 + for i in range(4): + corners[2 * i] = a_cos * corners_x[i] + a_sin * corners_y[i] + center_x + corners[2 * i + 1] = -a_sin * corners_x[i] + a_cos * corners_y[i] + center_y + + +@numba.jit(nopython=True, inline="always") +def inter( + rbbox1: np.ndarray, # shape: (5,) + rbbox2: np.ndarray, # shape: (5,) +) -> float: # The intersection area of the two rotated bounding boxes. + """Calculate the intersection area of two rotated bounding boxes. + + Args: + rbbox1 (ndarray): Array representing the first rotated bounding box. + The rotated bounding box is represented by (center_x, center_y, width, height, angle). + rbbox2 (ndarray): Array representing the second rotated bounding box. + The rotated bounding box is represented by (center_x, center_y, width, height, angle). + + Returns: + float: The intersection area of the two rotated bounding boxes. + """ + corners1 = np.empty((8,), dtype=np.float32) + corners2 = np.empty((8,), dtype=np.float32) + intersection_corners = np.empty((16,), dtype=np.float32) + + rbbox_to_corners(corners1, rbbox1) + rbbox_to_corners(corners2, rbbox2) + + num_intersection = quadrilateral_intersection(corners1, corners2, intersection_corners) + sort_vertex_in_convex_polygon(intersection_corners, num_intersection) + + return area(intersection_corners, num_intersection) + + +@numba.jit(nopython=True, inline="always") +def dev_rotate_iou_eval( + rbox1: np.ndarray, # shape: (5,) + rbox2: np.ndarray, # shape: (5,) + criterion: int = -1, # IoU criterion to use. Defaults to -1. +) -> float: # The IoU of the two rotated bounding boxes. + """Calculate the IoU of two rotated bounding boxes. + + Args: + rbox1 (ndarray): Array representing the first rotated bounding box. + The rotated bounding box is represented by (center_x, center_y, width, height, angle). + rbox2 (ndarray): Array representing the second rotated bounding box. + The rotated bounding box is represented by (center_x, center_y, width, height, angle). + criterion (int): The method to calculate the IoU. + -1: Calculate the IoU. + 0: Calculate the IoU with first box as the reference. + 1: Calculate the IoU with second box as the reference. + + Returns: + float: The IoU of the two rotated bounding boxes. + """ + area1 = rbox1[2] * rbox1[3] + area2 = rbox2[2] * rbox2[3] + area_inter = inter(rbox1, rbox2) + if criterion == -1: + return area_inter / (area1 + area2 - area_inter) + if criterion == 0: + return area_inter / area1 + if criterion == 1: + return area_inter / area2 + return area_inter + + +@numba.jit(nopython=True, inline="always") +def rotate_iou_eval_cpu( + boxes: np.ndarray, # shape: (n, 5) + query_boxes: np.ndarray, # shape: (k, 5) + criterion: int = -1, # IoU criterion to use. Defaults to -1. 
+) -> np.ndarray: # shape: (n, k) + """Compute the rotated box IoU between two sets of boxes on CPU. + + Args: + boxes (ndarray): Array of shape (n, 5) representing n rotated boxes. + Each box is represented by (center_x, center_y, width, height, angle). + query_boxes (ndarray): Array of shape (k, 5) representing k query rotated boxes. + Each query box is represented by (center_x, center_y, width, height, angle). + criterion (int, optional): IoU criterion to use. Defaults to -1. + + Returns: + ndarray: Array of shape (n, k) representing the IoU between each pair of boxes. + """ + n = boxes.shape[0] + k = query_boxes.shape[0] + iou = np.zeros((n, k), dtype=np.float32) + if n == 0 or k == 0: + return iou + + for i in range(n): + for j in range(k): + iou[i, j] = dev_rotate_iou_eval(boxes[i], query_boxes[j], criterion) + + return iou diff --git a/src/otx/core/model/detection_3d.py b/src/otx/core/model/detection_3d.py new file mode 100644 index 00000000000..caa0d14090f --- /dev/null +++ b/src/otx/core/model/detection_3d.py @@ -0,0 +1,297 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""Class definition for 3d object detection model entity used in OTX.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import numpy as np +import torch +from torchvision.ops import box_convert + +from otx.algo.utils.mmengine_utils import load_checkpoint +from otx.core.data.dataset.utils.kitti_utils import class2angle +from otx.core.data.entity.base import ImageInfo +from otx.core.data.entity.object_detection_3d import Det3DBatchDataEntity, Det3DBatchPredEntity +from otx.core.metrics import MetricInput +from otx.core.metrics.average_precision_3d import KittiMetric +from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable, OTXModel +from otx.core.types.export import TaskLevelExportParameters + +if TYPE_CHECKING: + from lightning.pytorch.cli import LRSchedulerCallable, OptimizerCallable + from torch import nn + + from otx.core.metrics import MetricCallable + from otx.core.schedulers import LRSchedulerListCallable + from otx.core.types.label import LabelInfoTypes + + +class OTX3DDetectionModel(OTXModel[Det3DBatchDataEntity, Det3DBatchPredEntity]): + """Base class for the 3d detection models used in OTX.""" + + mean: tuple[float, float, float] + std: tuple[float, float, float] + load_from: str | None + + def __init__( + self, + label_info: LabelInfoTypes, + model_name: str, + input_size: tuple[int, int], + optimizer: OptimizerCallable = DefaultOptimizerCallable, + scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, + metric: MetricCallable = KittiMetric, + torch_compile: bool = False, + score_threshold: float = 0.1, + ) -> None: + """Initialize the 3d detection model.""" + self.model_name = model_name + self.score_threshold = score_threshold + super().__init__( + label_info=label_info, + input_size=input_size, + optimizer=optimizer, + scheduler=scheduler, + metric=metric, + torch_compile=torch_compile, + ) + + def _create_model(self) -> nn.Module: + """Creates the model.""" + detector = self._build_model(num_classes=self.label_info.num_classes) + if hasattr(detector, "init_weights"): + detector.init_weights() + self.classification_layers = self.get_classification_layers(prefix="model.") + if self.load_from is not None: + load_checkpoint(detector, self.load_from, map_location="cpu") + return detector + + @property + def _export_parameters(self) -> TaskLevelExportParameters: + """Defines 
parameters required to export a particular model implementation.""" + return super()._export_parameters.wrap( + model_type="ssd", + task_type="detection", + ) + + def _convert_pred_entity_to_compute_metric( + self, + preds: Det3DBatchPredEntity, + inputs: Det3DBatchDataEntity, + ) -> MetricInput: + """Converts the prediction entity to the format required for computing metrics. + + Args: + preds (Det3DBatchPredEntity): Prediction entity. + inputs (Det3DBatchDataEntity): Input data entity. + """ + boxes = preds.boxes_3d + # bbox 2d decoding + xywh_2d = box_convert(preds.boxes, "xyxy", "cxcywh") + + xs3d = boxes[:, :, 0:1] + ys3d = boxes[:, :, 1:2] + xs2d = xywh_2d[:, :, 0:1] + ys2d = xywh_2d[:, :, 1:2] + + batch = len(boxes) + labels = preds.labels.view(batch, -1, 1) + scores = preds.scores.view(batch, -1, 1) + xs2d = xs2d.view(batch, -1, 1) + ys2d = ys2d.view(batch, -1, 1) + xs3d = xs3d.view(batch, -1, 1) + ys3d = ys3d.view(batch, -1, 1) + + detections = ( + torch.cat( + [ + labels, + scores, + xs2d, + ys2d, + preds.size_2d, + preds.depth[:, :, 0:1], + preds.heading_angle, + preds.size_3d, + xs3d, + ys3d, + torch.exp(-preds.depth[:, :, 1:2]), + ], + dim=2, + ) + .detach() + .cpu() + .numpy() + ) + + img_sizes = np.array([img_info.ori_shape for img_info in inputs.imgs_info]) + calib_matrix = [p2.detach().cpu().numpy() for p2 in inputs.calib_matrix] + result_list = self._decode_detections_for_kitti_format( + detections, + img_sizes, + calib_matrix, + class_names=self.label_info.label_names, + threshold=self.score_threshold, + ) + + return { + "preds": result_list, + "target": inputs.original_kitti_format, # type: ignore[dict-item] + } + + @staticmethod + def _decode_detections_for_kitti_format( + dets: np.ndarray, + img_size: np.ndarray, + calib_matrix: list[np.ndarray], + class_names: list[str], + threshold: float = 0.2, + ) -> list[dict[str, np.ndarray]]: + """Decode the detection results for KITTI format.""" + + def _get_heading_angle(heading: np.ndarray) -> np.ndarray: + """Get heading angle from the prediction.""" + heading_bin, heading_res = heading[0:12], heading[12:24] + cls = np.argmax(heading_bin) + res = heading_res[cls] + return class2angle(cls, res, to_label_format=True) + + def _alpha2ry(calib_matrix: np.ndarray, alpha: np.ndarray, u: np.ndarray) -> np.ndarray: + """Get rotation_y by alpha + theta - 180.""" + cu = calib_matrix[0, 2] + fu = calib_matrix[0, 0] + + ry = alpha + np.arctan2(u - cu, fu) + + if ry > np.pi: + ry -= 2 * np.pi + if ry < -np.pi: + ry += 2 * np.pi + + return ry + + def _img_to_rect(calib_matrix: np.ndarray, u: np.ndarray, v: np.ndarray, depth_rect: np.ndarray) -> np.ndarray: + """Transform image coordinates to the rectangle coordinates.""" + cu = calib_matrix[0, 2] + cv = calib_matrix[1, 2] + fu = calib_matrix[0, 0] + fv = calib_matrix[1, 1] + tx = calib_matrix[0, 3] / (-fu) + ty = calib_matrix[1, 3] / (-fv) + + x = ((u - cu) * depth_rect) / fu + tx + y = ((v - cv) * depth_rect) / fv + ty + return np.concatenate((x.reshape(-1, 1), y.reshape(-1, 1), depth_rect.reshape(-1, 1)), axis=1) + + results = [] + for i in range(dets.shape[0]): # batch + names = [] + alphas = [] + bboxes = [] + dimensions = [] + locations = [] + rotation_y = [] + scores = [] + + for j in range(dets.shape[1]): # max_dets + cls_id = int(dets[i, j, 0]) + score = dets[i, j, 1] + if score < threshold: + continue + + # 2d bboxs decoding + x = dets[i, j, 2] * img_size[i][0] + y = dets[i, j, 3] * img_size[i][1] + w = dets[i, j, 4] * img_size[i][0] + h = dets[i, j, 5] * img_size[i][1] + bbox = 
[x - w / 2, y - h / 2, x + w / 2, y + h / 2] + + # 3d bboxs decoding + # depth decoding + depth = dets[i, j, 6] + + # dimensions decoding + dimension = dets[i, j, 31:34] + + # positions decoding + x3d = dets[i, j, 34] * img_size[i][0] + y3d = dets[i, j, 35] * img_size[i][1] + location = _img_to_rect(calib_matrix[i], x3d, y3d, depth).reshape(-1) + location[1] += dimension[0] / 2 + + # heading angle decoding + alpha = dets[i, j, 7:31] + alpha = _get_heading_angle(dets[i, j, 7:31]) + ry = _alpha2ry(calib_matrix[i], alpha, x) + + names.append(class_names[cls_id]) + alphas.append(alpha) + bboxes.append(bbox) + dimensions.append(np.array([dimension[2], dimension[0], dimension[1]])) + locations.append(location) + rotation_y.append(ry) + scores.append(score) + + results.append( + { + "name": np.array(names), + "alpha": np.array(alphas), + "bbox": np.array(bboxes).reshape(-1, 4), + "dimensions": np.array(dimensions).reshape(-1, 3), + "location": np.array(locations).reshape(-1, 3), + "rotation_y": np.array(rotation_y), + "score": np.array(scores), + }, + ) + + return results + + def get_dummy_input(self, batch_size: int = 1) -> Det3DBatchDataEntity: + """Returns a dummy input for 3d object detection model.""" + if self.input_size is None: + msg = f"Input size attribute is not set for {self.__class__}" + raise ValueError(msg) + + images = [torch.rand(3, *self.input_size) for _ in range(batch_size)] + calib_matrix = [torch.rand(3, 4) for _ in range(batch_size)] + infos = [] + for i, img in enumerate(images): + infos.append( + ImageInfo( + img_idx=i, + img_shape=img.shape, + ori_shape=img.shape, + ), + ) + return Det3DBatchDataEntity( + batch_size, + images, + infos, + boxes=[], + labels=[], + calib_matrix=calib_matrix, + boxes_3d=[], + size_2d=[], + size_3d=[], + depth=[], + heading_angle=[], + original_kitti_format=[], + ) + + def get_classification_layers(self, prefix: str = "model.") -> dict[str, dict[str, int]]: + """Get final classification layer information for incremental learning case.""" + sample_model_dict = self._build_model(num_classes=5).state_dict() + incremental_model_dict = self._build_model(num_classes=6).state_dict() + + classification_layers = {} + for key in sample_model_dict: + if sample_model_dict[key].shape != incremental_model_dict[key].shape: + sample_model_dim = sample_model_dict[key].shape[0] + incremental_model_dim = incremental_model_dict[key].shape[0] + stride = incremental_model_dim - sample_model_dim + num_extra_classes = 6 * sample_model_dim - 5 * incremental_model_dim + classification_layers[prefix + key] = {"stride": stride, "num_extra_classes": num_extra_classes} + return classification_layers diff --git a/src/otx/core/types/task.py b/src/otx/core/types/task.py index ed5b893020a..ddfc10f33ab 100644 --- a/src/otx/core/types/task.py +++ b/src/otx/core/types/task.py @@ -31,6 +31,7 @@ class OTXTaskType(str, Enum): ROTATED_DETECTION = "ROTATED_DETECTION" DETECTION_SEMI_SL = "DETECTION_SEMI_SL" KEYPOINT_DETECTION = "KEYPOINT_DETECTION" + OBJECT_DETECTION_3D = "OBJECT_DETECTION_3D" # Segmentation INSTANCE_SEGMENTATION = "INSTANCE_SEGMENTATION" diff --git a/src/otx/engine/utils/auto_configurator.py b/src/otx/engine/utils/auto_configurator.py index 16fb530610c..79459b66f9a 100644 --- a/src/otx/engine/utils/auto_configurator.py +++ b/src/otx/engine/utils/auto_configurator.py @@ -48,6 +48,7 @@ OTXTaskType.VISUAL_PROMPTING: RECIPE_PATH / "visual_prompting" / "sam_tiny_vit.yaml", OTXTaskType.ZERO_SHOT_VISUAL_PROMPTING: RECIPE_PATH / "zero_shot_visual_prompting" / 
"sam_tiny_vit.yaml", OTXTaskType.KEYPOINT_DETECTION: RECIPE_PATH / "keypoint_detection" / "rtmpose_tiny.yaml", + OTXTaskType.OBJECT_DETECTION_3D: RECIPE_PATH / "object_detection_3d" / "monodetr3d.yaml", } TASK_PER_DATA_FORMAT = { diff --git a/src/otx/recipe/_base_/data/object_detection_3d.yaml b/src/otx/recipe/_base_/data/object_detection_3d.yaml new file mode 100644 index 00000000000..a7c773f1bcf --- /dev/null +++ b/src/otx/recipe/_base_/data/object_detection_3d.yaml @@ -0,0 +1,52 @@ +task: OBJECT_DETECTION_3D +input_size: + - 384 + - 1280 +mem_cache_size: 1GB +mem_cache_img_max_size: null +image_color_channel: RGB +stack_images: true +data_format: kitti3d +unannotated_items_ratio: 0.0 +train_subset: + subset_name: train + transform_lib_type: TORCHVISION + batch_size: 8 + num_workers: 4 + to_tv_image: false + transforms: + - class_path: torchvision.transforms.v2.Normalize + init_args: + mean: [123.675, 116.28, 103.53] + std: [58.395, 57.12, 57.375] + + sampler: + class_path: torch.utils.data.RandomSampler + +val_subset: + subset_name: val + transform_lib_type: TORCHVISION + batch_size: 16 + num_workers: 4 + to_tv_image: false + transforms: + - class_path: torchvision.transforms.v2.Normalize + init_args: + mean: [123.675, 116.28, 103.53] + std: [58.395, 57.12, 57.375] + sampler: + class_path: torch.utils.data.RandomSampler + +test_subset: + subset_name: test + transform_lib_type: TORCHVISION + batch_size: 16 + num_workers: 4 + to_tv_image: false + transforms: + - class_path: torchvision.transforms.v2.Normalize + init_args: + mean: [123.675, 116.28, 103.53] + std: [58.395, 57.12, 57.375] + sampler: + class_path: torch.utils.data.RandomSampler diff --git a/src/otx/recipe/object_detection_3d/monodetr3d.yaml b/src/otx/recipe/object_detection_3d/monodetr3d.yaml new file mode 100644 index 00000000000..032c71ffbf8 --- /dev/null +++ b/src/otx/recipe/object_detection_3d/monodetr3d.yaml @@ -0,0 +1,44 @@ +model: + class_path: otx.algo.object_detection_3d.monodetr3d.MonoDETR3D + init_args: + label_info: 17 + model_name: monodetr_50 + input_size: + - 384 + - 1280 + + optimizer: + class_path: torch.optim.AdamW + init_args: + lr: 0.0001 + betas: [0.9, 0.999] + weight_decay: 0.0001 + + scheduler: + class_path: lightning.pytorch.cli.ReduceLROnPlateau + init_args: + mode: max + factor: 0.1 + patience: 13 + monitor: val/mAP_bbox_2d + +engine: + task: OBJECT_DETECTION_3D + device: auto + +callback_monitor: val/mAP_bbox_3d + +data: ../_base_/data/object_detection_3d.yaml + +precision: 32 # MonoDETR do not support fp16 training +overrides: + callbacks: + - class_path: otx.algo.callbacks.adaptive_early_stopping.EarlyStoppingWithWarmup + init_args: + monitor: null + mode: max + patience: 15 + check_on_train_epoch_end: false + min_delta: 0.001 + warmup_iters: 30 + warmup_epochs: 30 diff --git a/tests/assets/kitti3d/calib/test/000023.txt b/tests/assets/kitti3d/calib/test/000023.txt new file mode 100755 index 00000000000..f8a223dbf17 --- /dev/null +++ b/tests/assets/kitti3d/calib/test/000023.txt @@ -0,0 +1,8 @@ +P0: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 0.000000000000e+00 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 0.000000000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 0.000000000000e+00 +P1: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 -3.875744000000e+02 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 0.000000000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 0.000000000000e+00 +P2: 7.215377000000e+02 0.000000000000e+00 
6.095593000000e+02 4.485728000000e+01 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 2.163791000000e-01 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 2.745884000000e-03 +P3: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 -3.395242000000e+02 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 2.199936000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 2.729905000000e-03 +R0_rect: 9.999239000000e-01 9.837760000000e-03 -7.445048000000e-03 -9.869795000000e-03 9.999421000000e-01 -4.278459000000e-03 7.402527000000e-03 4.351614000000e-03 9.999631000000e-01 +Tr_velo_to_cam: 7.533745000000e-03 -9.999714000000e-01 -6.166020000000e-04 -4.069766000000e-03 1.480249000000e-02 7.280733000000e-04 -9.998902000000e-01 -7.631618000000e-02 9.998621000000e-01 7.523790000000e-03 1.480755000000e-02 -2.717806000000e-01 +Tr_imu_to_velo: 9.999976000000e-01 7.553071000000e-04 -2.035826000000e-03 -8.086759000000e-01 -7.854027000000e-04 9.998898000000e-01 -1.482298000000e-02 3.195559000000e-01 2.024406000000e-03 1.482454000000e-02 9.998881000000e-01 -7.997231000000e-01 + diff --git a/tests/assets/kitti3d/calib/test/000025.txt b/tests/assets/kitti3d/calib/test/000025.txt new file mode 100755 index 00000000000..f8a223dbf17 --- /dev/null +++ b/tests/assets/kitti3d/calib/test/000025.txt @@ -0,0 +1,8 @@ +P0: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 0.000000000000e+00 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 0.000000000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 0.000000000000e+00 +P1: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 -3.875744000000e+02 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 0.000000000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 0.000000000000e+00 +P2: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 4.485728000000e+01 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 2.163791000000e-01 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 2.745884000000e-03 +P3: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 -3.395242000000e+02 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 2.199936000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 2.729905000000e-03 +R0_rect: 9.999239000000e-01 9.837760000000e-03 -7.445048000000e-03 -9.869795000000e-03 9.999421000000e-01 -4.278459000000e-03 7.402527000000e-03 4.351614000000e-03 9.999631000000e-01 +Tr_velo_to_cam: 7.533745000000e-03 -9.999714000000e-01 -6.166020000000e-04 -4.069766000000e-03 1.480249000000e-02 7.280733000000e-04 -9.998902000000e-01 -7.631618000000e-02 9.998621000000e-01 7.523790000000e-03 1.480755000000e-02 -2.717806000000e-01 +Tr_imu_to_velo: 9.999976000000e-01 7.553071000000e-04 -2.035826000000e-03 -8.086759000000e-01 -7.854027000000e-04 9.998898000000e-01 -1.482298000000e-02 3.195559000000e-01 2.024406000000e-03 1.482454000000e-02 9.998881000000e-01 -7.997231000000e-01 + diff --git a/tests/assets/kitti3d/calib/test/000037.txt b/tests/assets/kitti3d/calib/test/000037.txt new file mode 100755 index 00000000000..dd653b08224 --- /dev/null +++ b/tests/assets/kitti3d/calib/test/000037.txt @@ -0,0 +1,8 @@ +P0: 7.188560000000e+02 0.000000000000e+00 6.071928000000e+02 0.000000000000e+00 0.000000000000e+00 7.188560000000e+02 1.852157000000e+02 0.000000000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 0.000000000000e+00 +P1: 7.188560000000e+02 0.000000000000e+00 6.071928000000e+02 -3.861448000000e+02 0.000000000000e+00 7.188560000000e+02 
1.852157000000e+02 0.000000000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 0.000000000000e+00 +P2: 7.188560000000e+02 0.000000000000e+00 6.071928000000e+02 4.538225000000e+01 0.000000000000e+00 7.188560000000e+02 1.852157000000e+02 -1.130887000000e-01 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 3.779761000000e-03 +P3: 7.188560000000e+02 0.000000000000e+00 6.071928000000e+02 -3.372877000000e+02 0.000000000000e+00 7.188560000000e+02 1.852157000000e+02 2.369057000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 4.915215000000e-03 +R0_rect: 9.999454000000e-01 7.259129000000e-03 -7.519551000000e-03 -7.292213000000e-03 9.999638000000e-01 -4.381729000000e-03 7.487471000000e-03 4.436324000000e-03 9.999621000000e-01 +Tr_velo_to_cam: 7.967514000000e-03 -9.999679000000e-01 -8.462264000000e-04 -1.377769000000e-02 -2.771053000000e-03 8.241710000000e-04 -9.999958000000e-01 -5.542117000000e-02 9.999644000000e-01 7.969825000000e-03 -2.764397000000e-03 -2.918589000000e-01 +Tr_imu_to_velo: 9.999976000000e-01 7.553071000000e-04 -2.035826000000e-03 -8.086759000000e-01 -7.854027000000e-04 9.998898000000e-01 -1.482298000000e-02 3.195559000000e-01 2.024406000000e-03 1.482454000000e-02 9.998881000000e-01 -7.997231000000e-01 + diff --git a/tests/assets/kitti3d/calib/train/000003.txt b/tests/assets/kitti3d/calib/train/000003.txt new file mode 100755 index 00000000000..f8a223dbf17 --- /dev/null +++ b/tests/assets/kitti3d/calib/train/000003.txt @@ -0,0 +1,8 @@ +P0: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 0.000000000000e+00 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 0.000000000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 0.000000000000e+00 +P1: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 -3.875744000000e+02 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 0.000000000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 0.000000000000e+00 +P2: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 4.485728000000e+01 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 2.163791000000e-01 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 2.745884000000e-03 +P3: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 -3.395242000000e+02 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 2.199936000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 2.729905000000e-03 +R0_rect: 9.999239000000e-01 9.837760000000e-03 -7.445048000000e-03 -9.869795000000e-03 9.999421000000e-01 -4.278459000000e-03 7.402527000000e-03 4.351614000000e-03 9.999631000000e-01 +Tr_velo_to_cam: 7.533745000000e-03 -9.999714000000e-01 -6.166020000000e-04 -4.069766000000e-03 1.480249000000e-02 7.280733000000e-04 -9.998902000000e-01 -7.631618000000e-02 9.998621000000e-01 7.523790000000e-03 1.480755000000e-02 -2.717806000000e-01 +Tr_imu_to_velo: 9.999976000000e-01 7.553071000000e-04 -2.035826000000e-03 -8.086759000000e-01 -7.854027000000e-04 9.998898000000e-01 -1.482298000000e-02 3.195559000000e-01 2.024406000000e-03 1.482454000000e-02 9.998881000000e-01 -7.997231000000e-01 + diff --git a/tests/assets/kitti3d/calib/train/000011.txt b/tests/assets/kitti3d/calib/train/000011.txt new file mode 100755 index 00000000000..f8a223dbf17 --- /dev/null +++ b/tests/assets/kitti3d/calib/train/000011.txt @@ -0,0 +1,8 @@ +P0: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 0.000000000000e+00 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 0.000000000000e+00 0.000000000000e+00 
0.000000000000e+00 1.000000000000e+00 0.000000000000e+00 +P1: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 -3.875744000000e+02 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 0.000000000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 0.000000000000e+00 +P2: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 4.485728000000e+01 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 2.163791000000e-01 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 2.745884000000e-03 +P3: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 -3.395242000000e+02 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 2.199936000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 2.729905000000e-03 +R0_rect: 9.999239000000e-01 9.837760000000e-03 -7.445048000000e-03 -9.869795000000e-03 9.999421000000e-01 -4.278459000000e-03 7.402527000000e-03 4.351614000000e-03 9.999631000000e-01 +Tr_velo_to_cam: 7.533745000000e-03 -9.999714000000e-01 -6.166020000000e-04 -4.069766000000e-03 1.480249000000e-02 7.280733000000e-04 -9.998902000000e-01 -7.631618000000e-02 9.998621000000e-01 7.523790000000e-03 1.480755000000e-02 -2.717806000000e-01 +Tr_imu_to_velo: 9.999976000000e-01 7.553071000000e-04 -2.035826000000e-03 -8.086759000000e-01 -7.854027000000e-04 9.998898000000e-01 -1.482298000000e-02 3.195559000000e-01 2.024406000000e-03 1.482454000000e-02 9.998881000000e-01 -7.997231000000e-01 + diff --git a/tests/assets/kitti3d/calib/train/000036.txt b/tests/assets/kitti3d/calib/train/000036.txt new file mode 100755 index 00000000000..f8a223dbf17 --- /dev/null +++ b/tests/assets/kitti3d/calib/train/000036.txt @@ -0,0 +1,8 @@ +P0: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 0.000000000000e+00 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 0.000000000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 0.000000000000e+00 +P1: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 -3.875744000000e+02 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 0.000000000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 0.000000000000e+00 +P2: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 4.485728000000e+01 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 2.163791000000e-01 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 2.745884000000e-03 +P3: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 -3.395242000000e+02 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 2.199936000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 2.729905000000e-03 +R0_rect: 9.999239000000e-01 9.837760000000e-03 -7.445048000000e-03 -9.869795000000e-03 9.999421000000e-01 -4.278459000000e-03 7.402527000000e-03 4.351614000000e-03 9.999631000000e-01 +Tr_velo_to_cam: 7.533745000000e-03 -9.999714000000e-01 -6.166020000000e-04 -4.069766000000e-03 1.480249000000e-02 7.280733000000e-04 -9.998902000000e-01 -7.631618000000e-02 9.998621000000e-01 7.523790000000e-03 1.480755000000e-02 -2.717806000000e-01 +Tr_imu_to_velo: 9.999976000000e-01 7.553071000000e-04 -2.035826000000e-03 -8.086759000000e-01 -7.854027000000e-04 9.998898000000e-01 -1.482298000000e-02 3.195559000000e-01 2.024406000000e-03 1.482454000000e-02 9.998881000000e-01 -7.997231000000e-01 + diff --git a/tests/assets/kitti3d/calib/train/000046.txt b/tests/assets/kitti3d/calib/train/000046.txt new file mode 100755 index 00000000000..f8a223dbf17 --- /dev/null +++ b/tests/assets/kitti3d/calib/train/000046.txt @@ -0,0 +1,8 @@ +P0: 
7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 0.000000000000e+00 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 0.000000000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 0.000000000000e+00
+P1: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 -3.875744000000e+02 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 0.000000000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 0.000000000000e+00
+P2: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 4.485728000000e+01 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 2.163791000000e-01 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 2.745884000000e-03
+P3: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 -3.395242000000e+02 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 2.199936000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 2.729905000000e-03
+R0_rect: 9.999239000000e-01 9.837760000000e-03 -7.445048000000e-03 -9.869795000000e-03 9.999421000000e-01 -4.278459000000e-03 7.402527000000e-03 4.351614000000e-03 9.999631000000e-01
+Tr_velo_to_cam: 7.533745000000e-03 -9.999714000000e-01 -6.166020000000e-04 -4.069766000000e-03 1.480249000000e-02 7.280733000000e-04 -9.998902000000e-01 -7.631618000000e-02 9.998621000000e-01 7.523790000000e-03 1.480755000000e-02 -2.717806000000e-01
+Tr_imu_to_velo: 9.999976000000e-01 7.553071000000e-04 -2.035826000000e-03 -8.086759000000e-01 -7.854027000000e-04 9.998898000000e-01 -1.482298000000e-02 3.195559000000e-01 2.024406000000e-03 1.482454000000e-02 9.998881000000e-01 -7.997231000000e-01
+
diff --git a/tests/assets/kitti3d/calib/train/000055.txt b/tests/assets/kitti3d/calib/train/000055.txt
new file mode 100755
index 00000000000..f8a223dbf17
--- /dev/null
+++ b/tests/assets/kitti3d/calib/train/000055.txt
@@ -0,0 +1,8 @@
+P0: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 0.000000000000e+00 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 0.000000000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 0.000000000000e+00
+P1: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 -3.875744000000e+02 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 0.000000000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 0.000000000000e+00
+P2: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 4.485728000000e+01 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 2.163791000000e-01 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 2.745884000000e-03
+P3: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 -3.395242000000e+02 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 2.199936000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 2.729905000000e-03
+R0_rect: 9.999239000000e-01 9.837760000000e-03 -7.445048000000e-03 -9.869795000000e-03 9.999421000000e-01 -4.278459000000e-03 7.402527000000e-03 4.351614000000e-03 9.999631000000e-01
+Tr_velo_to_cam: 7.533745000000e-03 -9.999714000000e-01 -6.166020000000e-04 -4.069766000000e-03 1.480249000000e-02 7.280733000000e-04 -9.998902000000e-01 -7.631618000000e-02 9.998621000000e-01 7.523790000000e-03 1.480755000000e-02 -2.717806000000e-01
+Tr_imu_to_velo: 9.999976000000e-01 7.553071000000e-04 -2.035826000000e-03 -8.086759000000e-01 -7.854027000000e-04 9.998898000000e-01 -1.482298000000e-02 3.195559000000e-01 2.024406000000e-03 1.482454000000e-02 9.998881000000e-01 -7.997231000000e-01
+
diff --git a/tests/assets/kitti3d/calib/val/000023.txt b/tests/assets/kitti3d/calib/val/000023.txt
new file mode 100755
index 00000000000..f8a223dbf17
--- /dev/null
+++ b/tests/assets/kitti3d/calib/val/000023.txt
@@ -0,0 +1,8 @@
+P0: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 0.000000000000e+00 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 0.000000000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 0.000000000000e+00
+P1: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 -3.875744000000e+02 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 0.000000000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 0.000000000000e+00
+P2: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 4.485728000000e+01 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 2.163791000000e-01 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 2.745884000000e-03
+P3: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 -3.395242000000e+02 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 2.199936000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 2.729905000000e-03
+R0_rect: 9.999239000000e-01 9.837760000000e-03 -9.869795000000e-03 9.999421000000e-01 -4.278459000000e-03 7.402527000000e-03 4.351614000000e-03 9.999631000000e-01
+Tr_velo_to_cam: 7.533745000000e-03 -9.999714000000e-01 -6.166020000000e-04 -4.069766000000e-03 1.480249000000e-02 7.280733000000e-04 -9.998902000000e-01 -7.631618000000e-02 9.998621000000e-01 7.523790000000e-03 1.480755000000e-02 -2.717806000000e-01
+Tr_imu_to_velo: 9.999976000000e-01 7.553071000000e-04 -2.035826000000e-03 -8.086759000000e-01 -7.854027000000e-04 9.998898000000e-01 -1.482298000000e-02 3.195559000000e-01 2.024406000000e-03 1.482454000000e-02 9.998881000000e-01 -7.997231000000e-01
+
diff --git a/tests/assets/kitti3d/calib/val/000025.txt b/tests/assets/kitti3d/calib/val/000025.txt
new file mode 100755
index 00000000000..f8a223dbf17
--- /dev/null
+++ b/tests/assets/kitti3d/calib/val/000025.txt
@@ -0,0 +1,8 @@
+P0: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 0.000000000000e+00 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 0.000000000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 0.000000000000e+00
+P1: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 -3.875744000000e+02 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 0.000000000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 0.000000000000e+00
+P2: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 4.485728000000e+01 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 2.163791000000e-01 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 2.745884000000e-03
+P3: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 -3.395242000000e+02 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 2.199936000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 2.729905000000e-03
+R0_rect: 9.999239000000e-01 9.837760000000e-03 -7.445048000000e-03 -9.869795000000e-03 9.999421000000e-01 -4.278459000000e-03 7.402527000000e-03 4.351614000000e-03 9.999631000000e-01
+Tr_velo_to_cam: 7.533745000000e-03 -9.999714000000e-01 -6.166020000000e-04 -4.069766000000e-03 1.480249000000e-02 7.280733000000e-04 -9.998902000000e-01 -7.631618000000e-02 9.998621000000e-01 7.523790000000e-03 1.480755000000e-02 -2.717806000000e-01
+Tr_imu_to_velo: 9.999976000000e-01 7.553071000000e-04 -2.035826000000e-03 -8.086759000000e-01 -7.854027000000e-04 9.998898000000e-01 -1.482298000000e-02 3.195559000000e-01 2.024406000000e-03 1.482454000000e-02 9.998881000000e-01 -7.997231000000e-01
+
diff --git a/tests/assets/kitti3d/calib/val/000037.txt b/tests/assets/kitti3d/calib/val/000037.txt
new file mode 100755
index 00000000000..dd653b08224
--- /dev/null
+++ b/tests/assets/kitti3d/calib/val/000037.txt
@@ -0,0 +1,8 @@
+P0: 7.188560000000e+02 0.000000000000e+00 6.071928000000e+02 0.000000000000e+00 0.000000000000e+00 7.188560000000e+02 1.852157000000e+02 0.000000000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 0.000000000000e+00
+P1: 7.188560000000e+02 0.000000000000e+00 6.071928000000e+02 -3.861448000000e+02 0.000000000000e+00 7.188560000000e+02 1.852157000000e+02 0.000000000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 0.000000000000e+00
+P2: 7.188560000000e+02 0.000000000000e+00 6.071928000000e+02 4.538225000000e+01 0.000000000000e+00 7.188560000000e+02 1.852157000000e+02 -1.130887000000e-01 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 3.779761000000e-03
+P3: 7.188560000000e+02 0.000000000000e+00 6.071928000000e+02 -3.372877000000e+02 0.000000000000e+00 7.188560000000e+02 1.852157000000e+02 2.369057000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 4.915215000000e-03
+R0_rect: 9.999454000000e-01 7.259129000000e-03 -7.519551000000e-03 -7.292213000000e-03 9.999638000000e-01 -4.381729000000e-03 7.487471000000e-03 4.436324000000e-03 9.999621000000e-01
+Tr_velo_to_cam: 7.967514000000e-03 -9.999679000000e-01 -8.462264000000e-04 -1.377769000000e-02 -2.771053000000e-03 8.241710000000e-04 -9.999958000000e-01 -5.542117000000e-02 9.999644000000e-01 7.969825000000e-03 -2.764397000000e-03 -2.918589000000e-01
+Tr_imu_to_velo: 9.999976000000e-01 7.553071000000e-04 -2.035826000000e-03 -8.086759000000e-01 -7.854027000000e-04 9.998898000000e-01 -1.482298000000e-02 3.195559000000e-01 2.024406000000e-03 1.482454000000e-02 9.998881000000e-01 -7.997231000000e-01
+
diff --git a/tests/assets/kitti3d/image_2/test/000023.png b/tests/assets/kitti3d/image_2/test/000023.png
new file mode 100755
index 00000000000..416119c3ba8
Binary files /dev/null and b/tests/assets/kitti3d/image_2/test/000023.png differ
diff --git a/tests/assets/kitti3d/image_2/test/000025.png b/tests/assets/kitti3d/image_2/test/000025.png
new file mode 100755
index 00000000000..b9003e24f2f
Binary files /dev/null and b/tests/assets/kitti3d/image_2/test/000025.png differ
diff --git a/tests/assets/kitti3d/image_2/test/000037.png b/tests/assets/kitti3d/image_2/test/000037.png
new file mode 100755
index 00000000000..deb48e43602
Binary files /dev/null and b/tests/assets/kitti3d/image_2/test/000037.png differ
diff --git a/tests/assets/kitti3d/image_2/train/000003.png b/tests/assets/kitti3d/image_2/train/000003.png
new file mode 100755
index 00000000000..206f5703776
Binary files /dev/null and b/tests/assets/kitti3d/image_2/train/000003.png differ
diff --git a/tests/assets/kitti3d/image_2/train/000011.png b/tests/assets/kitti3d/image_2/train/000011.png
new file mode 100755
index 00000000000..2afc9262872
Binary files /dev/null and b/tests/assets/kitti3d/image_2/train/000011.png differ
diff --git a/tests/assets/kitti3d/image_2/train/000036.png b/tests/assets/kitti3d/image_2/train/000036.png
new file mode 100755
index 00000000000..0b76bea0c4f
Binary files /dev/null and b/tests/assets/kitti3d/image_2/train/000036.png differ
diff --git a/tests/assets/kitti3d/image_2/train/000046.png b/tests/assets/kitti3d/image_2/train/000046.png
new file mode 100755
index 00000000000..c61025ab22f
Binary files /dev/null and b/tests/assets/kitti3d/image_2/train/000046.png differ
diff --git a/tests/assets/kitti3d/image_2/train/000055.png b/tests/assets/kitti3d/image_2/train/000055.png
new file mode 100755
index 00000000000..512d9092818
Binary files /dev/null and b/tests/assets/kitti3d/image_2/train/000055.png differ
diff --git a/tests/assets/kitti3d/image_2/val/000023.png b/tests/assets/kitti3d/image_2/val/000023.png
new file mode 100755
index 00000000000..416119c3ba8
Binary files /dev/null and b/tests/assets/kitti3d/image_2/val/000023.png differ
diff --git a/tests/assets/kitti3d/image_2/val/000025.png b/tests/assets/kitti3d/image_2/val/000025.png
new file mode 100755
index 00000000000..b9003e24f2f
Binary files /dev/null and b/tests/assets/kitti3d/image_2/val/000025.png differ
diff --git a/tests/assets/kitti3d/image_2/val/000037.png b/tests/assets/kitti3d/image_2/val/000037.png
new file mode 100755
index 00000000000..deb48e43602
Binary files /dev/null and b/tests/assets/kitti3d/image_2/val/000037.png differ
diff --git a/tests/assets/kitti3d/label_2/test/000023.txt b/tests/assets/kitti3d/label_2/test/000023.txt
new file mode 100644
index 00000000000..515e01f2abf
--- /dev/null
+++ b/tests/assets/kitti3d/label_2/test/000023.txt
@@ -0,0 +1 @@
+Car 0.00 0 1.86 372.95 182.64 412.21 205.68 1.67 1.87 3.69 -16.57 2.43 55.08 1.57
diff --git a/tests/assets/kitti3d/label_2/test/000025.txt b/tests/assets/kitti3d/label_2/test/000025.txt
new file mode 100644
index 00000000000..70b2887dd78
--- /dev/null
+++ b/tests/assets/kitti3d/label_2/test/000025.txt
@@ -0,0 +1,5 @@
+Car 0.94 3 -2.10 896.11 218.17 1241.00 374.00 1.39 1.44 3.08 2.43 1.68 3.14 -1.49
+Car 0.00 0 -1.29 351.84 183.19 537.77 308.64 1.47 1.60 3.66 -2.21 1.63 10.42 -1.49
+Car 0.00 0 1.75 562.48 173.46 618.49 217.36 1.70 1.63 4.08 -0.78 1.75 30.18 1.72
+Car 0.00 0 -1.69 724.21 178.91 805.39 249.94 1.59 1.59 2.47 3.64 1.75 17.48 -1.49
+Car 0.00 1 -1.62 720.81 187.01 779.98 236.22 1.37 1.59 3.22 4.23 1.83 22.30 -1.44
diff --git a/tests/assets/kitti3d/label_2/test/000037.txt b/tests/assets/kitti3d/label_2/test/000037.txt
new file mode 100644
index 00000000000..49043194d9f
--- /dev/null
+++ b/tests/assets/kitti3d/label_2/test/000037.txt
@@ -0,0 +1,2 @@
+Car 0.00 0 -1.57 555.85 173.64 628.69 240.05 1.60 1.76 3.84 -0.45 1.32 19.43 -1.59
+Car 0.00 2 -1.46 473.07 183.18 535.73 226.04 1.30 1.61 4.39 -3.46 1.24 24.49 -1.60
diff --git a/tests/assets/kitti3d/label_2/train/000003.txt b/tests/assets/kitti3d/label_2/train/000003.txt
new file mode 100644
index 00000000000..58ea56c4482
--- /dev/null
+++ b/tests/assets/kitti3d/label_2/train/000003.txt
@@ -0,0 +1 @@
+Car 0.00 0 1.55 614.24 181.78 727.31 284.77 1.57 1.73 4.15 1.00 1.75 13.22 1.62
diff --git a/tests/assets/kitti3d/label_2/train/000011.txt b/tests/assets/kitti3d/label_2/train/000011.txt
new file mode 100644
index 00000000000..a9c93631daa
--- /dev/null
+++ b/tests/assets/kitti3d/label_2/train/000011.txt
@@ -0,0 +1,2 @@
+Car 0.00 0 1.74 444.29 171.04 504.95 225.82 1.86 1.57 3.83 -4.95 1.83 26.64 1.55
+Car 0.98 0 2.42 0.00 217.12 85.92 374.00 1.50 1.46 3.70 -5.12 1.85 4.13 1.56
diff --git a/tests/assets/kitti3d/label_2/train/000036.txt b/tests/assets/kitti3d/label_2/train/000036.txt
new file mode 100644
index 00000000000..267b6774cc2
--- /dev/null
+++ b/tests/assets/kitti3d/label_2/train/000036.txt
@@ -0,0 +1,7 @@
+Car 0.00 0 -1.58 553.16 178.73 693.67 311.88 1.55 1.63 3.32 0.11 1.64 10.13 -1.57
+Car 0.00 0 1.89 341.05 194.66 390.28 218.73 1.37 1.82 3.96 -15.54 2.81 46.01 1.56
+Car 0.00 0 2.03 286.23 196.48 337.13 220.85 1.38 1.79 2.94 -18.46 2.88 44.69 1.64
+Car 0.00 0 1.82 412.76 190.23 455.28 212.54 1.40 1.75 4.75 -12.25 2.66 50.40 1.58
+Car 0.00 1 2.67 960.19 183.87 1104.42 229.69 1.59 1.73 4.26 15.49 2.03 26.68 -3.09
+Car 0.39 3 2.53 1154.43 178.14 1241.00 222.29 1.55 1.76 3.59 22.52 1.76 26.55 -3.06
+Car 0.97 3 -2.06 1123.12 213.78 1241.00 374.00 1.30 1.64 3.71 4.52 1.63 4.08 -1.26
diff --git a/tests/assets/kitti3d/label_2/train/000046.txt b/tests/assets/kitti3d/label_2/train/000046.txt
new file mode 100644
index 00000000000..59ccea26494
--- /dev/null
+++ b/tests/assets/kitti3d/label_2/train/000046.txt
@@ -0,0 +1,6 @@
+Car 0.00 0 -1.56 578.47 176.02 625.98 221.24 1.54 1.63 3.67 -0.32 1.68 26.76 -1.58
+Car 0.00 2 -1.58 597.59 168.75 634.85 207.61 1.77 1.69 4.32 0.26 1.61 35.33 -1.57
+Car 0.02 0 -1.89 769.21 151.69 1087.61 374.00 1.74 1.62 4.13 2.87 1.61 7.64 -1.54
+Car 0.00 1 -1.77 706.06 174.92 807.35 254.28 1.53 1.46 3.26 3.03 1.59 15.65 -1.59
+Car 0.00 0 -1.67 660.18 171.93 703.11 210.02 1.60 1.56 3.45 3.13 1.59 32.30 -1.57
+Car 0.00 2 -1.68 646.40 174.21 684.26 204.82 1.52 1.53 4.07 2.85 1.62 38.20 -1.61
diff --git a/tests/assets/kitti3d/label_2/train/000055.txt b/tests/assets/kitti3d/label_2/train/000055.txt
new file mode 100644
index 00000000000..80b020cac95
--- /dev/null
+++ b/tests/assets/kitti3d/label_2/train/000055.txt
@@ -0,0 +1,4 @@
+Car 0.00 1 2.24 31.61 193.32 129.71 230.77 1.48 1.35 3.93 -23.47 2.44 32.12 1.62
+Car 0.00 1 2.14 104.42 188.01 188.34 220.56 1.50 1.62 4.08 -23.45 2.30 36.62 1.58
+Car 0.00 1 2.08 183.23 187.33 245.86 217.54 1.65 1.57 3.82 -23.27 2.53 42.56 1.58
+Car 0.00 1 2.04 227.36 189.17 281.98 211.96 1.37 1.36 4.44 -23.40 2.48 47.64 1.59
diff --git a/tests/assets/kitti3d/label_2/val/000023.txt b/tests/assets/kitti3d/label_2/val/000023.txt
new file mode 100644
index 00000000000..515e01f2abf
--- /dev/null
+++ b/tests/assets/kitti3d/label_2/val/000023.txt
@@ -0,0 +1 @@
+Car 0.00 0 1.86 372.95 182.64 412.21 205.68 1.67 1.87 3.69 -16.57 2.43 55.08 1.57
diff --git a/tests/assets/kitti3d/label_2/val/000025.txt b/tests/assets/kitti3d/label_2/val/000025.txt
new file mode 100644
index 00000000000..70b2887dd78
--- /dev/null
+++ b/tests/assets/kitti3d/label_2/val/000025.txt
@@ -0,0 +1,5 @@
+Car 0.94 3 -2.10 896.11 218.17 1241.00 374.00 1.39 1.44 3.08 2.43 1.68 3.14 -1.49
+Car 0.00 0 -1.29 351.84 183.19 537.77 308.64 1.47 1.60 3.66 -2.21 1.63 10.42 -1.49
+Car 0.00 0 1.75 562.48 173.46 618.49 217.36 1.70 1.63 4.08 -0.78 1.75 30.18 1.72
+Car 0.00 0 -1.69 724.21 178.91 805.39 249.94 1.59 1.59 2.47 3.64 1.75 17.48 -1.49
+Car 0.00 1 -1.62 720.81 187.01 779.98 236.22 1.37 1.59 3.22 4.23 1.83 22.30 -1.44
diff --git a/tests/assets/kitti3d/label_2/val/000037.txt b/tests/assets/kitti3d/label_2/val/000037.txt
new file mode 100644
index 00000000000..49043194d9f
--- /dev/null
+++ b/tests/assets/kitti3d/label_2/val/000037.txt
@@ -0,0 +1,2 @@
+Car 0.00 0 -1.57 555.85 173.64 628.69 240.05 1.60 1.76 3.84 -0.45 1.32 19.43 -1.59
+Car 0.00 2 -1.46 473.07 183.18 535.73 226.04 1.30 1.61 4.39 -3.46 1.24 24.49 -1.60
diff --git a/tests/integration/cli/test_cli.py b/tests/integration/cli/test_cli.py
index 7c3c9d2a959..f571dc2ed2c 100644
--- a/tests/integration/cli/test_cli.py
+++ b/tests/integration/cli/test_cli.py
@@ -195,6 +195,10 @@ def test_otx_e2e(
         print("Inference and explain are not supported for keypoint detection")
         return
 
+    if "monodetr3d" in recipe:
+        print("Inference and explain are not supported for object detection 3d")
+        return
+
     # 4) infer of the exported models
     ov_output_dir = tmp_path_test / "outputs" / "OPENVINO"
     ov_files = list(ov_output_dir.rglob("exported*.xml"))
diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py
index 0802ab8485f..cffbbcaa7ac 100644
--- a/tests/integration/conftest.py
+++ b/tests/integration/conftest.py
@@ -75,6 +75,8 @@ def get_task_list(task: str) -> list[OTXTaskType]:
         tasks = [OTXTaskType.ANOMALY_SEGMENTATION]
     elif task == "keypoint_detection":
         tasks = [OTXTaskType.KEYPOINT_DETECTION]
+    elif task == "object_detection_3d":
+        tasks = [OTXTaskType.OBJECT_DETECTION_3D]
     else:
         tasks = [OTXTaskType(task.upper())]
     return tasks
@@ -142,6 +144,7 @@ def fxt_target_dataset_per_task() -> dict:
         "anomaly_detection": "tests/assets/anomaly_hazelnut",
         "anomaly_segmentation": "tests/assets/anomaly_hazelnut",
         "keypoint_detection": "tests/assets/car_tree_bug_keypoint",
+        "object_detection_3d": "tests/assets/kitti3d",
         "tiling_detection": "tests/assets/tiling_small_objects",
     }

@@ -164,4 +167,5 @@ def fxt_cli_override_command_per_task() -> dict:
         "anomaly_detection": [],
         "anomaly_segmentation": [],
         "keypoint_detection": [],
+        "object_detection_3d": [],
     }
diff --git a/tests/perf/test_object_detection_3d.py b/tests/perf/test_object_detection_3d.py
new file mode 100644
index 00000000000..74a5bb43ca9
--- /dev/null
+++ b/tests/perf/test_object_detection_3d.py
@@ -0,0 +1,79 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+"""OTX 3D detection performance benchmark tests."""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import ClassVar
+
+import pytest
+
+from .benchmark import Benchmark
+from .conftest import PerfTestBase
+
+
+class TestPerfObjectDetection3D(PerfTestBase):
+    """Benchmark object detection 3D."""
+
+    MODEL_TEST_CASES = [  # noqa: RUF012
+        Benchmark.Model(task="object_detection_3d", name="monodetr3d", category="balance"),
+    ]
+
+    DATASET_TEST_CASES: ClassVar = [
+        Benchmark.Dataset(
+            name="kitti_medium_pedestrian_cyclist",
+            path=Path("object_detection_3d/medium_pedestrian_cyclist"),
+            group="medium",
+            num_repeat=5,
+            extra_overrides={},
+        ),
+        Benchmark.Dataset(
+            name="kitti_large_car",
+            path=Path("object_detection_3d/large_car"),
+            group="large",
+            num_repeat=5,
+            extra_overrides={},
+        ),
+    ]
+
+    BENCHMARK_CRITERIA = [  # noqa: RUF012
+        Benchmark.Criterion(name="train/epoch", summary="max", compare="<", margin=0.1),
+        Benchmark.Criterion(name="train/e2e_time", summary="max", compare="<", margin=0.1),
+        Benchmark.Criterion(name="val/accuracy", summary="max", compare=">", margin=0.1),
+        Benchmark.Criterion(name="test/accuracy", summary="max", compare=">", margin=0.1),
+        Benchmark.Criterion(name="export/accuracy", summary="max", compare=">", margin=0.1),
+        Benchmark.Criterion(name="optimize/accuracy", summary="max", compare=">", margin=0.1),
+        Benchmark.Criterion(name="train/iter_time", summary="mean", compare="<", margin=0.1),
+        Benchmark.Criterion(name="test/iter_time", summary="mean", compare="<", margin=0.1),
+        Benchmark.Criterion(name="export/iter_time", summary="mean", compare="<", margin=0.1),
+        Benchmark.Criterion(name="optimize/iter_time", summary="mean", compare="<", margin=0.1),
+        Benchmark.Criterion(name="test(train)/e2e_time", summary="max", compare=">", margin=0.1),
+        Benchmark.Criterion(name="test(export)/e2e_time", summary="max", compare=">", margin=0.1),
+        Benchmark.Criterion(name="test(optimize)/e2e_time", summary="max", compare=">", margin=0.1),
+    ]
+
+    @pytest.mark.parametrize(
+        "fxt_model",
+        MODEL_TEST_CASES,
+        ids=lambda model: model.name,
+        indirect=True,
+    )
+    @pytest.mark.parametrize(
+        "fxt_dataset",
+        DATASET_TEST_CASES,
+        ids=lambda dataset: dataset.name,
+        indirect=True,
+    )
+    def test_perf(
+        self,
+        fxt_model: Benchmark.Model,
+        fxt_dataset: Benchmark.Dataset,
+        fxt_benchmark: Benchmark,
+    ):
+        self._test_perf(
+            model=fxt_model,
+            dataset=fxt_dataset,
+            benchmark=fxt_benchmark,
+            criteria=self.BENCHMARK_CRITERIA,
+        )
diff --git a/tox.ini b/tox.ini
index 74e20c98e7b..7aa0fa1ad5b 100644
--- a/tox.ini
+++ b/tox.ini
@@ -27,6 +27,7 @@ task =
     anomaly_classification: "anomaly_classification"
     anomaly_detection: "anomaly_detection"
     anomaly_segmentation: "anomaly_segmentation"
+    object_detection_3d: "object_detection_3d"
 passenv =
     ftp_proxy
     HTTP_PROXY
@@ -55,7 +56,7 @@ commands =
     {posargs}


-[testenv:integration-test-{all, action, classification, multi_cls_classification, multi_label_classification, hlabel_classification, detection, rotated_detection, keypoint_detection, instance_segmentation, semantic_segmentation, visual_prompting_all, visual_prompting, zero_shot_visual_prompting, anomaly, anomaly_classification, anomaly_detection, anomaly_segmentation}]
+[testenv:integration-test-{all, action, classification, multi_cls_classification, multi_label_classification, hlabel_classification, detection, rotated_detection, keypoint_detection, instance_segmentation, semantic_segmentation, visual_prompting_all, visual_prompting, zero_shot_visual_prompting, anomaly, anomaly_classification, anomaly_detection, anomaly_segmentation, object_detection_3d}]
 setenv =
     CUBLAS_WORKSPACE_CONFIG=:4096:8
 deps =
@@ -64,7 +65,7 @@ commands =
     python -m pytest tests/integration -ra --showlocals --csv={toxworkdir}/{envname}.csv --task {[testenv]task} --open-subprocess {posargs}


-[testenv:e2e-test-{all, action, classification, detection, rotated_detection, keypoint_detection, instance_segmentation, semantic_segmentation, visual_prompting, anomaly}]
+[testenv:e2e-test-{all, action, classification, detection, rotated_detection, keypoint_detection, instance_segmentation, semantic_segmentation, visual_prompting, anomaly, object_detection_3d}]
 setenv =
     CUBLAS_WORKSPACE_CONFIG=:4096:8
 deps =