diff --git a/.github/workflows/pre_merge.yaml b/.github/workflows/pre_merge.yaml index 1039ad01669..75d9095e31b 100644 --- a/.github/workflows/pre_merge.yaml +++ b/.github/workflows/pre_merge.yaml @@ -48,9 +48,7 @@ jobs: include: - python-version: "3.10" tox-env: "py310" - # TODO(vinnamki): Revisit after fixing in the upstream: https://github.com/omni-us/jsonargparse/issues/484 - # Ticket no. 138075 - - python-version: "3.11.8" + - python-version: "3.11" tox-env: "py311" name: Unit-Test-with-Python${{ matrix.python-version }} steps: @@ -112,6 +110,7 @@ jobs: - task: "anomaly_detection" - task: "anomaly_segmentation" - task: "keypoint_detection" + - task: "object_detection_3d" name: Integration-Test-${{ matrix.task }}-py310 steps: - name: Checkout repository diff --git a/pyproject.toml b/pyproject.toml index 61f8c6f783e..b1ab8f0e6ef 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,7 @@ classifiers = [ "Programming Language :: Python :: 3.11", ] dependencies = [ - "datumaro==1.7.0", + "datumaro==1.10.0rc0", "omegaconf==2.3.0", "rich==13.8.0", "jsonargparse==4.30.0", @@ -39,6 +39,7 @@ dependencies = [ "einops==0.8.0", "decord==0.6.0", "typeguard==4.3.*", + "numba==0.60.0", # TODO(ashwinvaidya17): https://github.com/openvinotoolkit/anomalib/issues/2126 "setuptools<70", ] diff --git a/src/otx/algo/common/layers/transformer_layers.py b/src/otx/algo/common/layers/transformer_layers.py new file mode 100644 index 00000000000..0c3ede9116a --- /dev/null +++ b/src/otx/algo/common/layers/transformer_layers.py @@ -0,0 +1,122 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""Implementation of common transformer layers.""" + +from __future__ import annotations + +import copy +from typing import Callable + +import torch +from torch import nn + + +class TransformerEncoderLayer(nn.Module): + """TransformerEncoderLayer.""" + + def __init__( + self, + d_model: int, + nhead: int, + dim_feedforward: int = 2048, + dropout: float = 0.1, + activation: Callable[..., nn.Module] = nn.GELU, + normalize_before: bool = False, + batch_first: bool = True, + key_mask: bool = False, + ) -> None: + super().__init__() + self.normalize_before = normalize_before + self.key_mask = key_mask + + self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout, batch_first=batch_first) + + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.dropout = nn.Dropout(dropout) + self.linear2 = nn.Linear(dim_feedforward, d_model) + + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + self.activation = activation() + + @staticmethod + def with_pos_embed(tensor: torch.Tensor, pos_embed: torch.Tensor | None) -> torch.Tensor: + """Attach position embeddings to the tensor.""" + return tensor if pos_embed is None else tensor + pos_embed + + def forward( + self, + src: torch.Tensor, + src_mask: torch.Tensor | None = None, + pos_embed: torch.Tensor | None = None, + ) -> torch.Tensor: + """Forward the transformer encoder layer. + + Args: + src (torch.Tensor): The input tensor. + src_mask (torch.Tensor | None, optional): The mask tensor. Defaults to None. + pos_embed (torch.Tensor | None, optional): The position embedding tensor. Defaults to None. 
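+
+        Example (illustrative sketch; the concrete sizes below are assumptions, not requirements):
+            >>> layer = TransformerEncoderLayer(d_model=256, nhead=8)
+            >>> layer(torch.rand(2, 100, 256)).shape  # (batch, seq_len, d_model) since batch_first=True
+            torch.Size([2, 100, 256])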
+ """ + residual = src + if self.normalize_before: + src = self.norm1(src) + q = k = self.with_pos_embed(src, pos_embed) + if self.key_mask: + src = self.self_attn(q, k, value=src, key_padding_mask=src_mask)[0] + else: + src, _ = self.self_attn(q, k, value=src, attn_mask=src_mask) + + src = residual + self.dropout1(src) + if not self.normalize_before: + src = self.norm1(src) + + residual = src + if self.normalize_before: + src = self.norm2(src) + src = self.linear2(self.dropout(self.activation(self.linear1(src)))) + src = residual + self.dropout2(src) + if not self.normalize_before: + src = self.norm2(src) + return src + + +class TransformerEncoder(nn.Module): + """TransformerEncoder.""" + + def __init__(self, encoder_layer: nn.Module, num_layers: int, norm: nn.Module | None = None) -> None: + """Initialize the TransformerEncoder. + + Args: + encoder_layer (nn.Module): The encoder layer module. + num_layers (int): The number of layers. + norm (nn.Module | None, optional): The normalization module. Defaults to None. + """ + super().__init__() + self.layers = nn.ModuleList([copy.deepcopy(encoder_layer) for _ in range(num_layers)]) + self.num_layers = num_layers + self.norm = norm + + def forward( + self, + src: torch.Tensor, + src_mask: torch.Tensor | None = None, + pos_embed: torch.Tensor | None = None, + ) -> torch.Tensor: + """Forward the transformer encoder. + + Args: + src (torch.Tensor): The input tensor. + src_mask (torch.Tensor | None, optional): The mask tensor. Defaults to None. + pos_embed (torch.Tensor | None, optional): The position embedding tensor. Defaults to None. + """ + output = src + for layer in self.layers: + output = layer(output, src_mask=src_mask, pos_embed=pos_embed) + + if self.norm is not None: + output = self.norm(output) + + return output diff --git a/src/otx/algo/common/losses/focal_loss.py b/src/otx/algo/common/losses/focal_loss.py index 9ad2f1323b2..4eb3914957f 100644 --- a/src/otx/algo/common/losses/focal_loss.py +++ b/src/otx/algo/common/losses/focal_loss.py @@ -8,11 +8,12 @@ from __future__ import annotations +import warnings from typing import TYPE_CHECKING import torch -import torch.nn.functional from otx.algo.common.losses.utils import weight_reduce_loss +from torch import nn if TYPE_CHECKING: from torch import Tensor @@ -50,7 +51,7 @@ def py_sigmoid_focal_loss( pt = (1 - pred_sigmoid) * target + pred_sigmoid * (1 - target) # Thus it's pt.pow(gamma) rather than (1 - pt).pow(gamma) focal_weight = (alpha * target + (1 - alpha) * (1 - target)) * pt.pow(gamma) - loss = torch.nn.functional.binary_cross_entropy_with_logits(pred, target, reduction="none") * focal_weight + loss = nn.functional.binary_cross_entropy_with_logits(pred, target, reduction="none") * focal_weight if weight is not None: if weight.shape != loss.shape: if weight.size(0) == loss.size(0): @@ -70,3 +71,180 @@ def py_sigmoid_focal_loss( msg = "The number of dimensions in weight should be equal to the number of dimensions in loss." raise ValueError(msg) return weight_reduce_loss(loss, weight, reduction, avg_factor) + + +def one_hot( + labels: torch.Tensor, + num_classes: int, + device: torch.device | None = None, + dtype: torch.dtype | None = None, + eps: float = 1e-6, +) -> torch.Tensor: + r"""Convert an integer label x-D tensor to a one-hot (x+1)-D tensor. + + Args: + labels: tensor with labels of shape :math:`(N, *)`, where N is batch size. + Each value is an integer representing correct classification. + num_classes: number of classes in labels. 
+ device: the desired device of returned tensor. + dtype: the desired data type of returned tensor. + + Returns: + the labels in one hot tensor of shape :math:`(N, C, *)`, + + Examples: + >>> labels = torch.LongTensor([[[0, 1], [2, 0]]]) + >>> one_hot(labels, num_classes=3) + tensor([[[[1.0000e+00, 1.0000e-06], + [1.0000e-06, 1.0000e+00]], + + [[1.0000e-06, 1.0000e+00], + [1.0000e-06, 1.0000e-06]], + + [[1.0000e-06, 1.0000e-06], + [1.0000e+00, 1.0000e-06]]]]) + """ + if not isinstance(labels, torch.Tensor): + msg = f"Input labels type is not a torch.Tensor. Got {type(labels)}" + raise TypeError(msg) + + if labels.dtype != torch.int64: + msg = f"labels must be of the same dtype torch.int64. Got: {labels.dtype}" + raise ValueError(msg) + + if num_classes < 1: + msg = f"The number of classes must be bigger than one. Got: {num_classes}" + raise ValueError(msg) + shape = labels.shape + one_hot = torch.zeros((shape[0], num_classes) + shape[1:], device=device, dtype=dtype) + return one_hot.scatter_(1, labels.unsqueeze(1), 1.0) + eps + + +def focal_loss( + inputs: torch.Tensor, + target: torch.Tensor, + alpha: float, + gamma: float = 2.0, + reduction: str = "none", + eps: float | None = None, +) -> torch.Tensor: + r"""Criterion that computes Focal loss. + + According to :cite:`lin2018focal`, the Focal loss is computed as follows: + .. math:: + \text{FL}(p_t) = -\alpha_t (1 - p_t)^{\gamma} \, \text{log}(p_t) + Where: + - :math:`p_t` is the model's estimated probability for each class. + + Args: + inputs: logits tensor with shape :math:`(N, C, *)` where C = number of classes. + target: labels tensor with shape :math:`(N, *)` where each value is :math:`0 ≤ targets[i] ≤ C-1`. + alpha: Weighting factor :math:`\alpha \in [0, 1]`. + gamma: Focusing parameter :math:`\gamma >= 0`. + reduction: Specifies the reduction to apply to the + output: ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction + will be applied, ``'mean'``: the sum of the output will be divided by + the number of elements in the output, ``'sum'``: the output will be + summed. + eps: Deprecated: scalar to enforce numerical stabiliy. This is no longer used. + + Return: + the computed loss. + + Example: + >>> N = 5 # num_classes + >>> inputs = torch.randn(1, N, 3, 5, requires_grad=True) + >>> target = torch.empty(1, 3, 5, dtype=torch.long).random_(N) + >>> output = focal_loss(inputs, target, alpha=0.5, gamma=2.0, reduction='mean') + >>> output.backward() + """ + if eps is not None and not torch.jit.is_scripting(): + warnings.warn( + "`focal_loss` has been reworked for improved numerical stability " + "and the `eps` argument is no longer necessary", + DeprecationWarning, + stacklevel=2, + ) + + if not isinstance(inputs, torch.Tensor): + msg = f"inputs type is not a torch.Tensor. Got {type(inputs)}" + raise TypeError(msg) + + if not len(inputs.shape) >= 2: + msg = f"Invalid inputs shape, we expect BxCx*. Got: {inputs.shape}" + raise ValueError(msg) + + if inputs.size(0) != target.size(0): + msg = f"Expected inputs batch_size ({inputs.size(0)}) to match target batch_size ({target.size(0)})." + raise ValueError(msg) + + n = inputs.size(0) + out_size = (n,) + inputs.size()[2:] + if target.size()[1:] != inputs.size()[2:]: + msg = f"Expected target size {out_size}, got {target.size()}" + raise ValueError(msg) + + if inputs.device != target.device: + msg = f"inputs and target must be in the same device. 
Got: {inputs.device} and {target.device}" + raise ValueError(msg) + + # compute softmax over the classes axis + input_soft: torch.Tensor = nn.functional.softmax(inputs, dim=1) + log_input_soft: torch.Tensor = nn.functional.log_softmax(inputs, dim=1) + # create the labels one hot tensor + target_one_hot: torch.Tensor = one_hot( + target, + num_classes=inputs.shape[1], + device=inputs.device, + dtype=inputs.dtype, + ) + + # compute the actual focal loss + weight = torch.pow(-input_soft + 1.0, gamma) + + focal = -alpha * weight * log_input_soft + loss_tmp = torch.einsum("bc...,bc...->b...", (target_one_hot, focal)) + return weight_reduce_loss(loss_tmp, reduction=reduction, avg_factor=None) + + +class FocalLoss(nn.Module): + """Criterion that computes Focal loss.""" + + def __init__(self, alpha: float, gamma: float = 2.0, reduction: str = "none", eps: float | None = None) -> None: + r"""Criterion that computes Focal loss. + + According to :cite:`lin2018focal`, the Focal loss is computed as follows: + .. math:: + \text{FL}(p_t) = -\alpha_t (1 - p_t)^{\\gamma} \\, \text{log}(p_t) + Where: + - :math:`p_t` is the model's estimated probability for each class. + + Args: + alpha: Weighting factor :math:`\alpha \\in [0, 1]`. + gamma: Focusing parameter :math:`\\gamma >= 0`. + reduction: Specifies the reduction to apply to the + output: ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction + will be applied, ``'mean'``: the sum of the output will be divided by + the number of elements in the output, ``'sum'``: the output will be + summed. + eps: Deprecated: scalar to enforce numerical stability. This is no longer + used. + + Example: + >>> N = 5 # num_classes + >>> kwargs = {"alpha": 0.5, "gamma": 2.0, "reduction": 'mean'} + >>> criterion = FocalLoss(**kwargs) + >>> input = torch.randn(1, N, 3, 5, requires_grad=True) + >>> target = torch.empty(1, 3, 5, dtype=torch.long).random_(N) + >>> output = criterion(input, target) + >>> output.backward() + """ + super().__init__() + self.alpha: float = alpha + self.gamma: float = gamma + self.reduction: str = reduction + self.eps: float | None = eps + + def forward(self, inputs: torch.Tensor, target: torch.Tensor) -> torch.Tensor: + """Forward.""" + return focal_loss(inputs, target, self.alpha, self.gamma, self.reduction, self.eps) diff --git a/src/otx/algo/detection/heads/rtdetr_decoder.py b/src/otx/algo/detection/heads/rtdetr_decoder.py index 2d190dcaf32..d60be84f12d 100644 --- a/src/otx/algo/detection/heads/rtdetr_decoder.py +++ b/src/otx/algo/detection/heads/rtdetr_decoder.py @@ -213,7 +213,7 @@ def forward( query: torch.Tensor, reference_points: torch.Tensor, value: torch.Tensor, - value_spatial_shapes: list[tuple[int, int]], + value_spatial_shapes: torch.Tensor, value_mask: torch.Tensor | None = None, ) -> torch.Tensor: """Forward function of MSDeformableAttention. 
@@ -235,8 +235,9 @@ def forward( value = self.value_proj(value) if value_mask is not None: - value_mask = value_mask.astype(value.dtype).unsqueeze(-1) - value *= value_mask + value = value.masked_fill(value_mask[..., None], float(0)) + # value_mask = value_mask.astype(value.dtype).unsqueeze(-1) + # value3 = value * value_mask.unsqueeze(-1) value = value.reshape(bs, len_v, self.num_heads, self.head_dim) sampling_offsets = self.sampling_offsets(query).reshape( @@ -262,7 +263,7 @@ def forward( ) if reference_points.shape[-1] == 2: - offset_normalizer = torch.tensor(value_spatial_shapes) + offset_normalizer = value_spatial_shapes.clone() offset_normalizer = offset_normalizer.flip([1]).reshape(1, 1, 1, self.num_levels, 1, 2) sampling_locations = ( reference_points.reshape( @@ -280,6 +281,14 @@ def forward( reference_points[:, :, None, :, None, :2] + sampling_offsets / self.num_points * reference_points[:, :, None, :, None, 2:] * 0.5 ) + elif reference_points.shape[-1] == 6: + sampling_locations = ( + reference_points[:, :, None, :, None, :2] + + sampling_offsets + / self.num_points + * (reference_points[:, :, None, :, None, 2::2] + reference_points[:, :, None, :, None, 3::2]) + * 0.5 + ) else: msg = f"Last dim of reference_points must be 2 or 4, but get {reference_points.shape[-1]} instead." raise ValueError( diff --git a/src/otx/algo/detection/necks/hybrid_encoder.py b/src/otx/algo/detection/necks/hybrid_encoder.py index cf79424636f..548a14548ec 100644 --- a/src/otx/algo/detection/necks/hybrid_encoder.py +++ b/src/otx/algo/detection/necks/hybrid_encoder.py @@ -12,6 +12,7 @@ import torch from torch import nn +from otx.algo.common.layers.transformer_layers import TransformerEncoder, TransformerEncoderLayer from otx.algo.detection.layers import CSPRepLayer from otx.algo.modules import Conv2dModule, build_activation_layer from otx.algo.modules.base_module import BaseModule @@ -20,85 +21,6 @@ __all__ = ["HybridEncoder"] -# transformer -class TransformerEncoderLayer(nn.Module): - def __init__( - self, - d_model: int, - nhead: int, - dim_feedforward: int = 2048, - dropout: float = 0.1, - activation: Callable[..., nn.Module] = nn.GELU, - normalize_before: bool = False, - ) -> None: - super().__init__() - self.normalize_before = normalize_before - - self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout, batch_first=True) - - self.linear1 = nn.Linear(d_model, dim_feedforward) - self.dropout = nn.Dropout(dropout) - self.linear2 = nn.Linear(dim_feedforward, d_model) - - self.norm1 = nn.LayerNorm(d_model) - self.norm2 = nn.LayerNorm(d_model) - self.dropout1 = nn.Dropout(dropout) - self.dropout2 = nn.Dropout(dropout) - self.activation = activation() - - @staticmethod - def with_pos_embed(tensor: torch.Tensor, pos_embed: torch.Tensor | None) -> torch.Tensor: - return tensor if pos_embed is None else tensor + pos_embed - - def forward( - self, - src: torch.Tensor, - src_mask: torch.Tensor | None = None, - pos_embed: torch.Tensor | None = None, - ) -> torch.Tensor: - residual = src - if self.normalize_before: - src = self.norm1(src) - q = k = self.with_pos_embed(src, pos_embed) - src, _ = self.self_attn(q, k, value=src, attn_mask=src_mask) - - src = residual + self.dropout1(src) - if not self.normalize_before: - src = self.norm1(src) - - residual = src - if self.normalize_before: - src = self.norm2(src) - src = self.linear2(self.dropout(self.activation(self.linear1(src)))) - src = residual + self.dropout2(src) - if not self.normalize_before: - src = self.norm2(src) - return src - - -class 
TransformerEncoder(nn.Module): - def __init__(self, encoder_layer: nn.Module, num_layers: int, norm: nn.Module | None = None) -> None: - super().__init__() - self.layers = nn.ModuleList([copy.deepcopy(encoder_layer) for _ in range(num_layers)]) - self.num_layers = num_layers - self.norm = norm - - def forward( - self, - src: torch.Tensor, - src_mask: torch.Tensor | None = None, - pos_embed: torch.Tensor | None = None, - ) -> torch.Tensor: - output = src - for layer in self.layers: - output = layer(output, src_mask=src_mask, pos_embed=pos_embed) - - if self.norm is not None: - output = self.norm(output) - - return output - - class HybridEncoderModule(BaseModule): """HybridEncoder for RTDetr. diff --git a/src/otx/algo/object_detection_3d/__init__.py b/src/otx/algo/object_detection_3d/__init__.py new file mode 100644 index 00000000000..c9797fe2795 --- /dev/null +++ b/src/otx/algo/object_detection_3d/__init__.py @@ -0,0 +1,8 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""Custom model implementations for object detection 3D task.""" + +from . import backbones, detectors, heads, losses, matchers, utils + +__all__ = ["backbones", "heads", "losses", "detectors", "matchers", "utils"] diff --git a/src/otx/algo/object_detection_3d/backbones/__init__.py b/src/otx/algo/object_detection_3d/backbones/__init__.py new file mode 100644 index 00000000000..a7d354222db --- /dev/null +++ b/src/otx/algo/object_detection_3d/backbones/__init__.py @@ -0,0 +1,4 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""Backbones modules for 3d object detection.""" diff --git a/src/otx/algo/object_detection_3d/backbones/monodetr_resnet.py b/src/otx/algo/object_detection_3d/backbones/monodetr_resnet.py new file mode 100644 index 00000000000..0d345aa11a5 --- /dev/null +++ b/src/otx/algo/object_detection_3d/backbones/monodetr_resnet.py @@ -0,0 +1,253 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""MonoDetr backbone implementations.""" +from __future__ import annotations + +import math +from typing import Any, ClassVar + +import torch +import torchvision +from torch import nn +from torchvision.models._utils import IntermediateLayerGetter + +from otx.algo.modules.norm import FrozenBatchNorm2d +from otx.algo.object_detection_3d.utils.utils import NestedTensor + + +class PositionEmbeddingSine(nn.Module): + """This is a more standard version of the position embedding.""" + + def __init__( + self, + num_pos_feats: int = 64, + temperature: int = 10000, + normalize: bool = False, + scale: float | None = None, + ): + """Initialize the PositionEmbeddingSine module. + + Args: + num_pos_feats (int): Number of positional features. + temperature (int): Temperature scaling factor. + normalize (bool): Flag indicating whether to normalize the position embeddings. + scale (Optional[float]): Scaling factor for the position embeddings. If None, default value is used. 
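+
+        Example (illustrative; assumes a NestedTensor of features with a boolean padding mask):
+            >>> pe = PositionEmbeddingSine(num_pos_feats=128, normalize=True)
+            >>> feat = NestedTensor(torch.rand(1, 256, 24, 80), torch.zeros(1, 24, 80, dtype=torch.bool))
+            >>> pe(feat).shape  # channels = 2 * num_pos_feats
+            torch.Size([1, 256, 24, 80])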
+ """ + super().__init__() + self.num_pos_feats = num_pos_feats + self.temperature = temperature + self.normalize = normalize + if scale is not None and normalize is False: + msg = "normalize should be True if scale is passed" + raise ValueError(msg) + if scale is None: + scale = 2 * math.pi + self.scale = scale + + def forward(self, tensor_list: NestedTensor) -> torch.Tensor: + """Forward function for PositionEmbeddingSine module.""" + x = tensor_list.tensors + mask = tensor_list.mask + not_mask = ~mask + y_embed = not_mask.cumsum(1, dtype=torch.float32) + x_embed = not_mask.cumsum(2, dtype=torch.float32) + if self.normalize: + eps = 1e-6 + y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale + x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale + + dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) + dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) + + pos_x = x_embed[:, :, :, None] / dim_t + pos_y = y_embed[:, :, :, None] / dim_t + pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) + pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) + return torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) + + +class PositionEmbeddingLearned(nn.Module): + """Absolute pos embedding, learned.""" + + def __init__(self, num_pos_feats: int = 256): + """Positional embedding.""" + super().__init__() + self.row_embed = nn.Embedding(50, num_pos_feats) + self.col_embed = nn.Embedding(50, num_pos_feats) + + def forward(self, tensor_list: NestedTensor) -> torch.Tensor: + """Forward pass of the PositionEmbeddingLearned module. + + Args: + tensor_list (NestedTensor): Input tensor. + + Returns: + torch.Tensor: Position embeddings. + """ + x = tensor_list.tensors + h, w = x.shape[-2:] + i = torch.arange(w, device=x.device) / w * 49 + j = torch.arange(h, device=x.device) / h * 49 + x_emb = self.get_embed(i, self.col_embed) + y_emb = self.get_embed(j, self.row_embed) + return ( + torch.cat( + [ + x_emb.unsqueeze(0).repeat(h, 1, 1), + y_emb.unsqueeze(1).repeat(1, w, 1), + ], + dim=-1, + ) + .permute(2, 0, 1) + .unsqueeze(0) + .repeat(x.shape[0], 1, 1, 1) + ) + + def get_embed(self, coord: torch.Tensor, embed: nn.Embedding) -> torch.Tensor: + """Get the embedding for the given coordinates. + + Args: + coord (torch.Tensor): The coordinates. + embed (nn.Embedding): The embedding layer. + + Returns: + torch.Tensor: The embedding for the coordinates. + """ + floor_coord = coord.floor() + delta = (coord - floor_coord).unsqueeze(-1) + floor_coord = floor_coord.long() + ceil_coord = (floor_coord + 1).clamp(max=49) + return embed(floor_coord) * (1 - delta) + embed(ceil_coord) * delta + + +def build_position_encoding( + hidden_dim: int, + position_embedding: str | PositionEmbeddingSine | PositionEmbeddingLearned, +) -> PositionEmbeddingSine | PositionEmbeddingLearned: + """Build the position encoding module. + + Args: + hidden_dim (int): The hidden dimension. + position_embedding (Union[str, PositionEmbeddingSine, PositionEmbeddingLearned]): The position embedding type. + + Returns: + Union[PositionEmbeddingSine, PositionEmbeddingLearned]: The position encoding module. 
+ """ + n_steps = hidden_dim // 2 + if position_embedding in ("v2", "sine"): + position_embedding = PositionEmbeddingSine(n_steps, normalize=True) + elif position_embedding in ("v3", "learned"): + position_embedding = PositionEmbeddingLearned(n_steps) + else: + msg = f"not supported {position_embedding}" + raise ValueError(msg) + + return position_embedding + + +class BackboneBase(nn.Module): + """BackboneBase module.""" + + def __init__(self, backbone: nn.Module, train_backbone: bool, return_interm_layers: bool): + """Initializes BackboneBase module.""" + super().__init__() + for name, parameter in backbone.named_parameters(): + if not train_backbone or "layer2" not in name and "layer3" not in name and "layer4" not in name: + parameter.requires_grad_(False) + if return_interm_layers: + return_layers = {"layer2": "0", "layer3": "1", "layer4": "2"} + self.strides = [8, 16, 32] + self.num_channels = [512, 1024, 2048] + else: + return_layers = {"layer4": "0"} + self.strides = [32] + self.num_channels = [2048] + self.body = IntermediateLayerGetter(backbone, return_layers=return_layers) + + def forward(self, images: torch.Tensor) -> dict[str, NestedTensor]: + """Forward pass of the BackboneBase module. + + Args: + images (torch.Tensor): Input images. + + Returns: + dict[str, NestedTensor]: Output tensors. + """ + xs = self.body(images) + out = {} + for name, x in xs.items(): + m = torch.zeros(x.shape[0], x.shape[2], x.shape[3]).to(torch.bool).to(x.device) + out[name] = NestedTensor(x, m) + return out + + +class Backbone(BackboneBase): + """ResNet backbone with frozen BatchNorm.""" + + def __init__(self, name: str, train_backbone: bool, return_interm_layers: bool, dilation: bool, **kwargs): + """Initializes Backbone module.""" + norm_layer = FrozenBatchNorm2d + backbone = getattr(torchvision.models, name)( + replace_stride_with_dilation=[False, False, dilation], + pretrained=True, + norm_layer=norm_layer, + ) + super().__init__(backbone, train_backbone, return_interm_layers) + if dilation: + self.strides[-1] = self.strides[-1] // 2 + + +class Joiner(nn.Sequential): + """Joiner module.""" + + def __init__( + self, + backbone: nn.Module, + position_embedding: PositionEmbeddingSine | PositionEmbeddingLearned, + ) -> None: + """Initialize the Joiner module. + + Args: + backbone (nn.Module): The backbone module. + position_embedding (Union[PositionEmbeddingSine, PositionEmbeddingLearned]): The position embedding module. + """ + super().__init__(backbone, position_embedding) + self.strides = backbone.strides + self.num_channels = backbone.num_channels + + def forward(self, images: torch.Tensor) -> tuple[list[NestedTensor], list[torch.Tensor]]: + """Forward pass of the Joiner module. + + Args: + images (torch.Tensor): Input images. + + Returns: + tuple[List[NestedTensor], List[torch.Tensor]]: Output tensors and position embeddings. 
+ """ + out: list[NestedTensor] = [x for _, x in sorted(self[0](images).items())] + return out, [self[1](x).to(x.tensors.dtype) for x in out] + + +class BackboneBuilder: + """DepthAwareTransformerBuilder.""" + + CFG: ClassVar[dict[str, Any]] = { + "monodetr_50": { + "name": "resnet50", + "train_backbone": True, + "dilation": False, + "return_interm_layers": True, + "positional_encoding": { + "hidden_dim": 256, + "position_embedding": "sine", + }, + }, + } + + def __new__(cls, model_name: str) -> Joiner: + """Constructor for Backbone MonoDetr.""" + # TODO (Kirill): change backbone to already implemented in OTX + backbone = Backbone(**cls.CFG[model_name]) + position_embedding = build_position_encoding(**cls.CFG[model_name]["positional_encoding"]) + return Joiner(backbone, position_embedding) diff --git a/src/otx/algo/object_detection_3d/detectors/__init__.py b/src/otx/algo/object_detection_3d/detectors/__init__.py new file mode 100644 index 00000000000..9cbb7eee8fc --- /dev/null +++ b/src/otx/algo/object_detection_3d/detectors/__init__.py @@ -0,0 +1,4 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""Core torch detectors modules for 3d object detection.""" diff --git a/src/otx/algo/object_detection_3d/detectors/monodetr.py b/src/otx/algo/object_detection_3d/detectors/monodetr.py new file mode 100644 index 00000000000..b94c2dd2b58 --- /dev/null +++ b/src/otx/algo/object_detection_3d/detectors/monodetr.py @@ -0,0 +1,313 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""MonoDetr core Pytorch detector.""" +from __future__ import annotations + +import math +from typing import Callable + +import torch +from torch import Tensor, nn +from torch.nn import functional + +from otx.algo.common.utils.utils import inverse_sigmoid +from otx.algo.detection.heads.rtdetr_decoder import MLP +from otx.algo.object_detection_3d.utils.utils import NestedTensor, get_clones + + +# TODO (Kirill): make MonoDETR as a more general class +class MonoDETR(nn.Module): + """This is the MonoDETR module that performs monocualr 3D object detection.""" + + def __init__( + self, + backbone: nn.Module, + depthaware_transformer: nn.Module, + depth_predictor: nn.Module, + criterion: nn.Module, + num_classes: int, + num_queries: int, + num_feature_levels: int, + aux_loss: bool = True, + with_box_refine: bool = False, + init_box: bool = False, + group_num: int = 11, + activation: Callable[..., nn.Module] = nn.ReLU, + ): + """Initializes the model. + + Args: + backbone (nn.Module): torch module of the backbone to be used. See backbone.py + depthaware_transformer (nn.Module): depth-aware transformer architecture. See depth_aware_transformer.py + depth_predictor (nn.Module): depth predictor module + criterion (nn.Module): loss criterion module + num_classes (int): number of object classes + num_queries (int): number of object queries, ie detection slot. This is the maximal number of objects + DETR can detect in a single image. For KITTI, we recommend 50 queries. + num_feature_levels (int): number of feature levels + aux_loss (bool): True if auxiliary decoding losses (loss at each decoder layer) are to be used. 
+ with_box_refine (bool): iterative bounding box refinement + init_box (bool): True if the bounding box embedding layers should be initialized to zero + group_num (int): number of groups for depth-aware bounding box embedding + activation (Callable[..., nn.Module]): activation function to be applied to the output of the transformer + """ + super().__init__() + + self.num_queries = num_queries + self.depthaware_transformer = depthaware_transformer + self.depth_predictor = depth_predictor + hidden_dim = depthaware_transformer.d_model + self.hidden_dim = hidden_dim + self.num_feature_levels = num_feature_levels + self.criterion = criterion + self.label_enc = nn.Embedding(num_classes + 1, hidden_dim - 1) # # for indicator + # prediction heads + self.class_embed = nn.Linear(hidden_dim, num_classes) + prior_prob = 0.01 + bias_value = -math.log((1 - prior_prob) / prior_prob) + self.class_embed.bias.data = torch.ones(num_classes) * bias_value + + self.bbox_embed = MLP(hidden_dim, hidden_dim, 6, 3, activation=activation) + self.dim_embed_3d = MLP(hidden_dim, hidden_dim, 3, 2, activation=activation) + self.angle_embed = MLP(hidden_dim, hidden_dim, 24, 2, activation=activation) + self.depth_embed = MLP(hidden_dim, hidden_dim, 2, 2, activation=activation) # depth and deviation + + if init_box: + nn.init.constant_(self.bbox_embed.layers[-1].weight.data, 0) + nn.init.constant_(self.bbox_embed.layers[-1].bias.data, 0) + + self.query_embed = nn.Embedding(num_queries * group_num, hidden_dim * 2) + + if num_feature_levels > 1: + num_backbone_outs = len(backbone.strides) + input_proj_list = [] + for _ in range(num_backbone_outs): + in_channels = backbone.num_channels[_] + input_proj_list.append( + nn.Sequential( + nn.Conv2d(in_channels, hidden_dim, kernel_size=1), + nn.GroupNorm(32, hidden_dim), + ), + ) + for _ in range(num_feature_levels - num_backbone_outs): + input_proj_list.append( + nn.Sequential( + nn.Conv2d(in_channels, hidden_dim, kernel_size=3, stride=2, padding=1), + nn.GroupNorm(32, hidden_dim), + ), + ) + in_channels = hidden_dim + self.input_proj = nn.ModuleList(input_proj_list) + else: + self.input_proj = nn.ModuleList( + [ + nn.Sequential( + nn.Conv2d(backbone.num_channels[0], hidden_dim, kernel_size=1), + nn.GroupNorm(32, hidden_dim), + ), + ], + ) + + self.backbone = backbone + self.aux_loss = aux_loss + self.with_box_refine = with_box_refine + self.num_classes = num_classes + + for proj in self.input_proj: + nn.init.xavier_uniform_(proj[0].weight, gain=1) + nn.init.constant_(proj[0].bias, 0) + # if two-stage, the last class_embed and bbox_embed is for region proposal generation + num_pred = depthaware_transformer.decoder.num_layers + if with_box_refine: + self.class_embed = get_clones(self.class_embed, num_pred) + self.bbox_embed = get_clones(self.bbox_embed, num_pred) + nn.init.constant_(self.bbox_embed[0].layers[-1].bias.data[2:], -2.0) + # implementation for iterative bounding box refinement + self.depthaware_transformer.decoder.bbox_embed = self.bbox_embed + self.dim_embed_3d = get_clones(self.dim_embed_3d, num_pred) + self.depthaware_transformer.decoder.dim_embed = self.dim_embed_3d + self.angle_embed = get_clones(self.angle_embed, num_pred) + self.depth_embed = get_clones(self.depth_embed, num_pred) + else: + nn.init.constant_(self.bbox_embed.layers[-1].bias.data[2:], -2.0) + self.class_embed = nn.ModuleList([self.class_embed for _ in range(num_pred)]) + self.bbox_embed = nn.ModuleList([self.bbox_embed for _ in range(num_pred)]) + self.dim_embed_3d = nn.ModuleList([self.dim_embed_3d 
for _ in range(num_pred)]) + self.angle_embed = nn.ModuleList([self.angle_embed for _ in range(num_pred)]) + self.depth_embed = nn.ModuleList([self.depth_embed for _ in range(num_pred)]) + self.depthaware_transformer.decoder.bbox_embed = None + + def forward( + self, + images: Tensor, + calibs: Tensor, + img_sizes: Tensor, + targets: list[dict[str, Tensor]] | None = None, + mode: str = "predict", + ) -> dict[str, Tensor]: + """Forward method of the MonoDETR model. + + Args: + images (list[Tensor]): images for each sample + calibs (Tensor): camera matrices for each sample + img_sizes (Tensor): image sizes for each sample + targets (list[dict[Tensor]): ground truth boxes and labels for each + sample + mode (str): The mode of operation. Defaults to "predict". + """ + features, pos = self.backbone(images) + + srcs = [] + masks = [] + for i, feat in enumerate(features): + src, mask = feat.decompose() + srcs.append(self.input_proj[i](src)) + masks.append(mask) + + if self.num_feature_levels > len(srcs): + _len_srcs = len(srcs) + for i in range(_len_srcs, self.num_feature_levels): + src = self.input_proj[i](features[-1].tensors) if i == _len_srcs else self.input_proj[i](srcs[-1]) + m = torch.zeros(src.shape[0], src.shape[2], src.shape[3]).to(torch.bool).to(src.device) + mask = functional.interpolate(m[None].float(), size=src.shape[-2:]).to(torch.bool)[0] + pos_l = self.backbone[1](NestedTensor(src, mask)).to(src.dtype) + srcs.append(src) + masks.append(mask) + pos.append(pos_l) + + query_embeds = self.query_embed.weight if self.training else self.query_embed.weight[: self.num_queries] + + pred_depth_map_logits, depth_pos_embed, weighted_depth, depth_pos_embed_ip = self.depth_predictor( + srcs, + masks[1], + pos[1], + ) + + ( + hs, + init_reference, + inter_references, + inter_references_dim, + enc_outputs_class, + enc_outputs_coord_unact, + ) = self.depthaware_transformer( + srcs, + masks, + pos, + query_embeds, + depth_pos_embed, + depth_pos_embed_ip, + ) + + outputs_coords = [] + outputs_classes = [] + outputs_3d_dims = [] + outputs_depths = [] + outputs_angles = [] + + for lvl in range(hs.shape[0]): + reference = init_reference if lvl == 0 else inter_references[lvl - 1] + reference = inverse_sigmoid(reference) + + tmp = self.bbox_embed[lvl](hs[lvl]) + if reference.shape[-1] == 6: + tmp += reference + else: + tmp[..., :2] += reference + + # 3d center + 2d box + outputs_coord = tmp.sigmoid() + outputs_coords.append(outputs_coord) + + # classes + outputs_class = self.class_embed[lvl](hs[lvl]) + outputs_classes.append(outputs_class) + + # 3D sizes + size3d = inter_references_dim[lvl] + outputs_3d_dims.append(size3d) + + # depth_geo + box2d_height_norm = outputs_coord[:, :, 4] + outputs_coord[:, :, 5] + box2d_height = torch.clamp(box2d_height_norm * img_sizes[:, 1:2], min=1.0) + depth_geo = size3d[:, :, 0] / box2d_height * calibs[:, 0, 0].unsqueeze(1) + + # depth_reg + depth_reg = self.depth_embed[lvl](hs[lvl]) + + # depth_map + outputs_center3d = ((outputs_coord[..., :2] - 0.5) * 2).unsqueeze(2).detach() + depth_map = functional.grid_sample( + weighted_depth.unsqueeze(1), + outputs_center3d, + mode="bilinear", + align_corners=True, + ).squeeze(1) + + # depth average + sigma + depth_ave = torch.cat( + [ + ((1.0 / (depth_reg[:, :, 0:1].sigmoid() + 1e-6) - 1.0) + depth_geo.unsqueeze(-1) + depth_map) / 3, + depth_reg[:, :, 1:2], + ], + -1, + ) + outputs_depths.append(depth_ave) + + # angles + outputs_angle = self.angle_embed[lvl](hs[lvl]) + outputs_angles.append(outputs_angle) + + outputs_coord = 
torch.stack(outputs_coords) + outputs_class = torch.stack(outputs_classes) + outputs_3d_dim = torch.stack(outputs_3d_dims) + outputs_depth = torch.stack(outputs_depths) + outputs_angle = torch.stack(outputs_angles) + + out = {"scores": outputs_class[-1], "boxes_3d": outputs_coord[-1]} + out["size_3d"] = outputs_3d_dim[-1] + out["depth"] = outputs_depth[-1] + out["heading_angle"] = outputs_angle[-1] + if mode == "export": + out["scores"] = out["scores"].sigmoid() + return out + + out["pred_depth_map_logits"] = pred_depth_map_logits + + if self.aux_loss: + out["aux_outputs"] = self._set_aux_loss( + outputs_class, + outputs_coord, + outputs_3d_dim, + outputs_angle, + outputs_depth, + ) + + if mode == "loss": + return self.criterion(outputs=out, targets=targets) + + return out + + @torch.jit.unused + def _set_aux_loss( + self, + outputs_class: Tensor, + outputs_coord: Tensor, + outputs_3d_dim: Tensor, + outputs_angle: Tensor, + outputs_depth: Tensor, + ) -> list[dict[str, Tensor]]: + # this is a workaround to make torchscript happy, as torchscript + # doesn't support dictionary with non-homogeneous values, such + # as a dict having both a Tensor and a list. + return [ + {"scores": a, "boxes_3d": b, "size_3d": c, "heading_angle": d, "depth": e} + for a, b, c, d, e in zip( + outputs_class[:-1], + outputs_coord[:-1], + outputs_3d_dim[:-1], + outputs_angle[:-1], + outputs_depth[:-1], + ) + ] diff --git a/src/otx/algo/object_detection_3d/heads/__init__.py b/src/otx/algo/object_detection_3d/heads/__init__.py new file mode 100644 index 00000000000..72a504f2fbb --- /dev/null +++ b/src/otx/algo/object_detection_3d/heads/__init__.py @@ -0,0 +1,9 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""heads modules for 3d object detection.""" + +from .depth_predictor import DepthPredictor +from .depthaware_transformer import DepthAwareTransformerBuilder + +__all__ = ["DepthPredictor", "DepthAwareTransformerBuilder"] diff --git a/src/otx/algo/object_detection_3d/heads/depth_predictor.py b/src/otx/algo/object_detection_3d/heads/depth_predictor.py new file mode 100644 index 00000000000..4e5037c96d8 --- /dev/null +++ b/src/otx/algo/object_detection_3d/heads/depth_predictor.py @@ -0,0 +1,151 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""depth predictor transformer head for 3d object detection.""" + +from __future__ import annotations + +from typing import Callable + +import torch +from torch import nn +from torch.nn import functional + +from otx.algo.common.layers.transformer_layers import TransformerEncoder, TransformerEncoderLayer + + +class DepthPredictor(nn.Module): + """Depth predictor and depth encoder.""" + + def __init__( + self, + depth_num_bins: int, + depth_min: float, + depth_max: float, + hidden_dim: int, + activation: Callable[..., nn.Module] = nn.ReLU, + ) -> None: + """Initialize depth predictor and depth encoder. + + Args: + depth_num_bins (int): The number of depth bins. + depth_min (float): The minimum depth value. + depth_max (float): The maximum depth value. + hidden_dim (int): The dimension of the hidden layer. 
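+
+        Example (illustrative; hidden_dim=256 is assumed to match the 256-d depth positional embedding):
+            >>> predictor = DepthPredictor(depth_num_bins=80, depth_min=1e-3, depth_max=60.0, hidden_dim=256)
+            >>> predictor.depth_bin_values.shape  # depth_num_bins + 1 bin centers
+            torch.Size([81])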
+ """ + super().__init__() + self.depth_max = depth_max + + bin_size = 2 * (depth_max - depth_min) / (depth_num_bins * (1 + depth_num_bins)) + bin_indice = torch.linspace(0, depth_num_bins - 1, depth_num_bins) + bin_value = (bin_indice + 0.5).pow(2) * bin_size / 2 - bin_size / 8 + depth_min + bin_value = torch.cat([bin_value, torch.tensor([depth_max])], dim=0) + self.depth_bin_values = nn.Parameter(bin_value, requires_grad=False) + + # Create modules + d_model = hidden_dim + self.downsample = nn.Sequential( + nn.Conv2d(d_model, d_model, kernel_size=(3, 3), stride=(2, 2), padding=1), + nn.GroupNorm(32, d_model), + ) + self.proj = nn.Sequential(nn.Conv2d(d_model, d_model, kernel_size=(1, 1)), nn.GroupNorm(32, d_model)) + self.upsample = nn.Sequential(nn.Conv2d(d_model, d_model, kernel_size=(1, 1)), nn.GroupNorm(32, d_model)) + + self.depth_head = nn.Sequential( + nn.Conv2d(d_model, d_model, kernel_size=(3, 3), padding=1), + nn.GroupNorm(32, num_channels=d_model), + activation(), + nn.Conv2d(d_model, d_model, kernel_size=(3, 3), padding=1), + nn.GroupNorm(32, num_channels=d_model), + activation(), + ) + + self.depth_classifier = nn.Conv2d(d_model, depth_num_bins + 1, kernel_size=(1, 1)) + + depth_encoder_layer = TransformerEncoderLayer( + d_model, + nhead=8, + dim_feedforward=256, + dropout=0.1, + activation=activation, + normalize_before=False, + batch_first=False, + key_mask=True, + ) + + self.depth_encoder = TransformerEncoder(depth_encoder_layer, 1) + + self.depth_pos_embed = nn.Embedding(int(self.depth_max) + 1, 256) + + def forward( + self, + feature: list[torch.Tensor], + mask: torch.Tensor, + pos: torch.Tensor, + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """Forward pass of the DepthPredictor. + + Args: + feature (List[torch.Tensor]): The list of input feature tensors. + mask (torch.Tensor): The mask tensor. + pos (torch.Tensor): The positional tensor. + + Returns: + Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: The output tensors. + - depth_logits: The depth logits tensor. + - depth_embed: The depth embedding tensor. + - weighted_depth: The weighted depth tensor. + - depth_pos_embed_ip: The interpolated depth positional embedding tensor. + """ + # foreground depth map + src_16 = self.proj(feature[1]) + src_32 = self.upsample(functional.interpolate(feature[2], size=src_16.shape[-2:], mode="bilinear")) + src_8 = self.downsample(feature[0]) + src = (src_8 + src_16 + src_32) / 3 + + src = self.depth_head(src) + depth_logits = self.depth_classifier(src) + + depth_probs = functional.softmax(depth_logits, dim=1) + weighted_depth = (depth_probs * self.depth_bin_values.reshape(1, -1, 1, 1)).sum(dim=1) + # depth embeddings with depth positional encodings + b, c, h, w = src.shape + src = src.flatten(2).permute(2, 0, 1) + mask = mask.flatten(1) + pos = pos.flatten(2).permute(2, 0, 1) + + depth_embed = self.depth_encoder(src, mask, pos) + depth_embed = depth_embed.permute(1, 2, 0).reshape(b, c, h, w) + depth_pos_embed_ip = self.interpolate_depth_embed(weighted_depth) + depth_embed = depth_embed + depth_pos_embed_ip + + return depth_logits, depth_embed, weighted_depth, depth_pos_embed_ip + + def interpolate_depth_embed(self, depth: torch.Tensor) -> torch.Tensor: + """Interpolate depth embeddings based on depth values. + + Args: + depth (torch.Tensor): The depth tensor. + + Returns: + torch.Tensor: The interpolated depth embeddings. 
+ """ + depth = depth.clamp(min=0, max=self.depth_max) + pos = self.interpolate_1d(depth, self.depth_pos_embed) + return pos.permute(0, 3, 1, 2) + + def interpolate_1d(self, coord: torch.Tensor, embed: nn.Embedding) -> torch.Tensor: + """Interpolate 1D embeddings based on coordinates. + + Args: + coord (torch.Tensor): The coordinate tensor. + embed (nn.Embedding): The embedding module. + + Returns: + torch.Tensor: The interpolated embeddings. + """ + floor_coord = coord.floor() + delta = (coord - floor_coord).unsqueeze(-1) + floor_coord = floor_coord.long() + ceil_coord = (floor_coord + 1).clamp(max=embed.num_embeddings - 1) + return embed(floor_coord) * (1 - delta) + embed(ceil_coord) * delta diff --git a/src/otx/algo/object_detection_3d/heads/depthaware_transformer.py b/src/otx/algo/object_detection_3d/heads/depthaware_transformer.py new file mode 100644 index 00000000000..ecfe4a5008c --- /dev/null +++ b/src/otx/algo/object_detection_3d/heads/depthaware_transformer.py @@ -0,0 +1,856 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""depth aware transformer head for 3d object detection.""" +from __future__ import annotations + +import math +from typing import Any, Callable, ClassVar + +import torch +from torch import Tensor, nn +from torch.nn.init import constant_, normal_, xavier_uniform_ + +from otx.algo.detection.heads.rtdetr_decoder import MLP, MSDeformableAttention +from otx.algo.detection.utils.utils import inverse_sigmoid +from otx.algo.object_detection_3d.utils.utils import get_clones + + +def gen_sineembed_for_position(pos_tensor: Tensor) -> Tensor: + """Generate sine embeddings for position tensor. + + Args: + pos_tensor (Tensor): Position tensor of shape (n_query, bs, num_dims). + + Returns: + Tensor: Sine embeddings for position tensor of shape (n_query, bs, embedding_dim). 
+ """ + scale = 2 * math.pi + dim_t = torch.arange(128, dtype=torch.float32, device=pos_tensor.device) + dim_t = 10000 ** (2 * (dim_t // 2) / 128) + x_embed = pos_tensor[:, :, 0] * scale + y_embed = pos_tensor[:, :, 1] * scale + pos_x = x_embed[:, :, None] / dim_t + pos_y = y_embed[:, :, None] / dim_t + pos_x = torch.stack((pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), dim=3).flatten(2) + pos_y = torch.stack((pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()), dim=3).flatten(2) + if pos_tensor.size(-1) == 2: + pos = torch.cat((pos_y, pos_x), dim=2) + elif pos_tensor.size(-1) == 4: + w_embed = pos_tensor[:, :, 2] * scale + pos_w = w_embed[:, :, None] / dim_t + pos_w = torch.stack((pos_w[:, :, 0::2].sin(), pos_w[:, :, 1::2].cos()), dim=3).flatten(2) + + h_embed = pos_tensor[:, :, 3] * scale + pos_h = h_embed[:, :, None] / dim_t + pos_h = torch.stack((pos_h[:, :, 0::2].sin(), pos_h[:, :, 1::2].cos()), dim=3).flatten(2) + + pos = torch.cat((pos_y, pos_x, pos_w, pos_h), dim=2) + elif pos_tensor.size(-1) == 6: + for i in range(2, 6): # Compute sine embeds for l, r, t, b + embed = pos_tensor[:, :, i] * scale + pos_embed = embed[:, :, None] / dim_t + pos_embed = torch.stack((pos_embed[:, :, 0::2].sin(), pos_embed[:, :, 1::2].cos()), dim=3).flatten(2) + pos = pos_embed if i == 2 else torch.cat((pos, pos_embed), dim=2) + pos = torch.cat((pos_y, pos_x, pos), dim=2) + else: + msg = f"Unknown pos_tensor shape(-1):{pos_tensor.size(-1)}" + raise ValueError(msg) + return pos + + +class DepthAwareTransformer(nn.Module): + """DepthAwareTransformer module.""" + + def __init__( + self, + d_model: int = 256, + nhead: int = 8, + num_encoder_layers: int = 6, + num_decoder_layers: int = 6, + dim_feedforward: int = 1024, + dropout: float = 0.1, + activation: Callable[..., nn.Module] = nn.ReLU, + return_intermediate_dec: bool = False, + num_feature_levels: int = 4, + dec_n_points: int = 4, + enc_n_points: int = 4, + group_num: int = 11, + ) -> None: + """Initialize the DepthAwareTransformer module. + + Args: + d_model (int): The dimension of the input and output feature vectors. + nhead (int): The number of attention heads. + num_encoder_layers (int): The number of encoder layers. + num_decoder_layers (int): The number of decoder layers. + dim_feedforward (int): The dimension of the feedforward network. + dropout (float): The dropout rate. + activation (Callable[..., nn.Module]): The activation function. + return_intermediate_dec (bool): Whether to return intermediate decoder outputs. + num_feature_levels (int): The number of feature levels. + dec_n_points (int): The number of points for the decoder attention. + enc_n_points (int): The number of points for the encoder attention. + group_num (int): The number of groups for the two-stage training. 
+ """ + super().__init__() + + self.d_model = d_model + self.nhead = nhead + self.group_num = group_num + + encoder_layer = VisualEncoderLayer( + d_model, + dim_feedforward, + dropout, + activation, + num_feature_levels, + nhead, + enc_n_points, + ) + self.encoder = VisualEncoder(encoder_layer, num_encoder_layers) + + decoder_layer = DepthAwareDecoderLayer( + d_model, + dim_feedforward, + dropout, + activation, + num_feature_levels, + nhead, + dec_n_points, + group_num=group_num, + ) + self.decoder = DepthAwareDecoder( + decoder_layer, + num_decoder_layers, + return_intermediate_dec, + d_model, + activation, + ) + + self.level_embed = nn.Parameter(torch.Tensor(num_feature_levels, d_model)) + self.reference_points = nn.Linear(d_model, 2) + + self._reset_parameters() + + def _reset_parameters(self) -> None: + """Reset parameters of the model.""" + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + for m in self.modules(): + if isinstance(m, MSDeformableAttention): + m._reset_parameters() # noqa: SLF001 + xavier_uniform_(self.reference_points.weight.data, gain=1.0) + constant_(self.reference_points.bias.data, 0.0) + normal_(self.level_embed) + + def get_proposal_pos_embed(self, proposals: Tensor) -> Tensor: + """Generate position embeddings for proposal tensor. + + Args: + proposals (Tensor): Proposal tensor of shape (N, L, 6). + + Returns: + Tensor: Position embeddings for proposal tensor of shape (N, L, embedding_dim). + """ + num_pos_feats = 128 + temperature = 10000 + scale = 2 * math.pi + + dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=proposals.device) + dim_t = temperature ** (2 * (dim_t // 2) / num_pos_feats) + # N, L, 6 + proposals = proposals.sigmoid() * scale + # N, L, 6, 128 + pos = proposals[:, :, :, None] / dim_t + # N, L, 6, 64, 2 + return torch.stack((pos[:, :, :, 0::2].sin(), pos[:, :, :, 1::2].cos()), dim=4).flatten(2) + + def gen_encoder_output_proposals( + self, + memory: Tensor, + memory_padding_mask: Tensor, + spatial_shapes: list[tuple[int, int]], + ) -> tuple[Tensor, Tensor]: + """Generate encoder output and proposals. + + Args: + memory (Tensor): Memory tensor of shape (N, S, C). + memory_padding_mask (Tensor): Memory padding mask tensor of shape (N, S). + spatial_shapes (List[Tuple[int, int]]): List of spatial shapes. + + Returns: + Tuple[Tensor, Tensor]: Encoder output tensor of shape (N, S, C) and proposals tensor of shape (N, L, 6). 
+ """ + n_, _, _ = memory.shape + proposals = [] + _cur = 0 + for lvl, (h_, w_) in enumerate(spatial_shapes): + mask_flatten_ = memory_padding_mask[:, _cur : (_cur + h_ * w_)].view(n_, h_, w_, 1) + valid_h = torch.sum(~mask_flatten_[:, :, 0, 0], 1) + valid_w = torch.sum(~mask_flatten_[:, 0, :, 0], 1) + + grid_y, grid_x = torch.meshgrid( + torch.linspace(0, h_ - 1, h_, dtype=torch.float32, device=memory.device), + torch.linspace(0, w_ - 1, w_, dtype=torch.float32, device=memory.device), + ) + grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1) + + scale = torch.cat([valid_w.unsqueeze(-1), valid_h.unsqueeze(-1)], 1).view(n_, 1, 1, 2) + grid = (grid.unsqueeze(0).expand(n_, -1, -1, -1) + 0.5) / scale + + lr = torch.ones_like(grid) * 0.05 * (2.0**lvl) + tb = torch.ones_like(grid) * 0.05 * (2.0**lvl) + wh = torch.cat((lr, tb), -1) + + proposal = torch.cat((grid, wh), -1).view(n_, -1, 6) + proposals.append(proposal) + _cur += h_ * w_ + output_proposals = torch.cat(proposals, 1) + output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True) + output_proposals = torch.log(output_proposals / (1 - output_proposals)) + output_proposals = output_proposals.masked_fill(memory_padding_mask.unsqueeze(-1), float("inf")) + output_proposals = output_proposals.masked_fill(~output_proposals_valid, float("inf")) + + output_memory = memory + output_memory = output_memory.masked_fill(memory_padding_mask.unsqueeze(-1), float(0)) + output_memory = output_memory.masked_fill(~output_proposals_valid, float(0)) + output_memory = self.enc_output_norm(self.enc_output(output_memory)) + return output_memory, output_proposals + + def get_valid_ratio(self, mask: Tensor) -> Tensor: + """Calculate the valid ratio of the mask. + + Args: + mask (Tensor): The mask tensor. + + Returns: + Tensor: The valid ratio tensor. + """ + _, h, w = mask.shape + valid_h = torch.sum(~mask[:, :, 0], 1) + valid_w = torch.sum(~mask[:, 0, :], 1) + valid_ratio_h = valid_h.float() / h + valid_ratio_w = valid_w.float() / w + return torch.stack([valid_ratio_w, valid_ratio_h], -1) + + def forward( + self, + srcs: list[Tensor], + masks: list[Tensor], + pos_embeds: list[Tensor], + query_embed: Tensor, + depth_pos_embed: Tensor, + depth_pos_embed_ip: Tensor, + attn_mask: Tensor | None = None, + ) -> tuple[Tensor, Tensor, Tensor, Tensor, Tensor | None, Tensor | None]: + """Forward pass of the DepthAwareTransformer module. + + Args: + srcs (List[Tensor]): List of source tensors. + masks (List[Tensor]): List of mask tensors. + pos_embeds (List[Tensor]): List of position embedding tensors. + query_embed (Tensor | None): Query embedding tensor. Defaults to None. + depth_pos_embed (Tensor | None): Depth position embedding tensor. Defaults to None. + depth_pos_embed_ip (Tensor | None): Depth position embedding IP tensor. Defaults to None. + attn_mask (Tensor | None): Attention mask tensor. Defaults to None. + + Returns: + Tuple[Tensor, Tensor, Tensor, Tensor, Tensor | None, Tensor | None]: Tuple containing the output tensors. 
+ """ + # prepare input for encoder + src_flatten = [] + mask_flatten = [] + lvl_pos_embed_flatten = [] + spatial_shapes_list = [] + for lvl, (src, mask, pos_embed) in enumerate(zip(srcs, masks, pos_embeds)): + bs, c, h, w = src.shape + spatial_shape = (h, w) + spatial_shapes_list.append(spatial_shape) + src_ = src.flatten(2).transpose(1, 2) + pos_embed_ = pos_embed.flatten(2).transpose(1, 2) + lvl_pos_embed = pos_embed_ + self.level_embed[lvl].view(1, 1, -1) + + mask_ = mask.flatten(1) + lvl_pos_embed_flatten.append(lvl_pos_embed) + src_flatten.append(src_) + mask_flatten.append(mask_) + + src_flatten = torch.cat(src_flatten, 1) + lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1) + mask_flatten = torch.cat(mask_flatten, 1) + spatial_shapes = torch.as_tensor(spatial_shapes_list, dtype=torch.long, device=srcs[0].device) + level_start_index = torch.cat((spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1])) + valid_ratios = torch.stack([self.get_valid_ratio(m) for m in masks], 1) + + # encoder + memory = self.encoder( + src_flatten, + spatial_shapes, + level_start_index, + valid_ratios, + lvl_pos_embed_flatten, + mask_flatten, + ) + # enc_intermediate_output, enc_intermediate_refpoints = None + # prepare input for decoder + bs, _, c = memory.shape + query_embed, tgt = torch.split(query_embed, c, dim=1) + query_embed = query_embed.unsqueeze(0).expand(bs, -1, -1) + tgt = tgt.unsqueeze(0).expand(bs, -1, -1) + reference_points = self.reference_points(query_embed).sigmoid() + init_reference_out = reference_points + + depth_pos_embed = depth_pos_embed.flatten(2).permute(2, 0, 1) + depth_pos_embed_ip = depth_pos_embed_ip.flatten(2).permute(2, 0, 1) + mask_depth = masks[1].flatten(1) + + # decoder + # ipdb.set_trace() + hs, inter_references, inter_references_dim = self.decoder( + tgt, # .transpose(1,0), for DINO + reference_points, + memory, + spatial_shapes, + level_start_index, + valid_ratios, + query_embed, # ,INFo + mask_flatten, + depth_pos_embed, + mask_depth, + bs=bs, + depth_pos_embed_ip=depth_pos_embed_ip, + pos_embeds=pos_embeds, + attn_mask=attn_mask, + ) + + inter_references_out = inter_references + inter_references_out_dim = inter_references_dim + return hs, init_reference_out, inter_references_out, inter_references_out_dim, None, None + + +class VisualEncoderLayer(nn.Module): + """VisualEncoderLayer module.""" + + def __init__( + self, + d_model: int = 256, + d_ffn: int = 1024, + dropout: float = 0.1, + activation: Callable[..., nn.Module] = nn.ReLU, + n_levels: int = 4, + n_heads: int = 8, + n_points: int = 4, + ) -> None: + """Initialize the DepthAwareDecoderLayer. + + Args: + d_model (int): The input and output dimension of the layer. Defaults to 256. + d_ffn (int): The hidden dimension of the feed-forward network. Defaults to 1024. + dropout (float): The dropout rate. Defaults to 0.1. + activation (Callable[..., nn.Module]): The activation function. Defaults to nn.ReLU. + n_levels (int): The number of feature levels. Defaults to 4. + n_heads (int): The number of attention heads. Defaults to 8. + n_points (int): The number of sampling points for the MSDeformableAttention. Defaults to 4. 
+ """ + super().__init__() + + # self attention + self.self_attn = MSDeformableAttention(d_model, n_heads, n_levels, n_points) + self.dropout1 = nn.Dropout(dropout) + self.norm1 = nn.LayerNorm(d_model) + + # ffn + self.linear1 = nn.Linear(d_model, d_ffn) + self.activation = activation() + self.dropout2 = nn.Dropout(dropout) + self.linear2 = nn.Linear(d_ffn, d_model) + self.dropout3 = nn.Dropout(dropout) + self.norm2 = nn.LayerNorm(d_model) + + @staticmethod + def with_pos_embed(tensor: Tensor, pos: Tensor | None) -> Tensor: + """Add position embedding to the input tensor. + + Args: + tensor (Tensor): The input tensor. + pos (Tensor | None): The position embedding tensor. Defaults to None. + + Returns: + Tensor: The tensor with position embedding added. + """ + return tensor if pos is None else tensor + pos + + def forward_ffn(self, src: Tensor) -> Tensor: + """Forward pass of the ffn. + + Args: + src (Tensor): The input tensor. + + Returns: + Tensor: The output tensor. + """ + src2 = self.linear2(self.dropout2(self.activation(self.linear1(src)))) + src = src + self.dropout3(src2) + return self.norm2(src) + + def forward( + self, + src: Tensor, + pos: Tensor, + reference_points: Tensor, + spatial_shapes: list[tuple[int, int]], + level_start_index: Tensor, + padding_mask: Tensor | None = None, + ) -> Tensor: + """Forward pass of the VisualEncoderLayer. + + Args: + src (Tensor): The input tensor. + pos (Tensor): The position embedding tensor. + reference_points (Tensor): The reference points tensor. + spatial_shapes (List[Tuple[int, int]]): The list of spatial shapes. + level_start_index (Tensor): The level start index tensor. + padding_mask (Optional[Tensor]): The padding mask tensor. Defaults to None. + + Returns: + Tensor: The output tensor. + """ + # self attention + src2 = self.self_attn(self.with_pos_embed(src, pos), reference_points, src, spatial_shapes, padding_mask) + src = src + self.dropout1(src2) + src = self.norm1(src) + + # ffn + return self.forward_ffn(src) + + +class VisualEncoder(nn.Module): + """VisualEncoder module.""" + + def __init__(self, encoder_layer: nn.Module, num_layers: int): + """Initialize the DepthAwareDecoder. + + Args: + encoder_layer (nn.Module): The encoder layer module. + num_layers (int): The number of layers. + """ + super().__init__() + self.layers = get_clones(encoder_layer, num_layers) + self.num_layers = num_layers + + @staticmethod + def get_reference_points( + spatial_shapes: list[tuple[int, int]], + valid_ratios: Tensor, + device: torch.device, + ) -> Tensor: + """Generate reference points for each spatial level. + + Args: + spatial_shapes (List[Tuple[int, int]]): The list of spatial shapes. + valid_ratios (Tensor): The tensor of valid ratios. + device (torch.device): The device to use. + + Returns: + Tensor: The tensor of reference points. 
+ """ + reference_points_list = [] + for lvl, (h_, w_) in enumerate(spatial_shapes): + ref_y, ref_x = torch.meshgrid( + torch.linspace(0.5, h_ - 0.5, h_, dtype=torch.float32, device=device), + torch.linspace(0.5, w_ - 0.5, w_, dtype=torch.float32, device=device), + ) + ref_y = ref_y.reshape(-1)[None] / (valid_ratios[:, None, lvl, 1] * h_) + ref_x = ref_x.reshape(-1)[None] / (valid_ratios[:, None, lvl, 0] * w_) + ref = torch.stack((ref_x, ref_y), -1) + reference_points_list.append(ref) + reference_points = torch.cat(reference_points_list, 1) + return reference_points[:, :, None] * valid_ratios[:, None] + + def forward( + self, + src: Tensor, + spatial_shapes: list[tuple[int, int]], + level_start_index: Tensor, + valid_ratios: Tensor, + pos: Tensor | None = None, + padding_mask: Tensor | None = None, + ref_token_index: int | None = None, + ref_token_coord: Tensor | None = None, + ) -> Tensor: + """Forward pass of the VisualEncoder module. + + Args: + src (Tensor): The input tensor. + spatial_shapes (List[Tuple[int, int]]): The list of spatial shapes. + level_start_index (Tensor): The level start index tensor. + valid_ratios (Tensor): The tensor of valid ratios. + pos (Tensor | None): The position embedding tensor. Defaults to None. + padding_mask (Tensor | None): The padding mask tensor. Defaults to None. + ref_token_index (int | None): The reference token index. Defaults to None. + ref_token_coord (Tensor | None): The reference token coordinates. Defaults to None. + + Returns: + Tensor: The output tensor. + """ + output = src + reference_points = self.get_reference_points(spatial_shapes, valid_ratios, device=src.device) + for _, layer in enumerate(self.layers): + output = layer(output, pos, reference_points, spatial_shapes, level_start_index, padding_mask) + + return output + + +class DepthAwareDecoderLayer(nn.Module): + """DepthAwareDecoderLayer module.""" + + def __init__( + self, + d_model: int = 256, + d_ffn: int = 1024, + dropout: float = 0.1, + activation: Callable[..., nn.Module] = nn.ReLU, + n_levels: int = 4, + n_heads: int = 8, + n_points: int = 4, + group_num: int = 1, + ) -> None: + """Initialize the DepthAwareDecoderLayer. + + Args: + d_model (int): The input and output dimension of the layer. Defaults to 256. + d_ffn (int): The hidden dimension of the feed-forward network. Defaults to 1024. + dropout (float): The dropout rate. Defaults to 0.1. + activation (Callable[..., nn.Module]): The activation function. Defaults to nn.ReLU. + n_levels (int): The number of feature levels. Defaults to 4. + n_heads (int): The number of attention heads. Defaults to 8. + n_points (int): The number of sampling points for the MSDeformableAttention. Defaults to 4. + group_num (int): The number of groups for training. Defaults to 1. 
+ """ + super().__init__() + + # cross attention + self.cross_attn = MSDeformableAttention(d_model, n_heads, n_levels, n_points) + self.dropout1 = nn.Dropout(dropout) + self.norm1 = nn.LayerNorm(d_model) + + # depth cross attention + self.cross_attn_depth = nn.MultiheadAttention(d_model, n_heads, dropout=dropout) + self.dropout_depth = nn.Dropout(dropout) + self.norm_depth = nn.LayerNorm(d_model) + + # self attention + self.self_attn = nn.MultiheadAttention(d_model, n_heads, dropout=dropout) + self.dropout2 = nn.Dropout(dropout) + self.norm2 = nn.LayerNorm(d_model) + + # ffn + self.linear1 = nn.Linear(d_model, d_ffn) + self.activation = activation() + self.dropout3 = nn.Dropout(dropout) + self.linear2 = nn.Linear(d_ffn, d_model) + self.dropout4 = nn.Dropout(dropout) + self.norm3 = nn.LayerNorm(d_model) + + self.group_num = group_num + + # Decoder Self-Attention + self.sa_qcontent_proj = nn.Linear(d_model, d_model) + self.sa_qpos_proj = nn.Linear(d_model, d_model) + self.sa_kcontent_proj = nn.Linear(d_model, d_model) + self.sa_kpos_proj = nn.Linear(d_model, d_model) + self.sa_v_proj = nn.Linear(d_model, d_model) + self.nhead = n_heads + + @staticmethod + def with_pos_embed(tensor: Tensor, pos: Tensor | None) -> Tensor: + """Add position embedding to the input tensor. + + Args: + tensor (Tensor): The input tensor. + pos (Tensor | None): The position embedding tensor. Defaults to None. + + Returns: + Tensor: The tensor with position embedding added. + """ + return tensor if pos is None else tensor + pos + + def forward_ffn(self, tgt: Tensor) -> Tensor: + """Forward pass of the ffn. + + Args: + tgt (Tensor): The input tensor. + + Returns: + Tensor: The output tensor. + """ + tgt2 = self.linear2(self.dropout3(self.activation(self.linear1(tgt)))) + tgt = tgt + self.dropout4(tgt2) + return self.norm3(tgt) + + def forward( + self, + tgt: Tensor, + query_pos: Tensor, + reference_points: Tensor, + src: Tensor, + src_spatial_shapes: list[tuple[int, int]], + level_start_index: Tensor, + src_padding_mask: Tensor, + depth_pos_embed: Tensor, + mask_depth: Tensor, + bs: int, + query_sine_embed: Tensor | None = None, + is_first: bool | None = None, + depth_pos_embed_ip: Tensor | None = None, + pos_embeds: list[Tensor] | None = None, + self_attn_mask: Tensor | None = None, + query_pos_un: Tensor | None = None, + ) -> Tensor: + """Forward pass of the DepthAwareDecoder module. + + Args: + tgt (Tensor): The input tensor. + query_pos (Tensor): The query position tensor. + reference_points (Tensor): The reference points tensor. + src (Tensor): The source tensor. + src_spatial_shapes (List[Tuple[int, int]]): The list of spatial shapes. + level_start_index (Tensor): The level start index tensor. + src_padding_mask (Tensor): The source padding mask tensor. + depth_pos_embed (Tensor): The depth position embedding tensor. + mask_depth (Tensor): The depth mask tensor. + bs (int): The batch size. + query_sine_embed (Tensor | None): The query sine embedding tensor. Defaults to None. + is_first (bool | None): Whether it is the first iteration. Defaults to None. + depth_pos_embed_ip (Tensor | None): The depth position embedding tensor for the iterative process. + Defaults to None. + pos_embeds (List[Tensor] | None): The list of position embedding tensors. Defaults to None. + self_attn_mask (Tensor | None): The self-attention mask tensor. Defaults to None. + query_pos_un (Tensor | None): The unnormalized query position tensor. Defaults to None. + + Returns: + Tensor: The output tensor. 
+ """ + # depth cross attention + tgt2 = self.cross_attn_depth( + tgt.transpose(0, 1), + depth_pos_embed, + depth_pos_embed, + key_padding_mask=mask_depth, + )[0].transpose(0, 1) + + tgt = tgt + self.dropout_depth(tgt2) + tgt = self.norm_depth(tgt) + + # self attention + q = k = self.with_pos_embed(tgt, query_pos) + + q_content = self.sa_qcontent_proj(q) + q_pos = self.sa_qpos_proj(q) + k_content = self.sa_kcontent_proj(k) + k_pos = self.sa_kpos_proj(k) + v = self.sa_v_proj(tgt) + q = q_content + q_pos + k = k_content + k_pos + + q = q.transpose(0, 1) + k = k.transpose(0, 1) + v = tgt.transpose(0, 1) + num_queries = q.shape[0] + + if self.training: + num_noise = num_queries - self.group_num * 50 + num_queries = self.group_num * 50 + q_noise = q[:num_noise].repeat(1, self.group_num, 1) + k_noise = k[:num_noise].repeat(1, self.group_num, 1) + v_noise = v[:num_noise].repeat(1, self.group_num, 1) + q = q[num_noise:] + k = k[num_noise:] + v = v[num_noise:] + q = torch.cat(q.split(num_queries // self.group_num, dim=0), dim=1) + k = torch.cat(k.split(num_queries // self.group_num, dim=0), dim=1) + v = torch.cat(v.split(num_queries // self.group_num, dim=0), dim=1) + q = torch.cat([q_noise, q], dim=0) + k = torch.cat([k_noise, k], dim=0) + v = torch.cat([v_noise, v], dim=0) + + tgt2 = self.self_attn(q, k, v)[0] + tgt2 = torch.cat(tgt2.split(bs, dim=1), dim=0).transpose(0, 1) if self.training else tgt2.transpose(0, 1) + tgt = tgt + self.dropout2(tgt2) + tgt = self.norm2(tgt) + + tgt2 = self.cross_attn( + self.with_pos_embed(tgt, query_pos), + reference_points, + src, + src_spatial_shapes, + src_padding_mask, + ) + tgt = tgt + self.dropout1(tgt2) + tgt = self.norm1(tgt) + + return self.forward_ffn(tgt) + + +class DepthAwareDecoder(nn.Module): + """DepthAwareDecoder module.""" + + def __init__( + self, + decoder_layer: nn.Module, + num_layers: int, + return_intermediate: bool, + d_model: int, + activation: Callable[..., nn.Module] = nn.ReLU, + ) -> None: + """Initialize the DepthAwareDecoder. + + Args: + decoder_layer (nn.Module): The decoder layer module. + num_layers (int): The number of layers. + return_intermediate (bool, optional): Whether to return intermediate outputs. Defaults to False. + d_model (int | None, optional): The input and output dimension of the layer. Defaults to None. + """ + super().__init__() + self.layers = get_clones(decoder_layer, num_layers) + self.num_layers = num_layers + self.return_intermediate = return_intermediate + + self.bbox_embed = None + self.dim_embed = None + self.class_embed = None + + self.query_scale = MLP(d_model, d_model, d_model, 2, activation=activation) + self.ref_point_head = MLP(d_model, d_model, 2, 2, activation=activation) + + def forward( + self, + tgt: Tensor, + reference_points: Tensor, + src: Tensor, + src_spatial_shapes: list[tuple[int, int]], + src_level_start_index: Tensor, + src_valid_ratios: Tensor, + query_pos: Tensor | None = None, + src_padding_mask: Tensor | None = None, + depth_pos_embed: Tensor | None = None, + mask_depth: Tensor | None = None, + bs: int | None = None, + depth_pos_embed_ip: Tensor | None = None, + pos_embeds: list[Tensor] | None = None, + attn_mask: Tensor | None = None, + ) -> Tensor: + """Forward pass of the DepthAwareDecoder module. + + Args: + tgt (Tensor): The input tensor. + reference_points (Tensor): The reference points tensor. + src (Tensor): The source tensor. + src_spatial_shapes (List[Tuple[int, int]]): The list of spatial shapes. + src_level_start_index (Tensor): The level start index tensor. 
+ src_valid_ratios (Tensor): The tensor of valid ratios. + query_pos (Tensor | None): The query position tensor. Defaults to None. + src_padding_mask (Tensor | None): The source padding mask tensor. Defaults to None. + depth_pos_embed (Tensor | None): The depth position embedding tensor. Defaults to None. + mask_depth (Tensor | None): The depth mask tensor. Defaults to None. + bs (int | None): The batch size. Defaults to None. + depth_pos_embed_ip (Tensor | None): The depth position embedding tensor for the iterative process. + Defaults to None. + pos_embeds (List[Tensor] | None): The list of position embedding tensors. Defaults to None. + attn_mask (Tensor | None): The self-attention mask tensor. Defaults to None. + + Returns: + Tensor: The output tensor. + """ + output = tgt + + intermediate = [] + intermediate_reference_points = [] + intermediate_reference_dims = [] + bs = src.shape[0] + + for lid, layer in enumerate(self.layers): + if reference_points.shape[-1] == 6: + reference_points_input = ( + reference_points[:, :, None] + * torch.cat([src_valid_ratios, src_valid_ratios, src_valid_ratios], -1)[:, None] + ) + else: + if reference_points.shape[-1] != 2: + msg = f"Wrong reference_points shape[-1]:{reference_points.shape[-1]}" + raise ValueError(msg) + + reference_points_input = reference_points[:, :, None] * src_valid_ratios[:, None] + + ###conditional + output = layer( + output, + query_pos, + reference_points_input, + src, + src_spatial_shapes, + src_level_start_index, + src_padding_mask, + depth_pos_embed, + mask_depth, + bs, + query_sine_embed=None, + is_first=(lid == 0), + depth_pos_embed_ip=depth_pos_embed_ip, + pos_embeds=pos_embeds, + self_attn_mask=attn_mask, + query_pos_un=None, + ) + + # implementation for iterative bounding box refinement + if self.bbox_embed is not None: + tmp = self.bbox_embed[lid](output) + if reference_points.shape[-1] == 6: + new_reference_points = tmp + inverse_sigmoid(reference_points) + new_reference_points = new_reference_points.sigmoid() + else: + new_reference_points = tmp + new_reference_points[..., :2] = tmp[..., :2] + inverse_sigmoid(reference_points) + new_reference_points = new_reference_points.sigmoid() + reference_points = new_reference_points.detach() + + reference_dims: Tensor + if self.dim_embed is not None: + reference_dims = self.dim_embed[lid](output) + + if self.return_intermediate: + intermediate.append(output) + intermediate_reference_points.append(reference_points) + intermediate_reference_dims.append(reference_dims) + + if self.return_intermediate: + return torch.stack(intermediate), torch.stack(intermediate_reference_points), torch.stack( + intermediate_reference_dims, + ) + + return output, reference_points + + +class DepthAwareTransformerBuilder: + """DepthAwareTransformerBuilder.""" + + CFG: ClassVar[dict[str, Any]] = { + "monodetr_50": { + "d_model": 256, + "dropout": 0.1, + "nhead": 8, + "dim_feedforward": 256, + "num_encoder_layers": 3, + "num_decoder_layers": 3, + "return_intermediate_dec": True, + "num_feature_levels": 4, + "dec_n_points": 4, + "enc_n_points": 4, + }, + } + + def __new__(cls, model_name: str) -> DepthAwareTransformer: + """Create the DepthAwareTransformer.""" + return DepthAwareTransformer(**cls.CFG[model_name]) diff --git a/src/otx/algo/object_detection_3d/losses/__init__.py b/src/otx/algo/object_detection_3d/losses/__init__.py new file mode 100644 index 00000000000..fb407a5a3ad --- /dev/null +++ b/src/otx/algo/object_detection_3d/losses/__init__.py @@ -0,0 +1,8 @@ +# Copyright (C) 2024 Intel 
Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""Loss functions for 3d object detection.""" + +from .monodetr_loss import MonoDETRCriterion + +__all__ = ["MonoDETRCriterion"] diff --git a/src/otx/algo/object_detection_3d/losses/ddn_loss.py b/src/otx/algo/object_detection_3d/losses/ddn_loss.py new file mode 100644 index 00000000000..e3a4238be03 --- /dev/null +++ b/src/otx/algo/object_detection_3d/losses/ddn_loss.py @@ -0,0 +1,251 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""ddn loss for MonoDETR model.""" +from __future__ import annotations + +import math + +import torch +from torch import nn + +from otx.algo.common.losses.focal_loss import FocalLoss + + +def compute_fg_mask( + gt_boxes2d: torch.Tensor, + shape: tuple[int, int], + num_gt_per_img: int, + downsample_factor: int = 1, + device: torch.device | None = None, +) -> torch.Tensor: + """Compute foreground mask for images. + + Args: + gt_boxes2d [torch.Tensor(B, N, 4)]: 2D box labels + shape [Tuple[int, int]]: Foreground mask desired shape + downsample_factor [int]: Downsample factor for image + device [torch.device]: Foreground mask desired device + + Returns: + fg_mask [torch.Tensor(shape)]: Foreground mask + """ + if device is None: + device = torch.device("cpu") + fg_mask = torch.zeros(shape, dtype=torch.bool, device=device) + + # Set box corners + gt_boxes2d /= downsample_factor + gt_boxes2d[:, :2] = torch.floor(gt_boxes2d[:, :2]) + gt_boxes2d[:, 2:] = torch.ceil(gt_boxes2d[:, 2:]) + gt_boxes2d = gt_boxes2d.long() + + # Set all values within each box to True + gt_boxes2d = gt_boxes2d.split(num_gt_per_img, dim=0) + b = len(gt_boxes2d) + for i in range(b): + for n in range(gt_boxes2d[i].shape[0]): + u1, v1, u2, v2 = gt_boxes2d[i][n] + fg_mask[i, v1:v2, u1:u2] = True + + return fg_mask + + +class Balancer(nn.Module): + """Fixed foreground/background loss balancer.""" + + def __init__(self, fg_weight: float, bg_weight: float, downsample_factor: int = 1): + """Initialize fixed foreground/background loss balancer. + + Args: + fg_weight [float]: Foreground loss weight + bg_weight [float]: Background loss weight + downsample_factor [int]: Depth map downsample factor + """ + super().__init__() + self.fg_weight = fg_weight + self.bg_weight = bg_weight + self.downsample_factor = downsample_factor + + def forward( + self, + loss: torch.Tensor, + gt_boxes2d: torch.Tensor, + num_gt_per_img: int, + ) -> tuple[torch.Tensor, dict[float, float]]: + """Forward pass. 
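+
+ Pixels inside any ground-truth 2D box are weighted by fg_weight and all other pixels
+ by bg_weight; both contributions are normalized by the total pixel count and summed.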
+ + Args: + loss [torch.Tensor(B, H, W)]: Pixel-wise loss + gt_boxes2d [torch.Tensor (B, N, 4)]: 2D box labels for foreground/background balancing + + Returns: + loss [torch.Tensor(1)]: Total loss after foreground/background balancing + tb_dict [dict[float]]: All losses to log in tensorboard + """ + # Compute masks + fg_mask = compute_fg_mask( + gt_boxes2d=gt_boxes2d, + shape=loss.shape, + num_gt_per_img=num_gt_per_img, + downsample_factor=self.downsample_factor, + device=loss.device, + ) + bg_mask = ~fg_mask + + # Compute balancing weights + weights = self.fg_weight * fg_mask + self.bg_weight * bg_mask + num_pixels = fg_mask.sum() + bg_mask.sum() + + # Compute losses + loss *= weights + fg_loss = loss[fg_mask].sum() / num_pixels + bg_loss = loss[bg_mask].sum() / num_pixels + + # return total loss + return fg_loss + bg_loss + + +class DDNLoss(nn.Module): + """DDNLoss module for computing the loss for MonoDETR model.""" + + def __init__( + self, + alpha: float = 0.25, + gamma: float = 2.0, + fg_weight: float = 13, + bg_weight: float = 1, + downsample_factor: int = 1, + ) -> None: + """Initializes DDNLoss module. + + Args: + weight [float]: Loss function weight + alpha [float]: Alpha value for Focal Loss + gamma [float]: Gamma value for Focal Loss + disc_cfg [dict]: Depth discretiziation configuration + fg_weight [float]: Foreground loss weight + bg_weight [float]: Background loss weight + downsample_factor [int]: Depth map downsample factor + """ + super().__init__() + self.balancer = Balancer(downsample_factor=downsample_factor, fg_weight=fg_weight, bg_weight=bg_weight) + + # Set loss function + self.alpha = alpha + self.gamma = gamma + self.loss_func = FocalLoss(alpha=self.alpha, gamma=self.gamma, reduction="none") + + def build_target_depth_from_3dcenter( + self, + depth_logits: torch.Tensor, + gt_boxes2d: torch.Tensor, + gt_center_depth: torch.Tensor, + num_gt_per_img: int, + ) -> torch.Tensor: + """Builds target depth map from 3D center depth. + + Args: + depth_logits: torch.Tensor(B, D+1, H, W)]: Predicted depth logits + gt_boxes2d [torch.Tensor (B, N, 4)]: 2D box labels for foreground/background balancing + gt_center_depth [torch.Tensor(B, N)]: 3D center depth + num_gt_per_img: [int]: Number of ground truth boxes per image + """ + b, _, h, w = depth_logits.shape + depth_maps = torch.zeros((b, h, w), device=depth_logits.device, dtype=depth_logits.dtype) + + # Set box corners + gt_boxes2d[:, :2] = torch.floor(gt_boxes2d[:, :2]) + gt_boxes2d[:, 2:] = torch.ceil(gt_boxes2d[:, 2:]) + gt_boxes2d = gt_boxes2d.long() + + # Set all values within each box to True + gt_boxes2d = gt_boxes2d.split(num_gt_per_img, dim=0) + gt_center_depth = gt_center_depth.split(num_gt_per_img, dim=0) + b = len(gt_boxes2d) + for i in range(b): + center_depth_per_batch = gt_center_depth[i] + center_depth_per_batch, sorted_idx = torch.sort(center_depth_per_batch, dim=0, descending=True) + gt_boxes_per_batch = gt_boxes2d[i][sorted_idx] + for n in range(gt_boxes_per_batch.shape[0]): + u1, v1, u2, v2 = gt_boxes_per_batch[n] + depth_maps[i, v1:v2, u1:u2] = center_depth_per_batch[n] + + return depth_maps + + def bin_depths( + self, + depth_map: torch.Tensor, + mode: str = "LID", + depth_min: float = 1e-3, + depth_max: float = 60, + num_bins: int = 80, + target: bool = False, + ) -> torch.Tensor: + """Converts depth map into bin indices. 
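+
+ For the default LID mode, bin boundaries grow linearly in width:
+ depth_i = depth_min + bin_size * i * (i + 1) / 2 with
+ bin_size = 2 * (depth_max - depth_min) / (num_bins * (num_bins + 1)),
+ and the implementation below recovers the index i by inverting this quadratic.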
+ + Args: + depth_map [torch.Tensor(H, W)]: Depth Map + mode [string]: Discretiziation mode (See https://arxiv.org/pdf/2005.13423.pdf for more details) + UD: Uniform discretiziation + LID: Linear increasing discretiziation + SID: Spacing increasing discretiziation + depth_min [float]: Minimum depth value + depth_max [float]: Maximum depth value + num_bins [int]: Number of depth bins + target [bool]: Whether the depth bins indices will be used for a target tensor in loss comparison + + Returns: + indices [torch.Tensor(H, W)]: Depth bin indices + """ + if mode == "UD": + bin_size = (depth_max - depth_min) / num_bins + indices = (depth_map - depth_min) / bin_size + elif mode == "LID": + bin_size = 2 * (depth_max - depth_min) / (num_bins * (1 + num_bins)) + indices = -0.5 + 0.5 * torch.sqrt(1 + 8 * (depth_map - depth_min) / bin_size) + elif mode == "SID": + indices = ( + num_bins + * (torch.log(1 + depth_map) - math.log(1 + depth_min)) + / (math.log(1 + depth_max) - math.log(1 + depth_min)) + ) + else: + raise NotImplementedError + + if target: + # Remove indicies outside of bounds + mask = (indices < 0) | (indices > num_bins) | (~torch.isfinite(indices)) + indices[mask] = num_bins + + # Convert to integer + indices = indices.type(torch.int64) + + return indices + + def forward( + self, + depth_logits: torch.Tensor, + gt_boxes2d: torch.Tensor, + num_gt_per_img: int, + gt_center_depth: torch.Tensor, + ) -> torch.Tensor: + """Gets depth_map loss. + + Args: + depth_logits: torch.Tensor(B, D+1, H, W)]: Predicted depth logits + gt_boxes2d [torch.Tensor (B, N, 4)]: 2D box labels for foreground/background balancing + num_gt_per_img: [int]: Number of ground truth boxes per image + gt_center_depth: [torch.Tensor(B, N)]: 3D center depth + + Returns: + loss [torch.Tensor(1)]: Depth classification network loss + """ + # Bin depth map to create target + depth_maps = self.build_target_depth_from_3dcenter(depth_logits, gt_boxes2d, gt_center_depth, num_gt_per_img) + depth_target = self.bin_depths(depth_maps, target=True) + # Compute loss + loss = self.loss_func(depth_logits, depth_target) + # Compute foreground/background balancing + + return self.balancer(loss=loss, gt_boxes2d=gt_boxes2d, num_gt_per_img=num_gt_per_img) diff --git a/src/otx/algo/object_detection_3d/losses/monodetr_loss.py b/src/otx/algo/object_detection_3d/losses/monodetr_loss.py new file mode 100644 index 00000000000..ebc98d45a51 --- /dev/null +++ b/src/otx/algo/object_detection_3d/losses/monodetr_loss.py @@ -0,0 +1,247 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""main loss for MonoDETR model.""" +from __future__ import annotations + +from typing import TYPE_CHECKING, Callable + +import torch +from torch import nn +from torch.nn import functional +from torchvision.ops import box_convert + +from otx.algo.common.losses.focal_loss import py_sigmoid_focal_loss +from otx.algo.common.losses.iou_loss import giou_loss +from otx.algo.object_detection_3d.matchers.matcher_3d import HungarianMatcher3D +from otx.algo.object_detection_3d.utils.utils import box_cxcylrtb_to_xyxy + +from .ddn_loss import DDNLoss + +if TYPE_CHECKING: + from torch import Tensor + + +class MonoDETRCriterion(nn.Module): + """This class computes the loss for MonoDETR.""" + + def __init__(self, num_classes: int, weight_dict: dict, focal_alpha: float, group_num: int = 11) -> None: + """MonoDETRCriterion. 
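+
+ Any loss listed in loss_map but missing from weight_dict is given a default weight of 1,
+ so callers only need to specify the terms they want to re-scale.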
+ + Args: + num_classes: number of object categories, omitting the special no-object category + matcher: module able to compute a matching between targets and proposals + weight_dict: dict containing as key the names of the losses and as values their relative weight. + focal_alpha: alpha in Focal Loss + group_num: number of groups for data parallelism + """ + super().__init__() + self.num_classes = num_classes + self.matcher = HungarianMatcher3D(cost_class=2, cost_3dcenter=10, cost_bbox=5, cost_giou=2) + self.weight_dict = weight_dict + for name in self.loss_map: + if name not in self.weight_dict: + self.weight_dict[name] = 1 + self.focal_alpha = focal_alpha + self.ddn_loss = DDNLoss() # for depth map + self.group_num = group_num + + def loss_labels(self, outputs: dict, targets: list, indices: list, num_boxes: int) -> dict[str, Tensor]: + """Classification loss.""" + src_logits = outputs["scores"] + + idx = self._get_src_permutation_idx(indices) + target_classes_o = torch.cat([t["labels"][J] for t, (_, J) in zip(targets, indices)]) + target_classes = torch.full(src_logits.shape[:2], self.num_classes, dtype=torch.int64, device=src_logits.device) + + target_classes[idx] = target_classes_o.squeeze().long() + + target_classes_onehot = torch.zeros( + [src_logits.shape[0], src_logits.shape[1], src_logits.shape[2] + 1], + dtype=src_logits.dtype, + layout=src_logits.layout, + device=src_logits.device, + ) + target_classes_onehot.scatter_(2, target_classes.unsqueeze(-1), 1) + + target_classes_onehot = target_classes_onehot[:, :, :-1] + loss_ce = py_sigmoid_focal_loss( + pred=src_logits, + target=target_classes_onehot, + avg_factor=num_boxes, + alpha=self.focal_alpha, + reduction="mean", + ) + + return {"loss_ce": loss_ce} + + def loss_3dcenter(self, outputs: dict, targets: list, indices: list, num_boxes: int) -> dict[str, Tensor]: + """Compute the loss for the 3D center prediction.""" + idx = self._get_src_permutation_idx(indices) + src_3dcenter = outputs["boxes_3d"][:, :, 0:2][idx] + target_3dcenter = torch.cat([t["boxes_3d"][:, 0:2][i] for t, (_, i) in zip(targets, indices)], dim=0) + + loss_3dcenter = functional.l1_loss(src_3dcenter, target_3dcenter, reduction="none") + return {"loss_center": loss_3dcenter.sum() / num_boxes} + + def loss_boxes(self, outputs: dict, targets: list, indices: list, num_boxes: int) -> dict[str, Tensor]: + """Compute l1 loss.""" + idx = self._get_src_permutation_idx(indices) + src_2dboxes = outputs["boxes_3d"][:, :, 2:6][idx] + target_2dboxes = torch.cat([t["boxes_3d"][:, 2:6][i] for t, (_, i) in zip(targets, indices)], dim=0) + + # l1 + loss_bbox = functional.l1_loss(src_2dboxes, target_2dboxes, reduction="none") + return {"loss_bbox": loss_bbox.sum() / num_boxes} + + def loss_giou(self, outputs: dict, targets: list, indices: list, num_boxes: int) -> dict[str, Tensor]: + """Compute the GIoU loss.""" + # giou + idx = self._get_src_permutation_idx(indices) + src_boxes = outputs["boxes_3d"][idx] + target_boxes = torch.cat([t["boxes_3d"][i] for t, (_, i) in zip(targets, indices)], dim=0) + loss_giou = giou_loss(box_cxcylrtb_to_xyxy(src_boxes), box_cxcylrtb_to_xyxy(target_boxes)) + return {"loss_giou": loss_giou} + + def loss_depths(self, outputs: dict, targets: list, indices: list, num_boxes: int) -> dict[str, Tensor]: + """Compute the loss for the depth prediction.""" + idx = self._get_src_permutation_idx(indices) + + src_depths = outputs["depth"][idx] + target_depths = torch.cat([t["depth"][i] for t, (_, i) in zip(targets, indices)], dim=0).squeeze() + + depth_input, 
depth_log_variance = src_depths[:, 0], src_depths[:, 1] + depth_loss = 1.4142 * torch.exp(-depth_log_variance) * torch.abs(depth_input - target_depths) + torch.abs( + depth_log_variance, + ) + return {"loss_depth": depth_loss.sum() / num_boxes} + + def loss_dims(self, outputs: dict, targets: list, indices: list, num_boxes: int) -> dict[str, Tensor]: + """Compute the loss for the dimension prediction.""" + idx = self._get_src_permutation_idx(indices) + src_dims = outputs["size_3d"][idx] + target_dims = torch.cat([t["size_3d"][i] for t, (_, i) in zip(targets, indices)], dim=0) + + dimension = target_dims.clone().detach() + dim_loss = torch.abs(src_dims - target_dims) + dim_loss /= dimension + with torch.no_grad(): + compensation_weight = functional.l1_loss(src_dims, target_dims) / dim_loss.mean() + dim_loss *= compensation_weight + return {"loss_dim": dim_loss.sum() / num_boxes} + + def loss_angles(self, outputs: dict, targets: list, indices: list, num_boxes: int) -> dict[str, Tensor]: + """Compute the loss for the angle prediction.""" + idx = self._get_src_permutation_idx(indices) + heading_input = outputs["heading_angle"][idx] + target_heading_angle = torch.cat([t["heading_angle"][i] for t, (_, i) in zip(targets, indices)], dim=0) + heading_target_cls = target_heading_angle[:, 0].view(-1).long() + heading_target_res = target_heading_angle[:, 1].view(-1) + + heading_input = heading_input.view(-1, 24) + + # classification loss + heading_input_cls = heading_input[:, 0:12] + cls_loss = functional.cross_entropy(heading_input_cls, heading_target_cls, reduction="none") + + # regression loss + heading_input_res = heading_input[:, 12:24] + cls_onehot = ( + torch.zeros(heading_target_cls.shape[0], 12) + .to(device=heading_input.device) + .scatter_(dim=1, index=heading_target_cls.view(-1, 1), value=1) + ) + heading_input_res = torch.sum(heading_input_res * cls_onehot, 1) + reg_loss = functional.l1_loss(heading_input_res, heading_target_res, reduction="none") + + angle_loss = cls_loss + reg_loss + return {"loss_angle": angle_loss.sum() / num_boxes} + + def loss_depth_map(self, outputs: dict, targets: list, indices: list, num_boxes: int) -> dict[str, Tensor]: + """Depth map loss.""" + depth_map_logits = outputs["pred_depth_map_logits"] + + num_gt_per_img = [len(t["boxes"]) for t in targets] + gt_boxes2d = torch.cat([t["boxes"] for t in targets], dim=0) * torch.tensor( + [80, 24, 80, 24], + device=depth_map_logits.device, + ) + gt_boxes2d = box_convert(gt_boxes2d, "cxcywh", "xyxy") + gt_center_depth = torch.cat([t["depth"] for t in targets], dim=0).squeeze(dim=1) + return {"loss_depth_map": self.ddn_loss(depth_map_logits, gt_boxes2d, num_gt_per_img, gt_center_depth)} + + def _get_src_permutation_idx( + self, + indices: list[tuple[torch.Tensor, torch.Tensor]], + ) -> tuple[torch.Tensor, torch.Tensor]: + # permute predictions following indices + batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)]) + src_idx = torch.cat([src for (src, _) in indices]) + return batch_idx, src_idx + + def _get_tgt_permutation_idx( + self, + indices: list[tuple[torch.Tensor, torch.Tensor]], + ) -> tuple[torch.Tensor, torch.Tensor]: + # permute targets following indices + batch_idx = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)]) + tgt_idx = torch.cat([tgt for (_, tgt) in indices]) + return batch_idx, tgt_idx + + @property + def loss_map(self) -> dict[str, Callable]: + """Return the loss map.""" + return { + "loss_ce": self.loss_labels, + "loss_bbox": 
self.loss_boxes, + "loss_giou": self.loss_giou, + "loss_depth": self.loss_depths, + "loss_dim": self.loss_dims, + "loss_angle": self.loss_angles, + "loss_center": self.loss_3dcenter, + "loss_depth_map": self.loss_depth_map, + } + + def forward( + self, + outputs: dict[str, torch.Tensor], + targets: list[dict[str, torch.Tensor]], + ) -> dict[str, torch.Tensor]: + """This performs the loss computation. + + Args: + outputs: dict of tensors, see the output specification of the model for the format + targets: list of dicts, such that len(targets) == batch_size. + The expected keys in each dict depends on the losses applied, see each loss' doc + """ + outputs_without_aux = {k: v for k, v in outputs.items() if k != "aux_outputs"} + group_num = self.group_num if self.training else 1 + + # Retrieve the matching between the outputs of the last layer and the targets + indices = self.matcher(outputs_without_aux, targets, group_num=group_num) + + # Compute the average number of target boxes across all nodes, for normalization purposes + num_boxes_int = sum([len(t["labels"]) for t in targets]) * group_num + num_boxes = torch.as_tensor([num_boxes_int], dtype=torch.float, device=next(iter(outputs.values())).device) + num_boxes = torch.clamp(num_boxes, min=1) + + # Compute all the requested losses + losses = {} + for loss in self.loss_map.values(): + losses.update(loss(outputs, targets, indices, num_boxes)) + + losses = {k: losses[k] * self.weight_dict[k] for k in losses} + + # In case of auxiliary losses, we repeat this process with the output of each intermediate layer. + if "aux_outputs" in outputs: + for i, aux_outputs in enumerate(outputs["aux_outputs"]): + indices = self.matcher(aux_outputs, targets, group_num=group_num) + for name, loss in self.loss_map.items(): + if name == "loss_depth_map": + # Intermediate masks losses are too costly to compute, we ignore them. + continue + l_dict = loss(aux_outputs, targets, indices, num_boxes.item()) + l_dict = {k + f"_aux_{i}": v * self.weight_dict[k] for k, v in l_dict.items()} + losses.update(l_dict) + + return losses diff --git a/src/otx/algo/object_detection_3d/matchers/__init__.py b/src/otx/algo/object_detection_3d/matchers/__init__.py new file mode 100644 index 00000000000..4c217a82f7e --- /dev/null +++ b/src/otx/algo/object_detection_3d/matchers/__init__.py @@ -0,0 +1,8 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""Matchers modules for 3d object detection.""" + +from .matcher_3d import HungarianMatcher3D + +__all__ = ["HungarianMatcher3D"] diff --git a/src/otx/algo/object_detection_3d/matchers/matcher_3d.py b/src/otx/algo/object_detection_3d/matchers/matcher_3d.py new file mode 100644 index 00000000000..2e6e7ac8ddf --- /dev/null +++ b/src/otx/algo/object_detection_3d/matchers/matcher_3d.py @@ -0,0 +1,119 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""HungarianMatcher3D module for 3d object detection.""" + +import numpy as np +import torch +from scipy.optimize import linear_sum_assignment +from torch import nn + +from otx.algo.common.utils.bbox_overlaps import bbox_overlaps +from otx.algo.object_detection_3d.utils.utils import box_cxcylrtb_to_xyxy + + +class HungarianMatcher3D(nn.Module): + """This class computes an assignment between the targets and the predictions of the network.""" + + def __init__( + self, + cost_class: float = 1.0, + cost_3dcenter: float = 1.0, + cost_bbox: float = 1.0, + cost_giou: float = 1.0, + ): + """Creates the matcher. 
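+
+ The final assignment cost is cost_bbox * L1(2D box) + cost_3dcenter * L1(3D center)
+ + cost_class * focal-style classification cost + cost_giou * (-GIoU), and the matching
+ is solved with scipy.optimize.linear_sum_assignment independently for each query group.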
+ + Args: + cost_class (float): This is the relative weight of the classification error in the matching cost. + cost_3dcenter (float): This is the relative weight of the L1 error of the 3d center in the matching cost. + cost_bbox (float): This is the relative weight of the L1 error of the bbox coordinates in the matching cost. + cost_giou (float): This is the relative weight of the giou loss of the bbox in the matching cost. + """ + super().__init__() + self.cost_class = cost_class + self.cost_3dcenter = cost_3dcenter + self.cost_bbox = cost_bbox + self.cost_giou = cost_giou + + @torch.no_grad() + def forward(self, outputs: dict, targets: list, group_num: int = 11) -> list: + """Performs the matching. + + Args: + outputs: This is a dict that contains at least these entries: + "scores": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits + "boxes_3d": Tensor of dim [batch_size, num_queries, 4] with the predicted 3d box coordinates + targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing: + "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth + objects in the target) containing the class labels + "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates + Returns: + A list of size batch_size, containing tuples of (index_i, index_j) where: + - index_i is the indices of the selected predictions (in order) + - index_j is the indices of the corresponding selected targets (in order) + For each batch element, it holds: + len(index_i) = len(index_j) = min(num_queries, num_target_boxes) + """ + bs, num_queries = outputs["boxes_3d"].shape[:2] + + # We flatten to compute the cost matrices in a batch + + out_prob = outputs["scores"].flatten(0, 1).sigmoid() # [batch_size * num_queries, num_classes] + # Also concat the target labels and boxes + tgt_ids = torch.cat([v["labels"] for v in targets]).long() + + # Compute the classification cost. 
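+ # Focal-style classification cost: for each query/target pair the cost is
+ # alpha * (1 - p)^gamma * (-log p) - (1 - alpha) * p^gamma * (-log(1 - p)),
+ # evaluated at the probability p the query assigns to the target's class.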
+ alpha = 0.25 + gamma = 2.0 + neg_cost_class = (1 - alpha) * (out_prob**gamma) * (-(1 - out_prob + 1e-8).log()) + pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log()) + cost_class = pos_cost_class[:, tgt_ids] - neg_cost_class[:, tgt_ids] + + out_3dcenter = outputs["boxes_3d"][:, :, 0:2].flatten(0, 1) # [batch_size * num_queries, 4] + tgt_3dcenter = torch.cat([v["boxes_3d"][:, 0:2] for v in targets]) + + # Compute the 3dcenter cost between boxes + cost_3dcenter = torch.cdist(out_3dcenter, tgt_3dcenter, p=1) + + out_2dbbox = outputs["boxes_3d"][:, :, 2:6].flatten(0, 1) # [batch_size * num_queries, 4] + tgt_2dbbox = torch.cat([v["boxes_3d"][:, 2:6] for v in targets]) + # Compute the L1 cost between boxes + cost_bbox = torch.cdist(out_2dbbox, tgt_2dbbox, p=1) + + # Compute the giou cost betwen boxes + out_bbox = outputs["boxes_3d"].flatten(0, 1) # [batch_size * num_queries, 4] + tgt_bbox = torch.cat([v["boxes_3d"] for v in targets]) + cost_giou = -bbox_overlaps( + box_cxcylrtb_to_xyxy(out_bbox), + box_cxcylrtb_to_xyxy(tgt_bbox), + mode="giou", + ) + # Final cost matrix + c = ( + self.cost_bbox * cost_bbox + + self.cost_3dcenter * cost_3dcenter + + self.cost_class * cost_class + + self.cost_giou * cost_giou + ) + c = c.view(bs, num_queries, -1).cpu() + + sizes = [len(v["boxes"]) for v in targets] + # indices = [linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))] + indices = [] + g_num_queries = num_queries // group_num + c_list = c.split(g_num_queries, dim=1) + for g_i in range(group_num): + c_g = c_list[g_i] + indices_g = [linear_sum_assignment(c[i]) for i, c in enumerate(c_g.split(sizes, -1))] + if g_i == 0: + indices = indices_g + else: + indices = [ + ( + np.concatenate([indice1[0], indice2[0] + g_num_queries * g_i]), + np.concatenate([indice1[1], indice2[1]]), + ) + for indice1, indice2 in zip(indices, indices_g) + ] + return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices] diff --git a/src/otx/algo/object_detection_3d/monodetr3d.py b/src/otx/algo/object_detection_3d/monodetr3d.py new file mode 100644 index 00000000000..2ea42e52f95 --- /dev/null +++ b/src/otx/algo/object_detection_3d/monodetr3d.py @@ -0,0 +1,249 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""MonoDetr model implementations.""" + +from __future__ import annotations + +from typing import Any + +import numpy as np +import torch +from torch import Tensor +from torchvision.ops import box_convert + +from otx.algo.object_detection_3d.backbones.monodetr_resnet import BackboneBuilder +from otx.algo.object_detection_3d.detectors.monodetr import MonoDETR +from otx.algo.object_detection_3d.heads.depth_predictor import DepthPredictor +from otx.algo.object_detection_3d.heads.depthaware_transformer import DepthAwareTransformerBuilder +from otx.algo.object_detection_3d.losses import MonoDETRCriterion +from otx.algo.object_detection_3d.utils.utils import box_cxcylrtb_to_xyxy +from otx.core.data.entity.base import OTXBatchLossEntity +from otx.core.data.entity.object_detection_3d import Det3DBatchDataEntity, Det3DBatchPredEntity +from otx.core.exporter.base import OTXModelExporter +from otx.core.exporter.detection_3d import OTXObjectDetection3DExporter +from otx.core.model.detection_3d import OTX3DDetectionModel + + +class MonoDETR3D(OTX3DDetectionModel): + """OTX Detection model class for MonoDETR3D.""" + + mean: tuple[float, float, float] = (0.485, 0.456, 0.406) + std: tuple[float, float, float] = 
(0.229, 0.224, 0.225) + input_size: tuple[int, int] = (384, 1280) # HxW + load_from: str | None = None + + def _build_model(self, num_classes: int) -> MonoDETR: + # backbone + backbone = BackboneBuilder(self.model_name) + # transformer + depthaware_transformer = DepthAwareTransformerBuilder(self.model_name) + # depth prediction module + depth_predictor = DepthPredictor(depth_num_bins=80, depth_min=1e-3, depth_max=60.0, hidden_dim=256) + # criterion + loss_weight_dict = { + "loss_ce": 2, + "loss_bbox": 5, + "loss_giou": 2, + "loss_center": 10, + } + criterion = MonoDETRCriterion(num_classes=num_classes, focal_alpha=0.25, weight_dict=loss_weight_dict) + + return MonoDETR( + backbone, + depthaware_transformer, + depth_predictor, + num_classes=num_classes, + criterion=criterion, + num_queries=50, + aux_loss=True, + num_feature_levels=4, + with_box_refine=True, + init_box=False, + ) + + def _customize_inputs( + self, + entity: Det3DBatchDataEntity, + ) -> dict[str, Any]: + # prepare bboxes for the model + targets_list = [] + img_sizes = torch.from_numpy(np.array([img_info.ori_shape for img_info in entity.imgs_info])).to( + device=entity.images.device, + ) + key_list = ["labels", "boxes", "depth", "size_3d", "heading_angle", "boxes_3d"] + for bz in range(len(entity.imgs_info)): + target_dict = {} + for key in key_list: + target_dict[key] = getattr(entity, key)[bz] + targets_list.append(target_dict) + + return { + "images": entity.images, + "calibs": torch.cat([p2.unsqueeze(0) for p2 in entity.calib_matrix], dim=0), + "targets": targets_list, + "img_sizes": img_sizes, + "mode": "loss" if self.training else "predict", + } + + def _customize_outputs( + self, + outputs: dict[str, torch.Tensor], + inputs: Det3DBatchDataEntity, + ) -> Det3DBatchPredEntity | OTXBatchLossEntity: + if self.training: + if not isinstance(outputs, dict): + raise TypeError(outputs) + + losses = OTXBatchLossEntity() + for k, v in outputs.items(): + if isinstance(v, list): + losses[k] = sum(v) + elif isinstance(v, Tensor): + losses[k] = v + else: + msg = "Loss output should be list or torch.tensor but got {type(v)}" + raise TypeError(msg) + return losses + + labels, scores, size_3d, heading_angle, boxes_3d, depth = self.extract_dets_from_outputs(outputs) + # bbox 2d decoding + boxes_2d = box_cxcylrtb_to_xyxy(boxes_3d) + xywh_2d = box_convert(boxes_2d, "xyxy", "cxcywh") + # size 2d decoding + size_2d = xywh_2d[:, :, 2:4] + + return Det3DBatchPredEntity( + batch_size=inputs.batch_size, + images=inputs.images, + imgs_info=inputs.imgs_info, + calib_matrix=inputs.calib_matrix, + boxes=boxes_2d, + labels=labels, + boxes_3d=boxes_3d, + size_2d=size_2d, + size_3d=size_3d, + depth=depth, + heading_angle=heading_angle, + scores=scores, + original_kitti_format=[None], + ) + + def configure_optimizers(self) -> tuple[list[torch.optim.Optimizer], list[dict[str, Any]]]: + """Configure an optimizer and learning-rate schedulers. + + Configure an optimizer and learning-rate schedulers + from the given optimizer and scheduler or scheduler list callable in the constructor. + Generally, there is two lr schedulers. One is for a linear warmup scheduler and + the other is the main scheduler working after the warmup period. + + Returns: + Two list. The former is a list that contains an optimizer + The latter is a list of lr scheduler configs which has a dictionary format. 
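+
+ Note:
+ The optimizer is built from the two parameter groups produced by _apply_no_bias_decay:
+ bias parameters with weight_decay=0 and all remaining weights with weight_decay=1e-4.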
+ """ + param_groups = self._apply_no_bias_decay() + optimizer = self.optimizer_callable(param_groups) + schedulers = self.scheduler_callable(optimizer) + + def ensure_list(item: Any) -> list: # noqa: ANN401 + return item if isinstance(item, list) else [item] + + lr_scheduler_configs = [] + for scheduler in ensure_list(schedulers): + lr_scheduler_config = {"scheduler": scheduler} + if hasattr(scheduler, "interval"): + lr_scheduler_config["interval"] = scheduler.interval + if hasattr(scheduler, "monitor"): + lr_scheduler_config["monitor"] = scheduler.monitor + lr_scheduler_configs.append(lr_scheduler_config) + + return [optimizer], lr_scheduler_configs + + def _apply_no_bias_decay(self) -> list[dict[str, Any]]: + """Apply no bias decay to bias parameters.""" + weights, biases = [], [] + for name, param in self.named_parameters(): + if "bias" in name: + biases += [param] + else: + weights += [param] + + return [{"params": biases, "weight_decay": 0}, {"params": weights, "weight_decay": 0.0001}] + + def forward_for_tracing( + self, + images: torch.Tensor, + calib_matrix: torch.Tensor, + img_sizes: torch.Tensor, + ) -> dict[str, torch.Tensor]: + """Model forward function used for the model tracing during model exportation.""" + return self.model(images=images, calibs=calib_matrix, img_sizes=img_sizes, mode="export") + + @staticmethod + def extract_dets_from_outputs(outputs: dict[str, torch.Tensor], topk: int = 50) -> tuple[torch.Tensor, ...]: + """Extract detection results from model outputs.""" + # b, q, c + out_logits = outputs["scores"] + out_bbox = outputs["boxes_3d"] + + prob = out_logits.sigmoid() + topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), topk, dim=1) + + # final scores + scores = topk_values + # final indexes + topk_boxes = (topk_indexes // out_logits.shape[2]).unsqueeze(-1) + # final labels + labels = topk_indexes % out_logits.shape[2] + + heading = outputs["heading_angle"] + size_3d = outputs["size_3d"] + depth = outputs["depth"] + # decode boxes + boxes_3d = torch.gather(out_bbox, 1, topk_boxes.repeat(1, 1, 6)) # b, q', 4 + # heading angle decoding + heading = torch.gather(heading, 1, topk_boxes.repeat(1, 1, 24)) + # depth decoding + depth = torch.gather(depth, 1, topk_boxes.repeat(1, 1, 2)) + # 3d dims decoding + size_3d = torch.gather(size_3d, 1, topk_boxes.repeat(1, 1, 3)) + # 2d boxes of the corners decoding + + return labels, scores, size_3d, heading, boxes_3d, depth + + @property + def _exporter(self) -> OTXModelExporter: + """Creates OTXModelExporter object that can export the model.""" + if self.input_size is None: + msg = f"Input size attribute is not set for {self.__class__}" + raise ValueError(msg) + + return OTXObjectDetection3DExporter( + task_level_export_parameters=self._export_parameters, + input_size=(1, 3, *self.input_size), + mean=self.mean, + std=self.std, + resize_mode="standard", + swap_rgb=False, + via_onnx=False, + onnx_export_configuration={ + "input_names": ["images", "calib_matrix", "img_sizes"], + "dynamic_axes": { + "images": {0: "batch"}, + "boxes_3d": {0: "batch", 1: "num_dets"}, + "scores": {0: "batch", 1: "num_dets"}, + "heading_angle": {0: "batch", 1: "num_dets"}, + "depth": {0: "batch", 1: "num_dets"}, + "size_3d": {0: "batch", 1: "num_dets"}, + }, + "autograd_inlining": False, + "opset_version": 16, + }, + input_names=["images", "calib_matrix", "img_sizes"], + output_names=["scores", "boxes_3d", "size_3d", "heading_angle", "depth"], + ) + + @property + def _optimization_config(self) -> dict[str, Any]: + """PTQ 
config for MonoDETR.""" + return {"model_type": "transformer"} diff --git a/src/otx/algo/object_detection_3d/utils/__init__.py b/src/otx/algo/object_detection_3d/utils/__init__.py new file mode 100644 index 00000000000..c951fff3de8 --- /dev/null +++ b/src/otx/algo/object_detection_3d/utils/__init__.py @@ -0,0 +1,4 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""utils module for object detection 3D models.""" diff --git a/src/otx/algo/object_detection_3d/utils/utils.py b/src/otx/algo/object_detection_3d/utils/utils.py new file mode 100644 index 00000000000..6f9c009d697 --- /dev/null +++ b/src/otx/algo/object_detection_3d/utils/utils.py @@ -0,0 +1,66 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""utils for object detection 3D models.""" +from __future__ import annotations + +import copy + +import torch +from torch import Tensor, nn + + +# TODO(Kirill): try to remove this class +class NestedTensor: + """Nested tensor class for object detection 3D models.""" + + def __init__(self, tensors: Tensor, mask: Tensor) -> None: + """Initialize a NestedTensor object. + + Args: + tensors (Tensor): The tensors representing the nested structure. + mask (Tensor): The mask indicating the valid elements in the tensors. + """ + self.tensors = tensors + self.mask = mask + + def to(self, device: torch.device) -> NestedTensor: + """Move the NestedTensor object to the specified device. + + Args: + device: The device to move the tensors to. + + Returns: + NestedTensor: The NestedTensor object with tensors moved to the specified device. + """ + cast_tensor = self.tensors.to(device) + cast_mask = self.mask.to(device) if self.mask is not None else None + return NestedTensor(cast_tensor, cast_mask) + + def decompose(self) -> tuple[Tensor, Tensor]: + """Decompose the NestedTensor object into its constituent tensors and masks.""" + return self.tensors, self.mask + + def __repr__(self) -> str: + """Return a string representation of the NestedTensor object.""" + return str(self.tensors) + + +def box_cxcylrtb_to_xyxy(x: Tensor) -> Tensor: + """Transform bbox from cxcylrtb to xyxy representation.""" + x_c, y_c, k, r, t, b = x.unbind(-1) + bb = [(x_c - k), (y_c - t), (x_c + r), (y_c + b)] + return torch.stack(bb, dim=-1) + + +def get_clones(module: nn.Module, n: int) -> nn.ModuleList: + """Create a list of cloned modules. + + Args: + module (nn.Module): The module to be cloned. + N (int): The number of clones to create. + + Returns: + nn.ModuleList: The list of cloned modules. 
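+
+ Example:
+ >>> proj = nn.Linear(8, 8)
+ >>> blocks = get_clones(proj, 3) # three deep copies with independent (non-shared) parameters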
+ """ + return nn.ModuleList([copy.deepcopy(module) for _ in range(n)]) diff --git a/src/otx/core/data/dataset/object_detection_3d.py b/src/otx/core/data/dataset/object_detection_3d.py new file mode 100644 index 00000000000..7e7f294c58b --- /dev/null +++ b/src/otx/core/data/dataset/object_detection_3d.py @@ -0,0 +1,307 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""Module for OTX3DObjectDetectionDataset.""" + +# mypy: ignore-errors + +from __future__ import annotations + +from copy import deepcopy +from functools import partial +from typing import TYPE_CHECKING, Any, Callable, List, Union + +import numpy as np +import torch +from datumaro import Image +from PIL import Image as PILImage +from torchvision import tv_tensors + +from otx.core.data.dataset.utils.kitti_utils import Calibration, affine_transform, angle2class, get_affine_transform +from otx.core.data.entity.base import ImageInfo +from otx.core.data.entity.object_detection_3d import Det3DBatchDataEntity, Det3DDataEntity +from otx.core.data.mem_cache import NULL_MEM_CACHE_HANDLER, MemCacheHandlerBase +from otx.core.data.transform_libs.torchvision import Compose +from otx.core.types.image import ImageColorChannel + +from .base import OTXDataset + +if TYPE_CHECKING: + from datumaro import Bbox, DatasetSubset + + +Transforms = Union[Compose, Callable, List[Callable], dict[str, Compose | Callable | List[Callable]]] + + +class OTX3DObjectDetectionDataset(OTXDataset[Det3DDataEntity]): + """OTXDataset class for detection task.""" + + def __init__( + self, + dm_subset: DatasetSubset, + transforms: Transforms, + mem_cache_handler: MemCacheHandlerBase = NULL_MEM_CACHE_HANDLER, + mem_cache_img_max_size: tuple[int, int] | None = None, + max_refetch: int = 1000, + image_color_channel: ImageColorChannel = ImageColorChannel.RGB, + stack_images: bool = True, + to_tv_image: bool = True, + max_objects: int = 50, + depth_threshold: int = 65, + resolution: tuple[int, int] = (1280, 384), # (W, H) + ) -> None: + super().__init__( + dm_subset, + transforms, + mem_cache_handler, + mem_cache_img_max_size, + max_refetch, + image_color_channel, + stack_images, + to_tv_image, + ) + self.max_objects = max_objects + self.depth_threshold = depth_threshold + self.resolution = np.array(resolution) # TODO(Kirill): make it configurable + self.subset_type = list(self.dm_subset.get_subset_info())[-1].split(":")[0] + + def _get_item_impl(self, index: int) -> Det3DDataEntity | None: + entity = self.dm_subset[index] + image = entity.media_as(Image) + image = self._get_img_data_and_shape(image)[0] + calib = Calibration(entity.attributes["calib_path"]) + original_kitti_format = None # don't use for training + if self.subset_type != "train": + # TODO (Kirill): remove this or duplication of the inputs + annotations_copy = deepcopy(entity.annotations) + original_kitti_format = [obj.attributes for obj in annotations_copy] + # decode original kitti format for metric calculation + for i, anno_dict in enumerate(original_kitti_format): + anno_dict["name"] = self.label_info.label_names[annotations_copy[i].label] + anno_dict["bbox"] = annotations_copy[i].points + dimension = anno_dict["dimensions"] + anno_dict["dimensions"] = [dimension[2], dimension[0], dimension[1]] + original_kitti_format = self._reformate_for_kitti_metric(original_kitti_format) + # decode labels for training + inputs, targets, ori_img_shape = self._decode_item( + PILImage.fromarray(image), + entity.annotations, + calib, + ) + # normilize image + inputs = 
self._apply_transforms(torch.as_tensor(inputs, dtype=torch.float32)) + return Det3DDataEntity( + image=inputs, + img_info=ImageInfo( + img_idx=index, + img_shape=inputs.shape[1:], + ori_shape=ori_img_shape, # TODO(Kirill): curently we use WxH here, make it HxW + image_color_channel=self.image_color_channel, + ignored_labels=[], + ), + boxes=tv_tensors.BoundingBoxes( + targets["boxes"], + format=tv_tensors.BoundingBoxFormat.XYXY, + canvas_size=inputs.shape[1:], + dtype=torch.float32, + ), + labels=torch.as_tensor(targets["labels"], dtype=torch.long), + calib_matrix=torch.as_tensor(calib.P2, dtype=torch.float32), + boxes_3d=torch.as_tensor(targets["boxes_3d"], dtype=torch.float32), + size_2d=torch.as_tensor(targets["size_2d"], dtype=torch.float32), + size_3d=torch.as_tensor(targets["size_3d"], dtype=torch.float32), + depth=torch.as_tensor(targets["depth"], dtype=torch.float32), + heading_angle=torch.as_tensor( + np.concatenate([targets["heading_bin"], targets["heading_res"]], axis=1), + dtype=torch.float32, + ), + original_kitti_format=original_kitti_format, + ) + + @property + def collate_fn(self) -> Callable: + """Collection function to collect DetDataEntity into DetBatchDataEntity in data loader.""" + return partial(Det3DBatchDataEntity.collate_fn, stack_images=self.stack_images) + + def _decode_item(self, img: PILImage, annotations: list[Bbox], calib: Calibration) -> tuple: # noqa: C901 + """Decode item for training.""" + # data augmentation for image + img_size = np.array(img.size) + bbox2d = np.array([ann.points for ann in annotations]) + center = img_size / 2 + crop_size, crop_scale = img_size, 1 + random_flip_flag = False + # TODO(Kirill): add data augmentation for 3d, remove them from here. + if self.subset_type == "train": + if np.random.random() < 0.5: + random_flip_flag = True + img = img.transpose(PILImage.FLIP_LEFT_RIGHT) + + if np.random.random() < 0.5: + scale = 0.05 + shift = 0.05 + crop_scale = np.clip(np.random.randn() * scale + 1, 1 - scale, 1 + scale) + crop_size = img_size * crop_scale + center[0] += img_size[0] * np.clip(np.random.randn() * shift, -2 * shift, 2 * shift) + center[1] += img_size[1] * np.clip(np.random.randn() * shift, -2 * shift, 2 * shift) + + # add affine transformation for 2d images. 
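+ # PIL's AFFINE transform expects the output-to-input mapping, so trans_inv is passed to
+ # img.transform below, while the forward matrix trans is reused later to warp the 2D
+ # boxes and projected 3D centers into the same network resolution.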
+ trans, trans_inv = get_affine_transform(center, crop_size, 0, self.resolution, inv=1) + img = img.transform( + tuple(self.resolution.tolist()), + method=PILImage.AFFINE, + data=tuple(trans_inv.reshape(-1).tolist()), + resample=PILImage.BILINEAR, + ) + img = np.array(img).astype(np.float32) + img = img.transpose(2, 0, 1) # C * H * W -> (384 * 1280) + # ============================ get labels ============================== + # data augmentation for labels + annotations_list: list[dict[str, Any]] = [ann.attributes for ann in annotations] + for i, obj in enumerate(annotations_list): + obj["label"] = annotations[i].label + obj["location"] = np.array(obj["location"]) + + if random_flip_flag: + for i in range(bbox2d.shape[0]): + [x1, _, x2, _] = bbox2d[i] + bbox2d[i][0], bbox2d[i][2] = img_size[0] - x2, img_size[0] - x1 + annotations_list[i]["alpha"] = np.pi - annotations_list[i]["alpha"] + annotations_list[i]["rotation_y"] = np.pi - annotations_list[i]["rotation_y"] + if annotations_list[i]["alpha"] > np.pi: + annotations_list[i]["alpha"] -= 2 * np.pi # check range + if annotations_list[i]["alpha"] < -np.pi: + annotations_list[i]["alpha"] += 2 * np.pi + if annotations_list[i]["rotation_y"] > np.pi: + annotations_list[i]["rotation_y"] -= 2 * np.pi + if annotations_list[i]["rotation_y"] < -np.pi: + annotations_list[i]["rotation_y"] += 2 * np.pi + + # labels encoding + mask_2d = np.zeros((self.max_objects), dtype=bool) + labels = np.zeros((self.max_objects), dtype=np.int8) + depth = np.zeros((self.max_objects, 1), dtype=np.float32) + heading_bin = np.zeros((self.max_objects, 1), dtype=np.int64) + heading_res = np.zeros((self.max_objects, 1), dtype=np.float32) + size_2d = np.zeros((self.max_objects, 2), dtype=np.float32) + size_3d = np.zeros((self.max_objects, 3), dtype=np.float32) + src_size_3d = np.zeros((self.max_objects, 3), dtype=np.float32) + boxes = np.zeros((self.max_objects, 4), dtype=np.float32) + boxes_3d = np.zeros((self.max_objects, 6), dtype=np.float32) + + object_num = len(annotations) if len(annotations) < self.max_objects else self.max_objects + for i in range(object_num): + cur_obj = annotations_list[i] + # ignore the samples beyond the threshold [hard encoding] + if cur_obj["location"][-1] > self.depth_threshold and cur_obj["location"][-1] < 2: + continue + + # process 2d bbox & get 2d center + bbox_2d = bbox2d[i].copy() + + # add affine transformation for 2d boxes. 
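+ # Warp both box corners with the same forward matrix applied to the image so the 2D box
+ # stays aligned with the resampled image (coordinates are now in network-resolution pixels).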
+ bbox_2d[:2] = affine_transform(bbox_2d[:2], trans) + bbox_2d[2:] = affine_transform(bbox_2d[2:], trans) + + # process 3d center + center_2d = np.array( + [(bbox_2d[0] + bbox_2d[2]) / 2, (bbox_2d[1] + bbox_2d[3]) / 2], + dtype=np.float32, + ) # W * H + corner_2d = bbox_2d.copy() + + center_3d = np.array( + cur_obj["location"] + + [ + 0, + -cur_obj["dimensions"][0] / 2, + 0, + ], + ) # real 3D center in 3D space + center_3d = center_3d.reshape(-1, 3) # shape adjustment (N, 3) + center_3d, _ = calib.rect_to_img(center_3d) # project 3D center to image plane + center_3d = center_3d[0] # shape adjustment + if random_flip_flag: # random flip for center3d + center_3d[0] = img_size[0] - center_3d[0] + center_3d = affine_transform(center_3d.reshape(-1), trans) + + # filter 3d center out of img + proj_inside_img = True + + if center_3d[0] < 0 or center_3d[0] >= self.resolution[0]: + proj_inside_img = False + if center_3d[1] < 0 or center_3d[1] >= self.resolution[1]: + proj_inside_img = False + + if not proj_inside_img: + continue + + # class + labels[i] = cur_obj["label"] + + # encoding 2d/3d boxes + w, h = bbox_2d[2] - bbox_2d[0], bbox_2d[3] - bbox_2d[1] + size_2d[i] = 1.0 * w, 1.0 * h + + center_2d_norm = center_2d / self.resolution + size_2d_norm = size_2d[i] / self.resolution + + corner_2d_norm = corner_2d + corner_2d_norm[0:2] = corner_2d[0:2] / self.resolution + corner_2d_norm[2:4] = corner_2d[2:4] / self.resolution + center_3d_norm = center_3d / self.resolution + + k, r = center_3d_norm[0] - corner_2d_norm[0], corner_2d_norm[2] - center_3d_norm[0] + t, b = center_3d_norm[1] - corner_2d_norm[1], corner_2d_norm[3] - center_3d_norm[1] + + if k < 0 or r < 0 or t < 0 or b < 0: + continue + + boxes[i] = center_2d_norm[0], center_2d_norm[1], size_2d_norm[0], size_2d_norm[1] + boxes_3d[i] = center_3d_norm[0], center_3d_norm[1], k, r, t, b + + # encoding depth + depth[i] = cur_obj["location"][-1] * crop_scale + + # encoding heading angle + heading_angle = calib.ry2alpha(cur_obj["rotation_y"], (bbox2d[i][0] + bbox2d[i][2]) / 2) + if heading_angle > np.pi: + heading_angle -= 2 * np.pi # check range + if heading_angle < -np.pi: + heading_angle += 2 * np.pi + heading_bin[i], heading_res[i] = angle2class(heading_angle) + + # encoding size_3d + src_size_3d[i] = np.array([cur_obj["dimensions"]], dtype=np.float32) + size_3d[i] = src_size_3d[i] + + # filter out the samples with truncated or occluded + if cur_obj["truncated"] <= 0.5 and cur_obj["occluded"] <= 2: + mask_2d[i] = 1 + + # collect return data + targets_for_train = { + "labels": labels[mask_2d], + "boxes": boxes[mask_2d], + "boxes_3d": boxes_3d[mask_2d], + "depth": depth[mask_2d], + "size_2d": size_2d[mask_2d], + "size_3d": size_3d[mask_2d], + "heading_bin": heading_bin[mask_2d], + "heading_res": heading_res[mask_2d], + } + + return img, targets_for_train, img_size + + def _reformate_for_kitti_metric(self, annotations: dict[str, Any]) -> dict[str, np.array]: + """Reformat the annotation for KITTI metric.""" + return { + "name": np.array([obj["name"] for obj in annotations]), + "alpha": np.array([obj["alpha"] for obj in annotations]), + "bbox": np.array([obj["bbox"] for obj in annotations]).reshape(-1, 4), + "dimensions": np.array([obj["dimensions"] for obj in annotations]).reshape(-1, 3), + "location": np.array([obj["location"] for obj in annotations]).reshape(-1, 3), + "rotation_y": np.array([obj["rotation_y"] for obj in annotations]), + "occluded": np.array([obj["occluded"] for obj in annotations]), + "truncated": np.array([obj["truncated"] for 
obj in annotations]), + } diff --git a/src/otx/core/data/dataset/segmentation.py b/src/otx/core/data/dataset/segmentation.py index 363a15e84cc..ee23be6090e 100644 --- a/src/otx/core/data/dataset/segmentation.py +++ b/src/otx/core/data/dataset/segmentation.py @@ -14,7 +14,6 @@ from datumaro.components.annotation import Ellipse, Image, Mask, Polygon from torchvision import tv_tensors -from otx.core.data.dataset.base import Transforms from otx.core.data.entity.base import ImageInfo from otx.core.data.entity.segmentation import SegBatchDataEntity, SegDataEntity from otx.core.data.mem_cache import NULL_MEM_CACHE_HANDLER, MemCacheHandlerBase @@ -27,6 +26,8 @@ from datumaro import Dataset as DmDataset from datumaro import DatasetItem + from otx.core.data.dataset.base import Transforms + # NOTE: It is copied from https://github.com/openvinotoolkit/datumaro/pull/1409 # It will be replaced in the future. diff --git a/src/otx/core/data/dataset/utils/__init__.py b/src/otx/core/data/dataset/utils/__init__.py new file mode 100644 index 00000000000..0c75fd7a904 --- /dev/null +++ b/src/otx/core/data/dataset/utils/__init__.py @@ -0,0 +1,4 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""Module defines utils for OTXDatasets.""" diff --git a/src/otx/core/data/dataset/utils/kitti_utils.py b/src/otx/core/data/dataset/utils/kitti_utils.py new file mode 100644 index 00000000000..1ee16c41733 --- /dev/null +++ b/src/otx/core/data/dataset/utils/kitti_utils.py @@ -0,0 +1,299 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""Module defines utils for KITTI Dataset.""" + +# flake8: noqa +# mypy: ignore-errors + +import cv2 +import numpy as np + + +def get_calib_from_file(calib_file): + with open(calib_file) as f: + lines = f.readlines() + + obj = lines[2].strip().split(" ")[1:] + P2 = np.array(obj, dtype=np.float32) + obj = lines[3].strip().split(" ")[1:] + P3 = np.array(obj, dtype=np.float32) + obj = lines[4].strip().split(" ")[1:] + R0 = np.array(obj, dtype=np.float32) + obj = lines[5].strip().split(" ")[1:] + Tr_velo_to_cam = np.array(obj, dtype=np.float32) + + return { + "P2": P2.reshape(3, 4), + "P3": P3.reshape(3, 4), + "R0": R0.reshape(3, 3), + "Tr_velo2cam": Tr_velo_to_cam.reshape(3, 4), + } + + +class Calibration: + def __init__(self, calib_file): + if isinstance(calib_file, str): + calib = get_calib_from_file(calib_file) + else: + calib = calib_file + + self.P2 = calib["P2"] # 3 x 4 + self.R0 = calib["R0"] # 3 x 3 + self.V2C = calib["Tr_velo2cam"] # 3 x 4 + self.C2V = self.inverse_rigid_trans(self.V2C) + + # Camera intrinsics and extrinsics + self.cu = self.P2[0, 2] + self.cv = self.P2[1, 2] + self.fu = self.P2[0, 0] + self.fv = self.P2[1, 1] + self.tx = self.P2[0, 3] / (-self.fu) + self.ty = self.P2[1, 3] / (-self.fv) + + def cart_to_hom(self, pts): + """:param pts: (N, 3 or 2) + :return pts_hom: (N, 4 or 3) + """ + pts_hom = np.hstack((pts, np.ones((pts.shape[0], 1), dtype=np.float32))) + return pts_hom + + def lidar_to_rect(self, pts_lidar): + """:param pts_lidar: (N, 3) + :return pts_rect: (N, 3) + """ + pts_lidar_hom = self.cart_to_hom(pts_lidar) + pts_rect = np.dot(pts_lidar_hom, np.dot(self.V2C.T, self.R0.T)) + # pts_rect = reduce(np.dot, (pts_lidar_hom, self.V2C.T, self.R0.T)) + return pts_rect + + def rect_to_lidar(self, pts_rect): + pts_ref = np.transpose(np.dot(np.linalg.inv(self.R0), np.transpose(pts_rect))) + pts_ref = self.cart_to_hom(pts_ref) # nx4 + return np.dot(pts_ref, np.transpose(self.C2V)) + + def 
rect_to_img(self, pts_rect): + """:param pts_rect: (N, 3) + :return pts_img: (N, 2) + """ + pts_rect_hom = self.cart_to_hom(pts_rect) + pts_2d_hom = np.dot(pts_rect_hom, self.P2.T) + pts_img = (pts_2d_hom[:, 0:2].T / pts_rect_hom[:, 2]).T # (N, 2) + pts_rect_depth = pts_2d_hom[:, 2] - self.P2.T[3, 2] # depth in rect camera coord + return pts_img, pts_rect_depth + + def lidar_to_img(self, pts_lidar): + """:param pts_lidar: (N, 3) + :return pts_img: (N, 2) + """ + pts_rect = self.lidar_to_rect(pts_lidar) + pts_img, pts_depth = self.rect_to_img(pts_rect) + return pts_img, pts_depth + + def img_to_rect(self, u, v, depth_rect): + """:param u: (N) + :param v: (N) + :param depth_rect: (N) + :return: + """ + x = ((u - self.cu) * depth_rect) / self.fu + self.tx + y = ((v - self.cv) * depth_rect) / self.fv + self.ty + pts_rect = np.concatenate((x.reshape(-1, 1), y.reshape(-1, 1), depth_rect.reshape(-1, 1)), axis=1) + return pts_rect + + def depthmap_to_rect(self, depth_map): + """:param depth_map: (H, W), depth_map + :return: + """ + x_range = np.arange(0, depth_map.shape[1]) + y_range = np.arange(0, depth_map.shape[0]) + x_idxs, y_idxs = np.meshgrid(x_range, y_range) + x_idxs, y_idxs = x_idxs.reshape(-1), y_idxs.reshape(-1) + depth = depth_map[y_idxs, x_idxs] + pts_rect = self.img_to_rect(x_idxs, y_idxs, depth) + return pts_rect, x_idxs, y_idxs + + def corners3d_to_img_boxes(self, corners3d): + """:param corners3d: (N, 8, 3) corners in rect coordinate + :return: boxes: (None, 4) [x1, y1, x2, y2] in rgb coordinate + :return: boxes_corner: (None, 8) [xi, yi] in rgb coordinate + """ + sample_num = corners3d.shape[0] + corners3d_hom = np.concatenate((corners3d, np.ones((sample_num, 8, 1))), axis=2) # (N, 8, 4) + + img_pts = np.matmul(corners3d_hom, self.P2.T) # (N, 8, 3) + + x, y = img_pts[:, :, 0] / img_pts[:, :, 2], img_pts[:, :, 1] / img_pts[:, :, 2] + x1, y1 = np.min(x, axis=1), np.min(y, axis=1) + x2, y2 = np.max(x, axis=1), np.max(y, axis=1) + + boxes = np.concatenate((x1.reshape(-1, 1), y1.reshape(-1, 1), x2.reshape(-1, 1), y2.reshape(-1, 1)), axis=1) + boxes_corner = np.concatenate((x.reshape(-1, 8, 1), y.reshape(-1, 8, 1)), axis=2) + + return boxes, boxes_corner + + def camera_dis_to_rect(self, u, v, d): + """Can only process valid u, v, d, which means u, v can not beyond the image shape, reprojection error 0.02 + :param u: (N) + :param v: (N) + :param d: (N), the distance between camera and 3d points, d^2 = x^2 + y^2 + z^2 + :return: + """ + assert self.fu == self.fv, "%.8f != %.8f" % (self.fu, self.fv) + fd = np.sqrt((u - self.cu) ** 2 + (v - self.cv) ** 2 + self.fu**2) + x = ((u - self.cu) * d) / fd + self.tx + y = ((v - self.cv) * d) / fd + self.ty + z = np.sqrt(d**2 - x**2 - y**2) + pts_rect = np.concatenate((x.reshape(-1, 1), y.reshape(-1, 1), z.reshape(-1, 1)), axis=1) + return pts_rect + + def inverse_rigid_trans(self, Tr): + """Inverse a rigid body transform matrix (3x4 as [R|t]) + [R'|-R't; 0|1] + """ + inv_Tr = np.zeros_like(Tr) # 3x4 + inv_Tr[0:3, 0:3] = np.transpose(Tr[0:3, 0:3]) + inv_Tr[0:3, 3] = np.dot(-np.transpose(Tr[0:3, 0:3]), Tr[0:3, 3]) + return inv_Tr + + def alpha2ry(self, alpha, u): + """Get rotation_y by alpha + theta - 180 + alpha : Observation angle of object, ranging [-pi..pi] + x : Object center x to the camera center (x-W/2), in pixels + rotation_y : Rotation ry around Y-axis in camera coordinates [-pi..pi] + """ + ry = alpha + np.arctan2(u - self.cu, self.fu) + + if ry > np.pi: + ry -= 2 * np.pi + if ry < -np.pi: + ry += 2 * np.pi + + return ry + + def 
ry2alpha(self, ry, u): + alpha = ry - np.arctan2(u - self.cu, self.fu) + + if alpha > np.pi: + alpha -= 2 * np.pi + if alpha < -np.pi: + alpha += 2 * np.pi + + return alpha + + def flip(self, img_size): + wsize = 4 + hsize = 2 + p2ds = ( + np.concatenate( + [ + np.expand_dims(np.tile(np.expand_dims(np.linspace(0, img_size[0], wsize), 0), [hsize, 1]), -1), + np.expand_dims(np.tile(np.expand_dims(np.linspace(0, img_size[1], hsize), 1), [1, wsize]), -1), + np.linspace(2, 78, wsize * hsize).reshape(hsize, wsize, 1), + ], + -1, + ) + ).reshape(-1, 3) + p3ds = self.img_to_rect(p2ds[:, 0:1], p2ds[:, 1:2], p2ds[:, 2:3]) + p3ds[:, 0] *= -1 + p2ds[:, 0] = img_size[0] - p2ds[:, 0] + + # self.P2[0,3] *= -1 + cos_matrix = np.zeros([wsize * hsize, 2, 7]) + cos_matrix[:, 0, 0] = p3ds[:, 0] + cos_matrix[:, 0, 1] = cos_matrix[:, 1, 2] = p3ds[:, 2] + cos_matrix[:, 1, 0] = p3ds[:, 1] + cos_matrix[:, 0, 3] = cos_matrix[:, 1, 4] = 1 + cos_matrix[:, :, -2] = -p2ds[:, :2] + cos_matrix[:, :, -1] = -p2ds[:, :2] * p3ds[:, 2:3] + new_calib = np.linalg.svd(cos_matrix.reshape(-1, 7))[-1][-1] + new_calib /= new_calib[-1] + + new_calib_matrix = np.zeros([4, 3]).astype(np.float32) + new_calib_matrix[0, 0] = new_calib_matrix[1, 1] = new_calib[0] + new_calib_matrix[2, 0:2] = new_calib[1:3] + new_calib_matrix[3, :] = new_calib[3:6] + new_calib_matrix[-1, -1] = self.P2[-1, -1] + self.P2 = new_calib_matrix.T + self.cu = self.P2[0, 2] + self.cv = self.P2[1, 2] + self.fu = self.P2[0, 0] + self.fv = self.P2[1, 1] + self.tx = self.P2[0, 3] / (-self.fu) + self.ty = self.P2[1, 3] / (-self.fv) + + +def get_dir(src_point, rot_rad): + sn, cs = np.sin(rot_rad), np.cos(rot_rad) + + src_result = [0, 0] + src_result[0] = src_point[0] * cs - src_point[1] * sn + src_result[1] = src_point[0] * sn + src_point[1] * cs + + return src_result + + +def get_3rd_point(a, b): + direct = a - b + return b + np.array([-direct[1], direct[0]], dtype=np.float32) + + +def get_affine_transform(center, scale, rot, output_size, shift=np.array([0, 0], dtype=np.float32), inv=0): + if not isinstance(scale, np.ndarray) and not isinstance(scale, list): + scale = np.array([scale, scale], dtype=np.float32) + + scale_tmp = scale + src_w = scale_tmp[0] + dst_w = output_size[0] + dst_h = output_size[1] + + rot_rad = np.pi * rot / 180 + src_dir = get_dir([0, src_w * -0.5], rot_rad) + dst_dir = np.array([0, dst_w * -0.5], np.float32) + + src = np.zeros((3, 2), dtype=np.float32) + dst = np.zeros((3, 2), dtype=np.float32) + src[0, :] = center + scale_tmp * shift + src[1, :] = center + src_dir + scale_tmp * shift + dst[0, :] = [dst_w * 0.5, dst_h * 0.5] + dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5], np.float32) + dst_dir + + src[2:, :] = get_3rd_point(src[0, :], src[1, :]) + dst[2:, :] = get_3rd_point(dst[0, :], dst[1, :]) + + if inv: + trans = cv2.getAffineTransform(np.float32(src), np.float32(dst)) + trans_inv = cv2.getAffineTransform(np.float32(dst), np.float32(src)) + return trans, trans_inv + else: + trans = cv2.getAffineTransform(np.float32(src), np.float32(dst)) + return trans + + +def affine_transform(pt, t): + new_pt = np.array([pt[0], pt[1], 1.0], dtype=np.float32).T + new_pt = np.dot(t, new_pt) + return new_pt[:2] + + +def angle2class(angle): + """Convert continuous angle to discrete class and residual.""" + num_heading_bin = 12 + angle = angle % (2 * np.pi) + assert angle >= 0 and angle <= 2 * np.pi + angle_per_class = 2 * np.pi / float(num_heading_bin) + shifted_angle = (angle + angle_per_class / 2) % (2 * np.pi) + class_id = int(shifted_angle / 
angle_per_class) + residual_angle = shifted_angle - (class_id * angle_per_class + angle_per_class / 2) + return class_id, residual_angle + + +def class2angle(cls: int, residual: float, to_label_format: bool = False) -> float: + """Inverse function to angle2class.""" + num_heading_bin = 12 + angle_per_class = 2 * np.pi / float(num_heading_bin) + angle_center = cls * angle_per_class + angle = angle_center + residual + if to_label_format and angle > np.pi: + angle = angle - 2 * np.pi + return angle diff --git a/src/otx/core/data/entity/object_detection_3d.py b/src/otx/core/data/entity/object_detection_3d.py new file mode 100644 index 00000000000..564ea283a60 --- /dev/null +++ b/src/otx/core/data/entity/object_detection_3d.py @@ -0,0 +1,148 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""Module for OTX detection data entities.""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any + +from torchvision import tv_tensors + +from otx.core.data.entity.base import ( + OTXBatchDataEntity, + OTXBatchPredEntity, + OTXDataEntity, + OTXPredEntity, +) +from otx.core.data.entity.utils import register_pytree_node +from otx.core.types.task import OTXTaskType + +if TYPE_CHECKING: + from torch import LongTensor, Tensor + + +@register_pytree_node +@dataclass +class Det3DDataEntity(OTXDataEntity): + """Data entity for detection task. + + :param bboxes: Bbox annotations as top-left-bottom-right + (x1, y1, x2, y2) format with absolute coordinate values + :param labels: Bbox labels as integer indices + """ + + @property + def task(self) -> OTXTaskType: + """OTX Task type definition.""" + return OTXTaskType.OBJECT_DETECTION_3D + + boxes: tv_tensors.BoundingBoxes + calib_matrix: Tensor + boxes_3d: Tensor + size_2d: Tensor + size_3d: Tensor + depth: Tensor + heading_angle: Tensor + labels: LongTensor + original_kitti_format: list[dict[str, Any]] | None + + +@dataclass +class Det3DPredEntity(OTXPredEntity, Det3DDataEntity): + """Data entity to represent the detection model output prediction.""" + + +@dataclass +class Det3DBatchDataEntity(OTXBatchDataEntity[Det3DDataEntity]): + """Data entity for detection task. + + :param bboxes: A list of bbox annotations as top-left-bottom-right + (x1, y1, x2, y2) format with absolute coordinate values + :param labels: A list of bbox labels as integer indices + """ # TODO(Kirill): UPDATE! + + images: Tensor + boxes: list[tv_tensors.BoundingBoxes] + calib_matrix: list[Tensor] + boxes_3d: list[Tensor] + size_2d: list[Tensor] + size_3d: list[Tensor] + depth: list[Tensor] + heading_angle: list[Tensor] + labels: list[LongTensor] + original_kitti_format: list[list[dict[str, Any]] | None] + + @property + def task(self) -> OTXTaskType: + """OTX Task type definition.""" + return OTXTaskType.OBJECT_DETECTION_3D + + @classmethod + def collate_fn( + cls, + entities: list[Det3DDataEntity], + stack_images: bool = True, + ) -> Det3DBatchDataEntity: + """Collection function to collect `DetDataEntity` into `DetBatchDataEntity` in data loader. + + Args: + entities: List of `DetDataEntity`. + stack_images: If True, return 4D B x C x H x W image tensor. + Otherwise return a list of 3D C x H x W image tensor. 
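+
+        Note:
+            Only the images are stacked into a single batch tensor; boxes,
+            labels, calibration matrices and the remaining 3D targets are
+            kept as per-sample lists because the number of objects differs
+            between images.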
+ + Returns: + Collated `DetBatchDataEntity` + """ + batch_data = super().collate_fn(entities, stack_images=stack_images) + batch_input_shape = tuple(batch_data.images[0].size()[-2:]) + for info in batch_data.imgs_info: + info.batch_input_shape = batch_input_shape + return Det3DBatchDataEntity( + batch_size=batch_data.batch_size, + images=batch_data.images, + imgs_info=batch_data.imgs_info, + boxes=[entity.boxes for entity in entities], + labels=[entity.labels for entity in entities], + calib_matrix=[entity.calib_matrix for entity in entities], + boxes_3d=[entity.boxes_3d for entity in entities], + size_2d=[entity.size_2d for entity in entities], + size_3d=[entity.size_3d for entity in entities], + depth=[entity.depth for entity in entities], + heading_angle=[entity.heading_angle for entity in entities], + original_kitti_format=[entity.original_kitti_format for entity in entities], + ) + + def pin_memory(self) -> Det3DBatchDataEntity: + """Pin memory for member tensor variables.""" + return ( + super() + .pin_memory() + .wrap( + boxes=[tv_tensors.wrap(bbox.pin_memory(), like=bbox) for bbox in self.boxes], + labels=[label.pin_memory() for label in self.labels], + calib_matrix=[calib_matrix.pin_memory() for calib_matrix in self.calib_matrix], + boxes_3d=[boxes_3d.pin_memory() for boxes_3d in self.boxes_3d], + size_2d=[size_2d.pin_memory() for size_2d in self.size_2d], + size_3d=[size_3d.pin_memory() for size_3d in self.size_3d], + depth=[depth.pin_memory() for depth in self.depth], + heading_angle=[heading_angle.pin_memory() for heading_angle in self.heading_angle], + original_kitti_format=self.original_kitti_format, + ) + ) + + +@dataclass +class Det3DBatchPredEntity(OTXBatchPredEntity, Det3DBatchDataEntity): + """Data entity to represent model output predictions for detection task.""" + + boxes: tv_tensors.BoundingBoxes + scores: Tensor + calib_matrix: Tensor + boxes_3d: Tensor + size_2d: Tensor + size_3d: Tensor + depth: Tensor + heading_angle: Tensor + labels: Tensor diff --git a/src/otx/core/data/factory.py b/src/otx/core/data/factory.py index 5d424673fee..92f836c5605 100644 --- a/src/otx/core/data/factory.py +++ b/src/otx/core/data/factory.py @@ -156,4 +156,9 @@ def create( # noqa: PLR0911 return OTXKeypointDetectionDataset(**common_kwargs) + if task == OTXTaskType.OBJECT_DETECTION_3D: + from .dataset.object_detection_3d import OTX3DObjectDetectionDataset + + return OTX3DObjectDetectionDataset(**common_kwargs) + raise NotImplementedError(task) diff --git a/src/otx/core/data/module.py b/src/otx/core/data/module.py index 259fa6f6447..f9b7cac8fd4 100644 --- a/src/otx/core/data/module.py +++ b/src/otx/core/data/module.py @@ -184,6 +184,7 @@ def __init__( # noqa: PLR0913 ) label_infos: list[LabelInfo] = [] + for name, dm_subset in dataset.subsets().items(): if name not in config_mapping: log.warning(f"{name} is not available. 
Skip it") @@ -209,7 +210,6 @@ def __init__( # noqa: PLR0913 tile_config=self.tile_config, ) self.subsets[name] = dataset - label_infos += [self.subsets[name].label_info] log.info(f"Add name: {name}, self.subsets: {self.subsets}") diff --git a/src/otx/core/data/pre_filtering.py b/src/otx/core/data/pre_filtering.py index f78d8fe1db2..b3898a78f04 100644 --- a/src/otx/core/data/pre_filtering.py +++ b/src/otx/core/data/pre_filtering.py @@ -72,7 +72,11 @@ def is_valid_annot(item: DatasetItem, annotation: Annotation) -> bool: # noqa: return True -def remove_unused_labels(dataset: DmDataset, data_format: str, ignore_index: int | None) -> DmDataset: +def remove_unused_labels( + dataset: DmDataset, + data_format: str, + ignore_index: int | None, +) -> DmDataset: """Remove unused labels in Datumaro dataset.""" original_categories: list[str] = dataset.get_label_cat_names() used_labels: list[int] = list({ann.label for item in dataset for ann in item.annotations}) @@ -99,4 +103,5 @@ def remove_unused_labels(dataset: DmDataset, data_format: str, ignore_index: int mapping = {original_categories[idx]: original_categories[idx] for idx in used_labels} msg = "There are unused labels in dataset, they will be filtered out before training." warnings.warn(msg, stacklevel=2) + return dataset.transform("remap_labels", mapping=mapping, default="delete") diff --git a/src/otx/core/data/utils/utils.py b/src/otx/core/data/utils/utils.py index 1beb7fec87d..0bdb4a48baa 100644 --- a/src/otx/core/data/utils/utils.py +++ b/src/otx/core/data/utils/utils.py @@ -10,7 +10,8 @@ import cv2 import numpy as np -from datumaro.components.annotation import AnnotationType, Bbox, Polygon, _Shape +from datumaro.components.annotation import AnnotationType, Bbox, Polygon +from datumaro.components.annotation import Shape as _Shape from otx.core.types import OTXTaskType diff --git a/src/otx/core/exporter/base.py b/src/otx/core/exporter/base.py index 8a9cbd8d1de..85d77fe4799 100644 --- a/src/otx/core/exporter/base.py +++ b/src/otx/core/exporter/base.py @@ -57,6 +57,7 @@ def __init__( pad_value: int = 0, swap_rgb: bool = False, output_names: list[str] | None = None, + input_names: list[str] | None = None, ) -> None: self.input_size = input_size self.mean = mean @@ -66,6 +67,7 @@ def __init__( self.swap_rgb = swap_rgb self.task_level_export_parameters = task_level_export_parameters self.output_names = output_names + self.input_names = input_names @property def metadata(self) -> dict[tuple[str, str], str]: @@ -319,6 +321,40 @@ def _postprocess_openvino_model(self, exported_model: openvino.Model) -> openvin ) raise RuntimeError(msg) + if self.input_names is not None: + if len(exported_model.inputs) >= len(self.input_names): + if len(exported_model.inputs) != len(self.input_names): + msg = ( + "Number of model inputs is greater than the number" + " of input names to assign. Please check input_names" + " argument of the exporter's constructor." + ) + log.warning(msg) + + for i, name in enumerate(self.input_names): + traced_names = exported_model.inputs[i].get_names() + name_found = False + for traced_name in traced_names: + if name in traced_name: + name_found = True + break + name_found = name_found and bool(len(traced_names)) + + if not name_found: + msg = ( + f"{name} is not matched with the converted model's traced input names: {traced_names}." + " Please check input_names argument of the exporter's constructor." 
+ ) + log.warning(msg) + + exported_model.inputs[i].tensor.set_names({name}) + else: + msg = ( + "Model has less inputs than the number of input names provided: " + f"{len(exported_model.inputs)} vs {len(self.input_names)}" + ) + raise RuntimeError(msg) + if self.metadata is not None: export_metadata = self._extend_model_metadata(self.metadata) exported_model = self._embed_openvino_ir_metadata(exported_model, export_metadata) diff --git a/src/otx/core/exporter/detection_3d.py b/src/otx/core/exporter/detection_3d.py new file mode 100644 index 00000000000..17b1377436a --- /dev/null +++ b/src/otx/core/exporter/detection_3d.py @@ -0,0 +1,100 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""Class definition for object detection 3D model exporter used in OTX.""" + +from __future__ import annotations + +import logging as log +from pathlib import Path +from typing import TYPE_CHECKING + +import onnx +import openvino +import torch + +from otx.core.exporter.native import OTXNativeModelExporter +from otx.core.types.precision import OTXPrecisionType + +if TYPE_CHECKING: + from otx.core.model.base import OTXModel + + +class OTXObjectDetection3DExporter(OTXNativeModelExporter): + """Class definition for object detection 3D model exporter used in OTX.""" + + def to_openvino( + self, + model: OTXModel, + output_dir: Path, + base_model_name: str = "exported_model", + precision: OTXPrecisionType = OTXPrecisionType.FP32, + ) -> Path: + """Export to OpenVINO Intermediate Representation format. + + In this implementation the export is done only via standard OV/ONNX tools. + """ + device = next(model.parameters()).device + dummy_tensor = torch.rand(self.input_size).to(device) + dummy_calib_matrix = torch.rand(1, 3, 4).to(device) + dummy_image_sizes = torch.tensor([self.input_size[::-1][:2]]).to(device) + + exported_model = openvino.convert_model( + model, + example_input={"images": dummy_tensor, "calib_matrix": dummy_calib_matrix, "img_sizes": dummy_image_sizes}, + input=( + openvino.runtime.PartialShape(self.input_size), + openvino.runtime.PartialShape([1, 3, 4]), + openvino.runtime.PartialShape([1, 2]), + ), + ) + exported_model = self._postprocess_openvino_model(exported_model) + + save_path = output_dir / (base_model_name + ".xml") + openvino.save_model(exported_model, save_path, compress_to_fp16=(precision == OTXPrecisionType.FP16)) + log.info("Converting to OpenVINO is done.") + + return Path(save_path) + + def to_onnx( + self, + model: OTXModel, + output_dir: Path, + base_model_name: str = "exported_model", + precision: OTXPrecisionType = OTXPrecisionType.FP32, + embed_metadata: bool = True, + ) -> Path: + """Export the given PyTorch model to ONNX format and save it to the specified output directory. + + Args: + model (OTXModel): The PyTorch model to be exported. + output_dir (Path): The directory where the ONNX model will be saved. + base_model_name (str, optional): The base name for the exported model. Defaults to "exported_model". + precision (OTXPrecisionType, optional): The precision type for the exported model. + Defaults to OTXPrecisionType.FP32. + embed_metadata (bool, optional): Whether to embed metadata in the ONNX model. Defaults to True. + + Returns: + Path: The path to the saved ONNX model. 
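+
+        Note:
+            The model is traced with dummy inputs: a random image tensor of
+            the exporter's input size, a random 1x3x4 calibration matrix and
+            the corresponding image sizes, mirroring the three inputs used
+            for the OpenVINO export.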
+ """ + dummy_tensor = torch.rand(self.input_size) + dummy_calib_matrix = torch.rand(1, 3, 4) + dummy_image_sizes = torch.tensor([self.input_size[::-1][:2]]) + dummy_inputs = {"images": dummy_tensor, "calib_matrix": dummy_calib_matrix, "img_sizes": dummy_image_sizes} + + save_path = str(output_dir / (base_model_name + ".onnx")) + + torch.onnx.export( + model, + args=tuple(dummy_inputs.values()), + f=save_path, + **self.onnx_export_configuration, + ) + + onnx_model = onnx.load(save_path) + onnx_model = self._postprocess_onnx_model(onnx_model, embed_metadata, precision) + + onnx.save(onnx_model, save_path) + log.info("Converting to ONNX is done.") + + return Path(save_path) diff --git a/src/otx/core/exporter/native.py b/src/otx/core/exporter/native.py index 11f90b9451d..5f901ede2b5 100644 --- a/src/otx/core/exporter/native.py +++ b/src/otx/core/exporter/native.py @@ -37,6 +37,7 @@ def __init__( via_onnx: bool = False, onnx_export_configuration: dict[str, Any] | None = None, output_names: list[str] | None = None, + input_names: list[str] | None = None, ) -> None: super().__init__( task_level_export_parameters=task_level_export_parameters, @@ -47,6 +48,7 @@ def __init__( pad_value=pad_value, swap_rgb=swap_rgb, output_names=output_names, + input_names=input_names, ) self.via_onnx = via_onnx self.onnx_export_configuration = onnx_export_configuration if onnx_export_configuration is not None else {} diff --git a/src/otx/core/metrics/average_precision_3d.py b/src/otx/core/metrics/average_precision_3d.py new file mode 100644 index 00000000000..7b8530ba684 --- /dev/null +++ b/src/otx/core/metrics/average_precision_3d.py @@ -0,0 +1,67 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""Module for OTX metric used for 3D object detection tasks.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +from torch import Tensor +from torchmetrics import Metric + +from otx.core.metrics.kitti_3d_eval import get_coco_eval_result + +if TYPE_CHECKING: + import numpy as np + + from otx.core.types.label import LabelInfo + + +class KittiMetric(Metric): + """Computes the 2D/3D average precision (coco style) for object detection 3d task. + + Args: + label_info (int): Dataclass including label information. + """ + + def __init__( + self, + label_info: LabelInfo, + ): + super().__init__() + + self.label_info: LabelInfo = label_info + self.reset() + + def reset(self) -> None: + """Reset for every validation and test epoch. + + Please be careful that some variables should not be reset for each epoch. + """ + super().reset() + self.preds: list[dict[str, np.array]] = [] + self.targets: list[dict[str, np.array]] = [] + + def update(self, preds: list[dict[str, Tensor]], target: list[dict[str, Tensor]]) -> None: + """Update total predictions and targets from given batch predicitons and targets.""" + self.preds.extend(preds) + self.targets.extend(target) + + def compute(self) -> dict: + """Compute metrics for 3d object detection.""" + current_classes = self.label_info.label_names + map_bbox, map_3d = get_coco_eval_result( + self.targets, + self.preds, + current_classes=[curcls.lower() for curcls in current_classes], + ) + # use moderate difficulty as final score. Average across all calsses. 
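+        # map_3d and map_bbox have shape [num_class, num_difficulty];
+        # column 1 is the moderate split, which is selected below and then
+        # averaged over classes.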
+ return {"mAP_bbox_3d": Tensor([map_3d[:, 1].mean()]), "mAP_bbox_2d": Tensor([map_bbox[:, 1].mean()])} + + +def _kitti_metric_measure_callable(label_info: LabelInfo) -> KittiMetric: + return KittiMetric(label_info=label_info) + + +KittiMetricCallable = _kitti_metric_measure_callable diff --git a/src/otx/core/metrics/kitti_3d_eval/__init__.py b/src/otx/core/metrics/kitti_3d_eval/__init__.py new file mode 100644 index 00000000000..236c84981f2 --- /dev/null +++ b/src/otx/core/metrics/kitti_3d_eval/__init__.py @@ -0,0 +1,8 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""Module for kitti 3d evaluation.""" + +from .eval import get_coco_eval_result + +__all__ = ["get_coco_eval_result"] diff --git a/src/otx/core/metrics/kitti_3d_eval/eval.py b/src/otx/core/metrics/kitti_3d_eval/eval.py new file mode 100644 index 00000000000..951cc96538d --- /dev/null +++ b/src/otx/core/metrics/kitti_3d_eval/eval.py @@ -0,0 +1,811 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""KITTI 3D eval for OTX.""" + +# flake8: noqa +# mypy: ignore-errors + +from __future__ import annotations + +import io as sysio +from typing import Any + +import numba +import numpy as np +import torch + +if torch.cuda.is_available(): + from .rotate_gpu_iou import rotate_iou_eval_gpu as rotate_iou_eval +else: + from .rotate_iou import rotate_iou_eval_cpu as rotate_iou_eval + + +@numba.jit(nopython=True) +def get_thresholds( + scores: np.ndarray, # 1D array of confidence scores + num_gt: int, # Number of ground truth objects + num_sample_pts: int = 41, # Number of sample points used to compute recall thresholds +) -> np.ndarray: # 1D array of recall thresholds + """Compute recall thresholds for a given score array. + + Args: + scores (np.ndarray): 1D array of confidence scores. + num_gt (int): Number of ground truth objects. + num_sample_pts (int, optional): Number of sample points used to + compute recall thresholds. Defaults to 41. + + Returns: + np.ndarray: 1D array of recall thresholds. + """ + scores.sort() + scores = scores[::-1] + current_recall = 0 + thresholds = [] + for i, score in enumerate(scores): + l_recall = (i + 1) / num_gt + if i < (len(scores) - 1): + r_recall = (i + 2) / num_gt + else: + r_recall = l_recall + if ((r_recall - current_recall) < (current_recall - l_recall)) and (i < (len(scores) - 1)): + continue + # recall = l_recall + thresholds.append(score) + current_recall += 1 / (num_sample_pts - 1.0) + return thresholds + + +def clean_data( + gt_anno: dict, # ground truth annotations + dt_anno: dict, # detection results + current_class: str, # the current class name + difficulty: int, # the difficulty level +) -> tuple: # (num_valid_gt, ignored_gt, ignored_dt, dc_bboxes) + """Filter out the objects that are not in the current class. + + Args: + gt_anno (dict): Ground truth annotations. + dt_anno (dict): Detection results. + current_class (str): The current class name. + difficulty (int): The difficulty level. + + Returns: + tuple: The number of valid objects, ignored_gt, ignored_dt, and dc_bboxes. 
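+
+    Note:
+        In ignored_gt / ignored_dt, 0 marks boxes that take part in the
+        evaluation, 1 marks boxes that are ignored (neighbouring classes,
+        ground truth beyond the occlusion/truncation/height limits of the
+        difficulty, or detections below the minimum height), and -1 marks
+        boxes of unrelated classes.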
+ """ + MIN_HEIGHT = [40, 25, 25] + MAX_OCCLUSION = [0, 1, 2] + MAX_TRUNCATION = [0.15, 0.3, 0.5] + dc_bboxes, ignored_gt, ignored_dt = [], [], [] + num_gt = len(gt_anno["name"]) + num_dt = len(dt_anno["name"]) + num_valid_gt = 0 + for i in range(num_gt): + bbox = gt_anno["bbox"][i] + gt_name = gt_anno["name"][i].lower() + height = bbox[3] - bbox[1] + valid_class = -1 + if gt_name == current_class: + valid_class = 1 + elif current_class == "Pedestrian".lower() and "Person_sitting".lower() == gt_name: + valid_class = 0 + elif current_class == "Car".lower() and "Van".lower() == gt_name: + valid_class = 0 + else: + valid_class = -1 + ignore = False + if ( + (gt_anno["occluded"][i] > MAX_OCCLUSION[difficulty]) + or (gt_anno["truncated"][i] > MAX_TRUNCATION[difficulty]) + or (height <= MIN_HEIGHT[difficulty]) + ): + # if gt_anno["difficulty"][i] > difficulty or gt_anno["difficulty"][i] == -1: + ignore = True + if valid_class == 1 and not ignore: + ignored_gt.append(0) + num_valid_gt += 1 + elif valid_class == 0 or (ignore and (valid_class == 1)): + ignored_gt.append(1) + else: + ignored_gt.append(-1) + # for i in range(num_gt): + if gt_anno["name"][i] == "DontCare": + dc_bboxes.append(gt_anno["bbox"][i]) + for i in range(num_dt): + if dt_anno["name"][i].lower() == current_class: + valid_class = 1 + else: + valid_class = -1 + height = abs(dt_anno["bbox"][i, 3] - dt_anno["bbox"][i, 1]) + if height < MIN_HEIGHT[difficulty]: + ignored_dt.append(1) + elif valid_class == 1: + ignored_dt.append(0) + else: + ignored_dt.append(-1) + + return num_valid_gt, ignored_gt, ignored_dt, dc_bboxes + + +@numba.jit(nopython=True) +def image_box_overlap( + boxes: np.ndarray, # shape: (N, 4) + query_boxes: np.ndarray, # shape: (K, 4) + criterion: int = -1, # default overlap criterion, -1: intersection over union, 0: intersection over box area, 1: intersection over query box area +) -> np.ndarray: # shape: (N, K) + """Args: + boxes (np.ndarray): shape: (N, 4), 2D boxes, (x1, y1, x2, y2) + query_boxes (np.ndarray): shape: (K, 4), 2D boxes, (x1, y1, x2, y2) + criterion (int, optional): overlap criterion, -1: intersection over union, 0: intersection over box area, 1: intersection over query box area. Defaults to -1. + + Returns: + np.ndarray: shape: (N, K), overlap between boxes and query_boxes + """ + N = boxes.shape[0] + K = query_boxes.shape[0] + overlaps = np.zeros((N, K), dtype=boxes.dtype) + for k in range(K): + qbox_area = (query_boxes[k, 2] - query_boxes[k, 0]) * (query_boxes[k, 3] - query_boxes[k, 1]) + for n in range(N): + iw = min(boxes[n, 2], query_boxes[k, 2]) - max(boxes[n, 0], query_boxes[k, 0]) + if iw > 0: + ih = min(boxes[n, 3], query_boxes[k, 3]) - max(boxes[n, 1], query_boxes[k, 1]) + if ih > 0: + if criterion == -1: + ua = (boxes[n, 2] - boxes[n, 0]) * (boxes[n, 3] - boxes[n, 1]) + qbox_area - iw * ih + elif criterion == 0: + ua = (boxes[n, 2] - boxes[n, 0]) * (boxes[n, 3] - boxes[n, 1]) + elif criterion == 1: + ua = qbox_area + else: + ua = 1.0 + overlaps[n, k] = iw * ih / ua + return overlaps + + +@numba.jit(nopython=True) +def d3_box_overlap_kernel( + boxes: np.ndarray, # shape: (N, 7) + qboxes: np.ndarray, # shape: (K, 7) + rinc: np.ndarray, # shape: (N, K) + criterion: int = -1, # default overlap criterion +) -> None: + """Args: + boxes: Array of shape (N, 7) representing N 3D boxes. + qboxes: Array of shape (K, 7) representing K 3D boxes. + rinc: Array of shape (N, K) representing the overlap between boxes + and qboxes. + criterion: Overlap criterion. Defaults to -1. 
If -1, uses the + intersection-over-union (IoU) criterion. If 0, uses the + intersection-over-area1 criterion. If 1, uses the + intersection-over-area2 criterion. + + Returns: + None + """ + # ONLY support overlap in CAMERA, not lidar. + N, K = boxes.shape[0], qboxes.shape[0] + for i in range(N): + for j in range(K): + if rinc[i, j] > 0: + # iw = (min(boxes[i, 1] + boxes[i, 4], qboxes[j, 1] + + # qboxes[j, 4]) - max(boxes[i, 1], qboxes[j, 1])) + iw = min(boxes[i, 1], qboxes[j, 1]) - max(boxes[i, 1] - boxes[i, 4], qboxes[j, 1] - qboxes[j, 4]) + + if iw > 0: + area1 = boxes[i, 3] * boxes[i, 4] * boxes[i, 5] + area2 = qboxes[j, 3] * qboxes[j, 4] * qboxes[j, 5] + inc = iw * rinc[i, j] + if criterion == -1: + ua = area1 + area2 - inc + elif criterion == 0: + ua = area1 + elif criterion == 1: + ua = area2 + else: + ua = inc + rinc[i, j] = inc / ua + else: + rinc[i, j] = 0.0 + + +@numba.jit(nopython=True) +def compute_statistics_jit( + overlaps: np.ndarray, # shape: (total_dt_num, total_gt_num) + gt_datas: np.ndarray, # shape: (total_gt_num, 7) + dt_datas: np.ndarray, # shape: (total_dt_num, 7) + ignored_gt: list[int], # shape: (total_gt_num) + ignored_det: list[int], # shape: (total_dt_num) + dc_bboxes: np.ndarray, # shape: (total_dc_num, 4) + metric: int, + min_overlap: float, + thresh: float = 0, + compute_fp: bool = False, + compute_aos: bool = False, +) -> tuple[int, int, int, float, np.ndarray]: + """This function computes statistics of an evaluation. + + Args: + overlaps (np.ndarray): Overlap between dt and gt bboxes. + gt_datas (np.ndarray): Ground truth data. + dt_datas (np.ndarray): Detection data. + ignored_gt (List[int]): Ignore ground truth indices. + ignored_det (List[int]): Ignore detection indices. + dc_bboxes (np.ndarray): Don't care bboxes. + metric (int): Evaluation metric. + min_overlap (float): Minimum overlap between dt and gt bboxes. + thresh (float): Detection score threshold. Defaults to 0. + compute_fp (bool): Whether to compute false positives. Defaults to False. + compute_aos (bool): Whether to compute average orientation similarity. Defaults to False. 
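+
+    Note:
+        `metric` follows the same encoding as the rest of this module:
+        0 for 2D bbox, 1 for BEV, 2 for 3D boxes; the entry points in this
+        module only call it with 0 and 2.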
+ + Returns: + Tuple[int, int, int, float, np.ndarray]: tp, fp, fn, similarity, thresholds + """ + det_size = dt_datas.shape[0] + gt_size = gt_datas.shape[0] + dt_scores = dt_datas[:, -1] + dt_alphas = dt_datas[:, 4] + gt_alphas = gt_datas[:, 4] + dt_bboxes = dt_datas[:, :4] + + assigned_detection = [False] * det_size + ignored_threshold = [False] * det_size + if compute_fp: + for i in range(det_size): + if dt_scores[i] < thresh: + ignored_threshold[i] = True + NO_DETECTION = -10000000 + tp, fp, fn, similarity = 0, 0, 0, 0 + # thresholds = [0.0] + # delta = [0.0] + thresholds = np.zeros((gt_size,)) + thresh_idx = 0 + delta = np.zeros((gt_size,)) + delta_idx = 0 + for i in range(gt_size): + if ignored_gt[i] == -1: + continue + det_idx = -1 + valid_detection = NO_DETECTION + max_overlap = 0 + assigned_ignored_det = False + + for j in range(det_size): + if ignored_det[j] == -1: + continue + if assigned_detection[j]: + continue + if ignored_threshold[j]: + continue + overlap = overlaps[j, i] + dt_score = dt_scores[j] + if not compute_fp and (overlap > min_overlap) and dt_score > valid_detection: + det_idx = j + valid_detection = dt_score + elif ( + compute_fp + and (overlap > min_overlap) + and (overlap > max_overlap or assigned_ignored_det) + and ignored_det[j] == 0 + ): + max_overlap = overlap + det_idx = j + valid_detection = 1 + assigned_ignored_det = False + elif compute_fp and (overlap > min_overlap) and (valid_detection == NO_DETECTION) and ignored_det[j] == 1: + det_idx = j + valid_detection = 1 + assigned_ignored_det = True + + if (valid_detection == NO_DETECTION) and ignored_gt[i] == 0: + fn += 1 + elif (valid_detection != NO_DETECTION) and (ignored_gt[i] == 1 or ignored_det[det_idx] == 1): + assigned_detection[det_idx] = True + elif valid_detection != NO_DETECTION: + tp += 1 + # thresholds.append(dt_scores[det_idx]) + thresholds[thresh_idx] = dt_scores[det_idx] + thresh_idx += 1 + if compute_aos: + # delta.append(gt_alphas[i] - dt_alphas[det_idx]) + delta[delta_idx] = gt_alphas[i] - dt_alphas[det_idx] + delta_idx += 1 + + assigned_detection[det_idx] = True + if compute_fp: + for i in range(det_size): + if not (assigned_detection[i] or ignored_det[i] == -1 or ignored_det[i] == 1 or ignored_threshold[i]): + fp += 1 + nstuff = 0 + if metric == 0: + overlaps_dt_dc = image_box_overlap(dt_bboxes, dc_bboxes, 0) + for i in range(dc_bboxes.shape[0]): + for j in range(det_size): + if assigned_detection[j]: + continue + if ignored_det[j] == -1 or ignored_det[j] == 1: + continue + if ignored_threshold[j]: + continue + if overlaps_dt_dc[j, i] > min_overlap: + assigned_detection[j] = True + nstuff += 1 + fp -= nstuff + if compute_aos: + tmp = np.zeros((fp + delta_idx,)) + # tmp = [0] * fp + for i in range(delta_idx): + tmp[i + fp] = (1.0 + np.cos(delta[i])) / 2.0 + # tmp.append((1.0 + np.cos(delta[i])) / 2.0) + # assert len(tmp) == fp + tp + # assert len(delta) == tp + if tp > 0 or fp > 0: + similarity = np.sum(tmp) + else: + similarity = -1 + return tp, fp, fn, similarity, thresholds[:thresh_idx] + + +@numba.jit(nopython=True) +def get_split_parts(num: int, num_part: int) -> list[int]: + """Split a number into parts. + + Args: + num (int): The number to split. + num_part (int): The number of parts to split into. + + Returns: + List[int]: A list of the parts. 
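+
+    Example:
+        get_split_parts(10, 3) returns [3, 3, 3, 1];
+        get_split_parts(9, 3) returns [3, 3, 3].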
+ """ + same_part = num // num_part + remain_num = num % num_part + if same_part == 0: + return [num] + + if remain_num == 0: + return [same_part] * num_part + else: + return [same_part] * num_part + [remain_num] + + +@numba.jit(nopython=True) +def fused_compute_statistics( + overlaps: np.ndarray, # shape: (total_dt_num, total_gt_num) + pr: np.ndarray, # shape: (num_thresholds, 4) + gt_nums: np.ndarray, # shape: (num_samples) + dt_nums: np.ndarray, # shape: (num_samples) + dc_nums: np.ndarray, # shape: (num_samples) + gt_datas: np.ndarray, # shape: (total_gt_num, 7) + dt_datas: np.ndarray, # shape: (total_dt_num, 7) + dontcares: np.ndarray, # shape: (total_dc_num, 4) + ignored_gts: np.ndarray, # shape: (total_gt_num) + ignored_dets: np.ndarray, # shape: (total_dt_num) + metric: int, + min_overlap: float, + thresholds: np.ndarray, # shape: (num_thresholds) + compute_aos: bool = False, +) -> None: + """Fast compute statistics. Must be used in CAMERA coordinate system. + + Args: + overlaps: 2D array of shape (total_dt_num, total_gt_num) + [dt_num, gt_num] is the overlap between dt_num-th detection + and gt_num-th ground truth + pr: 2D array of shape (num_thresholds, 4) + [t, 0] is the number of true positives at threshold t + [t, 1] is the number of false positives at threshold t + [t, 2] is the number of false negatives at threshold t + [t, 3] is the similarity at threshold t + gt_nums: 1D array of shape (num_samples) + gt_nums[i] is the number of ground truths in i-th sample + dt_nums: 1D array of shape (num_samples) + dt_nums[i] is the number of detections in i-th sample + dc_nums: 1D array of shape (num_samples) + dc_nums[i] is the number of dontcare areas in i-th sample + gt_datas: 2D array of shape (total_gt_num, 7) + gt_datas[i] is the i-th ground truth box + dt_datas: 2D array of shape (total_dt_num, 7) + dt_datas[i] is the i-th detection box + dontcares: 2D array of shape (total_dc_num, 4) + dontcares[i] is the i-th dontcare area + ignored_gts: 1D array of shape (total_gt_num) + ignored_gts[i] is 1 if the i-th ground truth is ignored, 0 otherwise + ignored_dets: 1D array of shape (total_dt_num) + ignored_dets[i] is 1 if the i-th detection is ignored, 0 otherwise + metric: Eval type. 0: bbox, 1: bev, 2: 3d + min_overlap: Min overlap + thresholds: 1D array of shape (num_thresholds) + thresholds[i] is the i-th threshold + compute_aos: Whether to compute aos + """ + gt_num = 0 + dt_num = 0 + dc_num = 0 + for i in range(gt_nums.shape[0]): + for t, thresh in enumerate(thresholds): + overlap = overlaps[dt_num : dt_num + dt_nums[i], gt_num : gt_num + gt_nums[i]] + gt_data = gt_datas[gt_num : gt_num + gt_nums[i]] + dt_data = dt_datas[dt_num : dt_num + dt_nums[i]] + ignored_gt = ignored_gts[gt_num : gt_num + gt_nums[i]] + ignored_det = ignored_dets[dt_num : dt_num + dt_nums[i]] + dontcare = dontcares[dc_num : dc_num + dc_nums[i]] + tp, fp, fn, similarity, _ = compute_statistics_jit( + overlap, + gt_data, + dt_data, + ignored_gt, + ignored_det, + dontcare, + metric, + min_overlap=min_overlap, + thresh=thresh, + compute_fp=True, + compute_aos=compute_aos, + ) + pr[t, 0] += tp + pr[t, 1] += fp + pr[t, 2] += fn + if similarity != -1: + pr[t, 3] += similarity + gt_num += gt_nums[i] + dt_num += dt_nums[i] + dc_num += dc_nums[i] + + +def calculate_iou_partly( + gt_annos: list[dict[str, Any]], + dt_annos: list[dict[str, Any]], + metric: int, + num_parts: int = 50, +) -> tuple[list[np.ndarray], list[np.ndarray], np.ndarray, np.ndarray]: + """Fast iou algorithm. 
This function can be used independently to + do result analysis. Must be used in CAMERA coordinate system. + + Args: + gt_annos: List of dict, must from get_label_annos() in kitti_common.py + dt_annos: List of dict, must from get_label_annos() in kitti_common.py + metric: Eval type. 0: bbox, 1: bev, 2: 3d + num_parts: Int, a parameter for fast calculate algorithm + + Returns: + Tuple of + overlaps: List of numpy arrays, shape (num_gt, num_dt) + parted_overlaps: List of numpy arrays, shape (num_gt, num_dt) + total_gt_num: Numpy array, shape (num_images,) + total_dt_num: Numpy array, shape (num_images,) + """ + + def d3_box_overlap(boxes, qboxes, criterion=-1): + rinc = rotate_iou_eval(boxes[:, [0, 2, 3, 5, 6]], qboxes[:, [0, 2, 3, 5, 6]], 2) + d3_box_overlap_kernel(boxes, qboxes, rinc, criterion) + return rinc + + assert len(gt_annos) == len(dt_annos) + total_dt_num = np.stack([len(a["name"]) for a in dt_annos], 0) + total_gt_num = np.stack([len(a["name"]) for a in gt_annos], 0) + num_examples = len(gt_annos) + split_parts = get_split_parts(num_examples, num_parts) + parted_overlaps = [] + example_idx = 0 + + for num_part in split_parts: + gt_annos_part = gt_annos[example_idx : example_idx + num_part] + dt_annos_part = dt_annos[example_idx : example_idx + num_part] + if metric == 0: + gt_boxes = np.concatenate([a["bbox"] for a in gt_annos_part], 0) + dt_boxes = np.concatenate([a["bbox"] for a in dt_annos_part], 0) + overlap_part = image_box_overlap(gt_boxes, dt_boxes) + elif metric == 2: + loc = np.concatenate([a["location"] for a in gt_annos_part], 0) + dims = np.concatenate([a["dimensions"] for a in gt_annos_part], 0) + rots = np.concatenate([a["rotation_y"] for a in gt_annos_part], 0) + gt_boxes = np.concatenate([loc, dims, rots[..., np.newaxis]], axis=1) + loc = np.concatenate([a["location"] for a in dt_annos_part], 0) + dims = np.concatenate([a["dimensions"] for a in dt_annos_part], 0) + rots = np.concatenate([a["rotation_y"] for a in dt_annos_part], 0) + dt_boxes = np.concatenate([loc, dims, rots[..., np.newaxis]], axis=1) + overlap_part = d3_box_overlap(gt_boxes, dt_boxes).astype(np.float64) + else: + raise ValueError("unknown metric") + parted_overlaps.append(overlap_part) + example_idx += num_part + overlaps = [] + example_idx = 0 + for j, num_part in enumerate(split_parts): + gt_annos_part = gt_annos[example_idx : example_idx + num_part] + dt_annos_part = dt_annos[example_idx : example_idx + num_part] + gt_num_idx, dt_num_idx = 0, 0 + for i in range(num_part): + gt_box_num = total_gt_num[example_idx + i] + dt_box_num = total_dt_num[example_idx + i] + overlaps.append( + parted_overlaps[j][gt_num_idx : gt_num_idx + gt_box_num, dt_num_idx : dt_num_idx + dt_box_num], + ) + gt_num_idx += gt_box_num + dt_num_idx += dt_box_num + example_idx += num_part + + return overlaps, parted_overlaps, total_gt_num, total_dt_num + + +def _prepare_data( + gt_annos: list[dict[str, Any]], + dt_annos: list[dict[str, Any]], + current_class: str, + difficulty: int, +) -> tuple[list[np.ndarray], list[np.ndarray], list[np.ndarray], list[np.ndarray], list[np.ndarray], np.ndarray, int]: + """Prepare data for evaluation. + + Args: + gt_annos (List[Dict[str, Any]]): Ground truth annotations. + dt_annos (List[Dict[str, Any]]): Detection annotations. + current_class (str): Current class name. + difficulty (int): Difficulty level. 
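+            Follows the convention used by eval_class: 0 = easy,
+            1 = normal (moderate), 2 = hard.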
+ + Returns: + Tuple[List[np.ndarray], List[np.ndarray], List[np.ndarray], List[np.ndarray], List[np.ndarray], np.ndarray, int]: + gt_datas_list, dt_datas_list, ignored_gts, ignored_dets, dontcares, total_dc_num, total_num_valid_gt + """ + gt_datas_list = [] + dt_datas_list = [] + total_dc_num = [] + ignored_gts, ignored_dets, dontcares = [], [], [] + total_num_valid_gt = 0 + for i in range(len(gt_annos)): + rets = clean_data(gt_annos[i], dt_annos[i], current_class, difficulty) + num_valid_gt, ignored_gt, ignored_det, dc_bboxes = rets + ignored_gts.append(np.array(ignored_gt, dtype=np.int64)) + ignored_dets.append(np.array(ignored_det, dtype=np.int64)) + if len(dc_bboxes) == 0: + dc_bboxes = np.zeros((0, 4)).astype(np.float64) + else: + dc_bboxes = np.stack(dc_bboxes, 0).astype(np.float64) + total_dc_num.append(dc_bboxes.shape[0]) + dontcares.append(dc_bboxes) + total_num_valid_gt += num_valid_gt + gt_datas = np.concatenate([gt_annos[i]["bbox"], gt_annos[i]["alpha"][..., np.newaxis]], 1) + dt_datas = np.concatenate( + [ + dt_annos[i]["bbox"], + dt_annos[i]["alpha"][..., np.newaxis], + dt_annos[i]["score"][..., np.newaxis], + ], + 1, + ) + gt_datas_list.append(gt_datas) + dt_datas_list.append(dt_datas) + total_dc_num = np.stack(total_dc_num, axis=0) + return (gt_datas_list, dt_datas_list, ignored_gts, ignored_dets, dontcares, total_dc_num, total_num_valid_gt) + + +def eval_class( + gt_annos: list[dict[str, Any]], + dt_annos: list[dict[str, Any]], + current_classes: list[str], + difficultys: list[int], + metric: int, + min_overlaps: np.ndarray, + compute_aos: bool = False, + num_parts: int = 50, +) -> dict[str, np.ndarray]: + """Kitti eval. support 2d/bev/3d/aos eval. support 0.5:0.05:0.95 coco AP. + + Args: + gt_annos: dict, must from get_label_annos() in kitti_common.py + dt_annos: dict, must from get_label_annos() in kitti_common.py + current_classes: list of label names + difficultys: list of int. eval difficulty, 0: easy, 1: normal, 2: hard + metric: eval type. 0: bbox, 1: bev, 2: 3d + min_overlaps: float, min overlap. format: [num_overlap, metric, class]. + num_parts: int. 
a parameter for fast calculate algorithm + + Returns: + dict of recall, precision and aos + """ + assert len(gt_annos) == len(dt_annos) + num_examples = len(gt_annos) + split_parts = get_split_parts(num_examples, num_parts) + + rets = calculate_iou_partly(dt_annos, gt_annos, metric, num_parts) + overlaps, parted_overlaps, total_dt_num, total_gt_num = rets + N_SAMPLE_PTS = 41 + num_minoverlap = len(min_overlaps) + num_class = len(current_classes) + num_difficulty = len(difficultys) + precision = np.zeros([num_class, num_difficulty, num_minoverlap, N_SAMPLE_PTS]) + recall = np.zeros([num_class, num_difficulty, num_minoverlap, N_SAMPLE_PTS]) + aos = np.zeros([num_class, num_difficulty, num_minoverlap, N_SAMPLE_PTS]) + for m, current_class in enumerate(current_classes): + for l, difficulty in enumerate(difficultys): + rets = _prepare_data(gt_annos, dt_annos, current_class, difficulty) + ( + gt_datas_list, + dt_datas_list, + ignored_gts, + ignored_dets, + dontcares, + total_dc_num, + total_num_valid_gt, + ) = rets + for k, min_overlap in enumerate(min_overlaps[:, metric, m]): + thresholdss = [] + for i in range(len(gt_annos)): + rets = compute_statistics_jit( + overlaps[i], + gt_datas_list[i], + dt_datas_list[i], + ignored_gts[i], + ignored_dets[i], + dontcares[i], + metric, + min_overlap=min_overlap, + thresh=0.0, + compute_fp=False, + ) + tp, fp, fn, similarity, thresholds = rets + thresholdss += thresholds.tolist() + thresholdss = np.array(thresholdss) + thresholds = get_thresholds(thresholdss, total_num_valid_gt) + thresholds = np.array(thresholds) + pr = np.zeros([len(thresholds), 4]) + idx = 0 + for j, num_part in enumerate(split_parts): + gt_datas_part = np.concatenate(gt_datas_list[idx : idx + num_part], 0) + dt_datas_part = np.concatenate(dt_datas_list[idx : idx + num_part], 0) + dc_datas_part = np.concatenate(dontcares[idx : idx + num_part], 0) + ignored_dets_part = np.concatenate(ignored_dets[idx : idx + num_part], 0) + ignored_gts_part = np.concatenate(ignored_gts[idx : idx + num_part], 0) + fused_compute_statistics( + parted_overlaps[j], + pr, + total_gt_num[idx : idx + num_part], + total_dt_num[idx : idx + num_part], + total_dc_num[idx : idx + num_part], + gt_datas_part, + dt_datas_part, + dc_datas_part, + ignored_gts_part, + ignored_dets_part, + metric, + min_overlap=min_overlap, + thresholds=thresholds, + compute_aos=compute_aos, + ) + idx += num_part + for i in range(len(thresholds)): + recall[m, l, k, i] = pr[i, 0] / (pr[i, 0] + pr[i, 2]) + precision[m, l, k, i] = pr[i, 0] / (pr[i, 0] + pr[i, 1]) + if compute_aos: + aos[m, l, k, i] = pr[i, 3] / (pr[i, 0] + pr[i, 1]) + for i in range(len(thresholds)): + precision[m, l, k, i] = np.max(precision[m, l, k, i:], axis=-1) + recall[m, l, k, i] = np.max(recall[m, l, k, i:], axis=-1) + if compute_aos: + aos[m, l, k, i] = np.max(aos[m, l, k, i:], axis=-1) + ret_dict = { + "recall": recall, + "precision": precision, + "orientation": aos, + } + return ret_dict + + +def print_str(value, *arg, sstream=None): + if sstream is None: + sstream = sysio.StringIO() + sstream.truncate(0) + sstream.seek(0) + print(value, *arg, file=sstream) + return sstream.getvalue() + + +def do_eval_cut_version( + gt_annos: list[dict[str, Any]], # type hint + dt_annos: list[dict[str, Any]], # type hint + current_classes: list[str], # type hint + min_overlaps: np.ndarray, # type hint + compute_aos: bool = False, # type hint +) -> tuple[float, float]: # type hint + """Evaluates detections with COCO style AP. 
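+
+    The AP is the 11-point interpolated average over the 41 recall sample
+    points (every 4th point), computed for the 2D bbox overlap (metric 0)
+    and the 3D box overlap (metric 2) over the three difficulty levels.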
+ + Args: + gt_annos (List[dict]): Ground truth annotations. + dt_annos (List[dict]): Detection results. + current_classes (List[str]): Classes to evaluate. + min_overlaps (np.ndarray): Overlap ranges. + compute_aos (bool): Whether to compute aos. + + Returns: + Tuple[float, float]: Bounding box and 3D bounding box AP. + """ + + def _get_mAP(prec: np.ndarray) -> np.ndarray: + sums = 0 + for i in range(0, prec.shape[-1], 4): + sums = sums + prec[..., i] + return sums / 11 * 100 + + # min_overlaps: [num_minoverlap, metric, num_class] + difficultys = [0, 1, 2] + ret = eval_class(gt_annos, dt_annos, current_classes, difficultys, 0, min_overlaps, compute_aos) + # ret: [num_class, num_diff, num_minoverlap, num_sample_points] + # get 2D bbox mAP + mAP_bbox = _get_mAP(ret["precision"]) + + # get 3D bbox mAP + ret = eval_class(gt_annos, dt_annos, current_classes, difficultys, 2, min_overlaps) + mAP_3d = _get_mAP(ret["precision"]) + + return mAP_bbox, mAP_3d + + +def get_coco_eval_result( + gt_annos: list[dict], + dt_annos: list[dict], + current_classes: list[str], +) -> tuple[np.ndarray, np.ndarray]: + """Evaluates detections with COCO style AP. + + Args: + gt_annos (List[dict]): Ground truth annotations. + dt_annos (List[dict]): Detection results. + current_classes (List[str]): Classes to evaluate. + + Returns: + Tuple[np.ndarray, np.ndarray]: Bounding box and 3D bounding box AP. + """ + + def do_coco_style_eval( + gt_annos: list[dict], + dt_annos: list[dict], + current_classes: list[str], + overlap_ranges: np.ndarray, + compute_aos: bool, + ) -> tuple[np.ndarray, np.ndarray]: + """Evaluates detections with COCO style AP. + + Args: + gt_annos (List[dict]): Ground truth annotations. + dt_annos (List[dict]): Detection results. + current_classes (List[str]): Classes to evaluate. + overlap_ranges (np.ndarray): Overlap ranges. + compute_aos (bool): Whether to compute aos. + + Returns: + Tuple[np.ndarray, np.ndarray]: Bounding box and 3D bounding box AP. 
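+
+            The AP of each class is averaged over 10 IoU thresholds taken
+            from `overlap_ranges` (0.5 to 0.95 in the caller), which is what
+            makes the evaluation "COCO style".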
+ """ + min_overlaps = np.zeros([10, *overlap_ranges.shape[1:]]) + + for i in range(overlap_ranges.shape[1]): + for j in range(overlap_ranges.shape[2]): + min_overlaps[:, i, j] = np.linspace(*overlap_ranges[:, i, j][:2], 10) + + mAP_bbox, mAP_3d = do_eval_cut_version(gt_annos, dt_annos, current_classes, min_overlaps, compute_aos) + + return mAP_bbox.mean(-1), mAP_3d.mean(-1) + + iou_range = [0.5, 0.95, 10] + if not isinstance(current_classes, (list, tuple)): + current_classes = [current_classes] + + overlap_ranges = np.zeros([3, 3, len(current_classes)]) + for i, curcls in enumerate(current_classes): + # IoU from 0.5 to 0.95 + overlap_ranges[:, :, i] = np.array(iou_range)[:, np.newaxis] + result = "" + # check whether alpha is valid + compute_aos = False + mAPbbox, mAP3d = do_coco_style_eval(gt_annos, dt_annos, current_classes, overlap_ranges, compute_aos) + + for j, curcls in enumerate(current_classes): + # mAP threshold array: [num_minoverlap, metric, class] + # mAP result: [num_class, num_diff, num_minoverlap] + o_range = np.array(iou_range)[[0, 2, 1]] + o_range[1] = (o_range[2] - o_range[0]) / (o_range[1] - 1) + result += print_str(f"{curcls} " "coco AP@{:.2f}:{:.2f}:{:.2f}:".format(*o_range)) + result += print_str(f"bbox AP:{mAPbbox[j, 0]:.2f}, {mAPbbox[j, 1]:.2f}, {mAPbbox[j, 2]:.2f}") + result += print_str(f"3d AP:{mAP3d[j, 0]:.2f}, {mAP3d[j, 1]:.2f}, {mAP3d[j, 2]:.2f}") + + print("\n COCO style evaluation results: \n", result) + + return mAPbbox, mAP3d diff --git a/src/otx/core/metrics/kitti_3d_eval/rotate_gpu_iou.py b/src/otx/core/metrics/kitti_3d_eval/rotate_gpu_iou.py new file mode 100644 index 00000000000..3aff2c4197a --- /dev/null +++ b/src/otx/core/metrics/kitti_3d_eval/rotate_gpu_iou.py @@ -0,0 +1,495 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""Rotate IoU for KITTI3D metric, gpu version.""" + +import math + +import numba +import numpy as np +from numba import cuda + + +@numba.jit(nopython=True) +def div_up(m: int, n: int) -> int: + """Divide m by n and round up to the nearest integer. + + Args: + m (int): Numerator. + n (int): Denominator. + + Returns: + int: Result of the division rounded up to the nearest integer. + """ + return m // n + (m % n > 0) + + +@cuda.jit("(float32[:], float32[:], float32[:])", device=True, inline=True) +def trangle_area(a: cuda.local.array, b: cuda.local.array, c: cuda.local.array) -> float: + """Calculate the area of a triangle given its three vertices. + + Args: + a (cuda.local.array): First vertex of the triangle. + b (cuda.local.array): Second vertex of the triangle. + c (cuda.local.array): Third vertex of the triangle. + + Returns: + float: Area of the triangle. + """ + return ((a[0] - c[0]) * (b[1] - c[1]) - (a[1] - c[1]) * (b[0] - c[0])) / 2.0 + + +@cuda.jit("(float32[:], int32)", device=True, inline=True) +def area(int_pts: cuda.local.array, num_of_inter: int) -> float: + """Calculate the area of a polygon using the given intersection points. + + Args: + int_pts (cuda.local.array): Array of intersection points, shape (num_of_inter * 2,). + num_of_inter (int): Number of intersection points. + + Returns: + float: The calculated area of the polygon. 
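+
+    Note:
+        The area is accumulated as a fan of triangles anchored at the first
+        vertex, so the intersection points must already be sorted into a
+        consistent convex order (see sort_vertex_in_convex_polygon).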
+ """ + area_val = 0.0 + for i in range(num_of_inter - 2): + area_val += abs(trangle_area(int_pts[:2], int_pts[2 * i + 2 : 2 * i + 4], int_pts[2 * i + 4 : 2 * i + 6])) + return area_val + + +@cuda.jit("(float32[:], int32)", device=True, inline=True) +def sort_vertex_in_convex_polygon(int_pts: cuda.local.array, num_of_inter: int) -> None: + """Sort the vertices of a convex polygon in counterclockwise order. + + Args: + int_pts (cuda.local.array): Array of intersection points. + num_of_inter (int): Number of intersection points. + """ + if num_of_inter > 0: + center = cuda.local.array((2,), dtype=numba.float32) + center[:] = 0.0 + for i in range(num_of_inter): + center[0] += int_pts[2 * i] + center[1] += int_pts[2 * i + 1] + center[0] /= num_of_inter + center[1] /= num_of_inter + v = cuda.local.array((2,), dtype=numba.float32) + vs = cuda.local.array((16,), dtype=numba.float32) + for i in range(num_of_inter): + v[0] = int_pts[2 * i] - center[0] + v[1] = int_pts[2 * i + 1] - center[1] + d = math.sqrt(v[0] * v[0] + v[1] * v[1]) + v[0] = v[0] / d + v[1] = v[1] / d + if v[1] < 0: + v[0] = -2 - v[0] + vs[i] = v[0] + j = 0 + temp = 0 + for i in range(1, num_of_inter): + if vs[i - 1] > vs[i]: + temp = vs[i] + tx = int_pts[2 * i] + ty = int_pts[2 * i + 1] + j = i + while j > 0 and vs[j - 1] > temp: + vs[j] = vs[j - 1] + int_pts[j * 2] = int_pts[j * 2 - 2] + int_pts[j * 2 + 1] = int_pts[j * 2 - 1] + j -= 1 + + vs[j] = temp + int_pts[j * 2] = tx + int_pts[j * 2 + 1] = ty + + +@cuda.jit("(float32[:], float32[:], int32, int32, float32[:])", device=True, inline=True) +def line_segment_intersection( + pts1: cuda.local.array, # array of points representing the first line segment + pts2: cuda.local.array, # array of points representing the second line segment + i: int, # index of the first line segment + j: int, # index of the second line segment + temp_pts: cuda.local.array, # array to store the intersection point +) -> bool: + """Check if two line segments intersect and find the intersection point. + + Args: + pts1 (cuda.local.array): Array of points representing the first line segment. + pts2 (cuda.local.array): Array of points representing the second line segment. + i (int): Index of the first line segment. + j (int): Index of the second line segment. + temp_pts (cuda.local.array): Array to store the intersection point. + + Returns: + bool: True if the line segments intersect, False otherwise. 
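+
+    Note:
+        The "segments" are the i-th edge of the first box and the j-th edge
+        of the second box, i.e. the corner pairs (i, (i + 1) % 4) and
+        (j, (j + 1) % 4) of the flattened corner arrays.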
+ """ + a = cuda.local.array((2,), dtype=numba.float32) + b = cuda.local.array((2,), dtype=numba.float32) + c = cuda.local.array((2,), dtype=numba.float32) + d = cuda.local.array((2,), dtype=numba.float32) + + a[0] = pts1[2 * i] + a[1] = pts1[2 * i + 1] + + b[0] = pts1[2 * ((i + 1) % 4)] + b[1] = pts1[2 * ((i + 1) % 4) + 1] + + c[0] = pts2[2 * j] + c[1] = pts2[2 * j + 1] + + d[0] = pts2[2 * ((j + 1) % 4)] + d[1] = pts2[2 * ((j + 1) % 4) + 1] + ba0 = b[0] - a[0] + ba1 = b[1] - a[1] + da0 = d[0] - a[0] + ca0 = c[0] - a[0] + da1 = d[1] - a[1] + ca1 = c[1] - a[1] + acd = da1 * ca0 > ca1 * da0 + bcd = (d[1] - b[1]) * (c[0] - b[0]) > (c[1] - b[1]) * (d[0] - b[0]) + if acd != bcd: + abc = ca1 * ba0 > ba1 * ca0 + abd = da1 * ba0 > ba1 * da0 + if abc != abd: + dc0 = d[0] - c[0] + dc1 = d[1] - c[1] + abba = a[0] * b[1] - b[0] * a[1] + cddc = c[0] * d[1] - d[0] * c[1] + dh = ba1 * dc0 - ba0 * dc1 + dx = abba * dc0 - ba0 * cddc + dy = abba * dc1 - ba1 * cddc + temp_pts[0] = dx / dh + temp_pts[1] = dy / dh + return True + return False + + +@cuda.jit("(float32[:], float32[:], int32, int32, float32[:])", device=True, inline=True) +def line_segment_intersection_v1( + pts1: cuda.local.array, # array of points representing the first line segment + pts2: cuda.local.array, # array of points representing the second line segment + i: int, # index of the first line segment + j: int, # index of the second line segment + temp_pts: cuda.local.array, # array to store the intersection point +) -> bool: + """Check if two line segments intersect and find the intersection point using an alternative method. + + Args: + pts1(cuda.local.array): array of points representing the first line segment + pts2(cuda.local.array): cuda.local.array, array of points representing the second line segment + i(int): int, index of the first line segment + j(int): int, index of the second line segment + temp_pts(cuda.local.array): array to store the intersection point + + Returns: + bool: True if the line segments intersect, False otherwise + """ + a = cuda.local.array((2,), dtype=numba.float32) + b = cuda.local.array((2,), dtype=numba.float32) + c = cuda.local.array((2,), dtype=numba.float32) + d = cuda.local.array((2,), dtype=numba.float32) + + a[0] = pts1[2 * i] + a[1] = pts1[2 * i + 1] + + b[0] = pts1[2 * ((i + 1) % 4)] + b[1] = pts1[2 * ((i + 1) % 4) + 1] + + c[0] = pts2[2 * j] + c[1] = pts2[2 * j + 1] + + d[0] = pts2[2 * ((j + 1) % 4)] + d[1] = pts2[2 * ((j + 1) % 4) + 1] + + area_abc = trangle_area(a, b, c) + area_abd = trangle_area(a, b, d) + + if area_abc * area_abd >= 0: + return False + + area_cda = trangle_area(c, d, a) + area_cdb = area_cda + area_abc - area_abd + + if area_cda * area_cdb >= 0: + return False + t = area_cda / (area_abd - area_abc) + + dx = t * (b[0] - a[0]) + dy = t * (b[1] - a[1]) + temp_pts[0] = a[0] + dx + temp_pts[1] = a[1] + dy + return True + + +@cuda.jit("(float32, float32, float32[:])", device=True, inline=True) +def point_in_quadrilateral( + pt_x: float, # x coordinate of the point + pt_y: float, # y coordinate of the point + corners: cuda.local.array, # corners of the quadrilateral, shape (8,) +) -> bool: + """Check if a point is inside a quadrilateral. 
+ + Args: + pt_x (float): x coordinate of the point + pt_y (float): y coordinate of the point + corners (cuda.local.array): shape (8,), corners of the quadrilateral + + Returns: + bool: True if the point is inside the quadrilateral, False otherwise + """ + ab0 = corners[2] - corners[0] + ab1 = corners[3] - corners[1] + + ad0 = corners[6] - corners[0] + ad1 = corners[7] - corners[1] + + ap0 = pt_x - corners[0] + ap1 = pt_y - corners[1] + + abab = ab0 * ab0 + ab1 * ab1 + abap = ab0 * ap0 + ab1 * ap1 + adad = ad0 * ad0 + ad1 * ad1 + adap = ad0 * ap0 + ad1 * ap1 + + return abab >= abap and abap >= 0 and adad >= adap and adap >= 0 + + +@cuda.jit("(float32[:], float32[:], float32[:])", device=True, inline=True) +def quadrilateral_intersection( + pts1: cuda.local.array, # shape: (8,) + pts2: cuda.local.array, # shape: (8,) + int_pts: cuda.local.array, # shape: (16,) +) -> int: + """Compute the intersection points between two quadrilaterals. + + Args: + pts1(cuda.local.array): Array of points representing the first quadrilateral, shape (8,). + pts2(cuda.local.array): Array of points representing the second quadrilateral, shape (8,). + int_pts(cuda.local.array): Array to store the intersection points, shape (16,). + + Returns: + int: Number of intersection points. + """ + num_of_inter = 0 + for i in range(4): + if point_in_quadrilateral(pts1[2 * i], pts1[2 * i + 1], pts2): + int_pts[num_of_inter * 2] = pts1[2 * i] + int_pts[num_of_inter * 2 + 1] = pts1[2 * i + 1] + num_of_inter += 1 + if point_in_quadrilateral(pts2[2 * i], pts2[2 * i + 1], pts1): + int_pts[num_of_inter * 2] = pts2[2 * i] + int_pts[num_of_inter * 2 + 1] = pts2[2 * i + 1] + num_of_inter += 1 + temp_pts = cuda.local.array((2,), dtype=numba.float32) + for i in range(4): + for j in range(4): + has_pts = line_segment_intersection(pts1, pts2, i, j, temp_pts) + if has_pts: + int_pts[num_of_inter * 2] = temp_pts[0] + int_pts[num_of_inter * 2 + 1] = temp_pts[1] + num_of_inter += 1 + + return num_of_inter + + +@cuda.jit("(float32[:], float32[:])", device=True, inline=True) +def rbbox_to_corners( + corners: cuda.local.array, # shape: (8,) + rbbox: cuda.local.array, # shape: (5,) +) -> None: + """Convert a rotated bounding box to its corner points. + + Args: + corners (cuda.local.array): Array to store the corner points, shape (8,). + rbbox (cuda.local.array): Array representing the rotated bounding box, shape (5,). + The rotated bounding box is represented by (center_x, center_y, width, height, angle). + + Returns: + None + """ + # generate clockwise corners and rotate it clockwise + angle = rbbox[4] + a_cos = math.cos(angle) + a_sin = math.sin(angle) + center_x = rbbox[0] + center_y = rbbox[1] + x_d = rbbox[2] + y_d = rbbox[3] + corners_x = cuda.local.array((4,), dtype=numba.float32) + corners_y = cuda.local.array((4,), dtype=numba.float32) + corners_x[0] = -x_d / 2 + corners_x[1] = -x_d / 2 + corners_x[2] = x_d / 2 + corners_x[3] = x_d / 2 + corners_y[0] = -y_d / 2 + corners_y[1] = y_d / 2 + corners_y[2] = y_d / 2 + corners_y[3] = -y_d / 2 + for i in range(4): + corners[2 * i] = a_cos * corners_x[i] + a_sin * corners_y[i] + center_x + corners[2 * i + 1] = -a_sin * corners_x[i] + a_cos * corners_y[i] + center_y + + +@cuda.jit("(float32[:], float32[:])", device=True, inline=True) +def inter( + rbbox1: cuda.local.array, # shape: (5,) + rbbox2: cuda.local.array, # shape: (5,) +) -> float: # The intersection area of the two rotated bounding boxes. + """Calculate the intersection area of two rotated bounding boxes. 
+ + Args: + rbbox1 (ndarray): Array representing the first rotated bounding box. + The rotated bounding box is represented by (center_x, center_y, width, height, angle). + rbbox2 (ndarray): Array representing the second rotated bounding box. + The rotated bounding box is represented by (center_x, center_y, width, height, angle). + + Returns: + float: The intersection area of the two rotated bounding boxes. + """ + corners1 = cuda.local.array((8,), dtype=numba.float32) + corners2 = cuda.local.array((8,), dtype=numba.float32) + intersection_corners = cuda.local.array((16,), dtype=numba.float32) + + rbbox_to_corners(corners1, rbbox1) + rbbox_to_corners(corners2, rbbox2) + + num_intersection = quadrilateral_intersection(corners1, corners2, intersection_corners) + sort_vertex_in_convex_polygon(intersection_corners, num_intersection) + # print(intersection_corners.reshape([-1, 2])[:num_intersection]) + + return area(intersection_corners, num_intersection) + + +@cuda.jit("(float32[:], float32[:], int32)", device=True, inline=True) +def dev_rotate_iou_eval( + rbox1: cuda.shared.array, # shape: (5,) + rbox2: cuda.shared.array, # shape: (5,) + criterion: int = -1, # IoU criterion to use. Defaults to -1. +) -> float: # The IoU of the two rotated bounding boxes. + """Calculate the IoU of two rotated bounding boxes. + + Args: + rbox1 (cuda.shared.array): Array representing the first rotated bounding box. + The rotated bounding box is represented by (center_x, center_y, width, height, angle). + rbox2 (cuda.shared.array): Array representing the second rotated bounding box. + The rotated bounding box is represented by (center_x, center_y, width, height, angle). + criterion (int): The method to calculate the IoU. + -1: Calculate the IoU. + 0: Calculate the IoU with first box as the reference. + 1: Calculate the IoU with second box as the reference. + + Returns: + float: The IoU of the two rotated bounding boxes. + """ + area1 = rbox1[2] * rbox1[3] + area2 = rbox2[2] * rbox2[3] + area_inter = inter(rbox1, rbox2) + if criterion == -1: + return area_inter / (area1 + area2 - area_inter) + if criterion == 0: + return area_inter / area1 + if criterion == 1: + return area_inter / area2 + return area_inter + + +@cuda.jit("(int64, int64, float32[:], float32[:], float32[:], int32)", fastmath=False) +def rotate_iou_kernel_eval( + n: int, + k: int, + dev_boxes: cuda.shared.array, + dev_query_boxes: cuda.shared.array, + dev_iou: cuda.shared.array, + criterion: int = -1, +) -> None: + """Calculate the IoU of two rotated bounding boxes. + + Args: + N (int): Number of boxes. + K (int): Number of query boxes. + dev_boxes (cuda.shared.array): Array representing the boxes. + dev_query_boxes (cuda.shared.array): Array representing the query boxes. + dev_iou (cuda.shared.array): Array to store the IoU values. + criterion (int): The method to calculate the IoU. + -1: Calculate the IoU. + 0: Calculate the IoU with the first box as the reference. + 1: Calculate the IoU with the second box as the reference. 
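+
+    Note:
+        Each thread block caches up to 64 boxes and 64 query boxes in shared memory and
+        fills one 64x64 tile of the flattened (n, k) IoU matrix, so the kernel is expected
+        to be launched on a 2D grid of (div_up(n, 64), div_up(k, 64)) blocks with 64 threads
+        each, as done by rotate_iou_eval_gpu below. A small sizing sketch (illustrative
+        numbers only):
+
+            blockspergrid = (div_up(100, 64), div_up(130, 64))  # (2, 3) for n=100, k=130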
+ + """ + threads_per_block = 8 * 8 + row_start = cuda.blockIdx.x + col_start = cuda.blockIdx.y + tx = cuda.threadIdx.x + row_size = min(n - row_start * threads_per_block, threads_per_block) + col_size = min(k - col_start * threads_per_block, threads_per_block) + block_boxes = cuda.shared.array(shape=(64 * 5,), dtype=numba.float32) + block_qboxes = cuda.shared.array(shape=(64 * 5,), dtype=numba.float32) + + dev_query_box_idx = threads_per_block * col_start + tx + dev_box_idx = threads_per_block * row_start + tx + if tx < col_size: + block_qboxes[tx * 5 + 0] = dev_query_boxes[dev_query_box_idx * 5 + 0] + block_qboxes[tx * 5 + 1] = dev_query_boxes[dev_query_box_idx * 5 + 1] + block_qboxes[tx * 5 + 2] = dev_query_boxes[dev_query_box_idx * 5 + 2] + block_qboxes[tx * 5 + 3] = dev_query_boxes[dev_query_box_idx * 5 + 3] + block_qboxes[tx * 5 + 4] = dev_query_boxes[dev_query_box_idx * 5 + 4] + if tx < row_size: + block_boxes[tx * 5 + 0] = dev_boxes[dev_box_idx * 5 + 0] + block_boxes[tx * 5 + 1] = dev_boxes[dev_box_idx * 5 + 1] + block_boxes[tx * 5 + 2] = dev_boxes[dev_box_idx * 5 + 2] + block_boxes[tx * 5 + 3] = dev_boxes[dev_box_idx * 5 + 3] + block_boxes[tx * 5 + 4] = dev_boxes[dev_box_idx * 5 + 4] + cuda.syncthreads() + if tx < row_size: + for i in range(col_size): + offset = row_start * threads_per_block * k + col_start * threads_per_block + tx * k + i + dev_iou[offset] = dev_rotate_iou_eval( + block_qboxes[i * 5 : i * 5 + 5], + block_boxes[tx * 5 : tx * 5 + 5], + criterion, + ) + + +def rotate_iou_eval_gpu( + boxes: np.ndarray, # shape: (n, 5) + query_boxes: np.ndarray, # shape: (k, 5) + criterion: int = -1, # IoU criterion to use. Defaults to -1. + device_id: int = 0, +) -> np.ndarray: # shape: (n, k) + """Compute the rotated box IoU between two sets of boxes on CPU. + + Args: + boxes (ndarray): Array of shape (n, 5) representing n rotated boxes. + Each box is represented by (center_x, center_y, width, height, angle). + query_boxes (ndarray): Array of shape (k, 5) representing k query rotated boxes. + Each query box is represented by (center_x, center_y, width, height, angle). + criterion (int, optional): IoU criterion to use. Defaults to -1. + + Returns: + ndarray: Array of shape (n, k) representing the IoU between each pair of boxes. 
+ """ + boxes = boxes.astype(np.float32) + query_boxes = query_boxes.astype(np.float32) + n = boxes.shape[0] + k = query_boxes.shape[0] + iou = np.zeros((n, k), dtype=np.float32) + if n == 0 or k == 0: + return iou + threads_per_block = 8 * 8 + cuda.select_device(device_id) + blockspergrid = (div_up(n, threads_per_block), div_up(k, threads_per_block)) + + stream = cuda.stream() + with stream.auto_synchronize(): + boxes_dev = cuda.to_device(boxes.reshape([-1]), stream) + query_boxes_dev = cuda.to_device(query_boxes.reshape([-1]), stream) + iou_dev = cuda.to_device(iou.reshape([-1]), stream) + rotate_iou_kernel_eval[blockspergrid, threads_per_block, stream]( + n, + k, + boxes_dev, + query_boxes_dev, + iou_dev, + criterion, + ) + iou_dev.copy_to_host(iou.reshape([-1]), stream=stream) + return iou.astype(boxes.dtype) diff --git a/src/otx/core/metrics/kitti_3d_eval/rotate_iou.py b/src/otx/core/metrics/kitti_3d_eval/rotate_iou.py new file mode 100644 index 00000000000..3458b6e1261 --- /dev/null +++ b/src/otx/core/metrics/kitti_3d_eval/rotate_iou.py @@ -0,0 +1,429 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""Rotate IoU for KITTI3D metric, cpu version.""" + +import math + +import numba +import numpy as np + + +@numba.jit(nopython=True) +def div_up(m: int, n: int) -> int: + """Divide m by n and round up to the nearest integer. + + Args: + m (int): Numerator. + n (int): Denominator. + + Returns: + int: Result of the division rounded up to the nearest integer. + """ + return m // n + (m % n > 0) + + +@numba.jit(nopython=True, inline="always") +def trangle_area(a: np.ndarray, b: np.ndarray, c: np.ndarray) -> float: + """Calculate the area of a triangle given its three vertices. + + Args: + a (ndarray): First vertex of the triangle. + b (ndarray): Second vertex of the triangle. + c (ndarray): Third vertex of the triangle. + + Returns: + float: Area of the triangle. + """ + return ((a[0] - c[0]) * (b[1] - c[1]) - (a[1] - c[1]) * (b[0] - c[0])) / 2.0 + + +@numba.jit(nopython=True, inline="always") +def area(int_pts: np.ndarray, num_of_inter: int) -> float: + """Calculate the area of a polygon using the given intersection points. + + Args: + int_pts (ndarray): Array of intersection points, shape (num_of_inter * 2,). + num_of_inter (int): Number of intersection points. + + Returns: + float: The calculated area of the polygon. + """ + area_val: float = 0.0 + for i in range(num_of_inter - 2): + area_val += abs( + trangle_area( + int_pts[:2], + int_pts[2 * i + 2 : 2 * i + 4], + int_pts[2 * i + 4 : 2 * i + 6], + ), + ) + return area_val + + +@numba.jit(nopython=True, inline="always") +def sort_vertex_in_convex_polygon(int_pts: np.ndarray, num_of_inter: int) -> None: + """Sort the vertices of a convex polygon in counterclockwise order. + + Args: + int_pts: Array of intersection points. + num_of_inter: Number of intersection points. 
+ """ + if num_of_inter > 0: + center = np.empty((2,), dtype=np.float32) + center[:] = 0.0 + for i in range(num_of_inter): + center[0] += int_pts[2 * i] + center[1] += int_pts[2 * i + 1] + center[0] /= num_of_inter + center[1] /= num_of_inter + v = np.empty((2,), dtype=np.float32) + vs = np.empty((16,), dtype=np.float32) + for i in range(num_of_inter): + v[0] = int_pts[2 * i] - center[0] + v[1] = int_pts[2 * i + 1] - center[1] + d = math.sqrt(v[0] * v[0] + v[1] * v[1]) + v[0] = v[0] / d + v[1] = v[1] / d + if v[1] < 0: + v[0] = -2 - v[0] + vs[i] = v[0] + j = 0 + temp = 0 + for i in range(1, num_of_inter): + if vs[i - 1] > vs[i]: + temp = vs[i] + tx = int_pts[2 * i] + ty = int_pts[2 * i + 1] + j = i + while j > 0 and vs[j - 1] > temp: + vs[j] = vs[j - 1] + int_pts[j * 2] = int_pts[j * 2 - 2] + int_pts[j * 2 + 1] = int_pts[j * 2 - 1] + j -= 1 + + vs[j] = temp + int_pts[j * 2] = tx + int_pts[j * 2 + 1] = ty + + +@numba.jit(nopython=True, inline="always") +def line_segment_intersection( + pts1: np.ndarray, # array of points representing the first line segment + pts2: np.ndarray, # array of points representing the second line segment + i: int, # index of the first line segment + j: int, # index of the second line segment + temp_pts: np.ndarray, # array to store the intersection point +) -> bool: + """Check if two line segments intersect and find the intersection point. + + Args: + pts1 (ndarray): Array of points representing the first line segment. + pts2 (ndarray): Array of points representing the second line segment. + i (int): Index of the first line segment. + j (int): Index of the second line segment. + temp_pts (ndarray): Array to store the intersection point. + + Returns: + bool: True if the line segments intersect, False otherwise. + """ + a = np.empty((2,), dtype=np.float32) + b = np.empty((2,), dtype=np.float32) + c = np.empty((2,), dtype=np.float32) + d = np.empty((2,), dtype=np.float32) + + a[0] = pts1[2 * i] + a[1] = pts1[2 * i + 1] + + b[0] = pts1[2 * ((i + 1) % 4)] + b[1] = pts1[2 * ((i + 1) % 4) + 1] + + c[0] = pts2[2 * j] + c[1] = pts2[2 * j + 1] + + d[0] = pts2[2 * ((j + 1) % 4)] + d[1] = pts2[2 * ((j + 1) % 4) + 1] + + ba0 = b[0] - a[0] + ba1 = b[1] - a[1] + da0 = d[0] - a[0] + ca0 = c[0] - a[0] + da1 = d[1] - a[1] + ca1 = c[1] - a[1] + + acd = da1 * ca0 > ca1 * da0 + bcd = (d[1] - b[1]) * (c[0] - b[0]) > (c[1] - b[1]) * (d[0] - b[0]) + if acd != bcd: + abc = ca1 * ba0 > ba1 * ca0 + abd = da1 * ba0 > ba1 * da0 + if abc != abd: + dc0 = d[0] - c[0] + dc1 = d[1] - c[1] + abba = a[0] * b[1] - b[0] * a[1] + cddc = c[0] * d[1] - d[0] * c[1] + dh = ba1 * dc0 - ba0 * dc1 + dx = abba * dc0 - ba0 * cddc + dy = abba * dc1 - ba1 * cddc + temp_pts[0] = dx / dh + temp_pts[1] = dy / dh + return True + return False + + +@numba.jit(nopython=True, inline="always") +def line_segment_intersection_v1( + pts1: np.ndarray, # array of points representing the first line segment + pts2: np.ndarray, # array of points representing the second line segment + i: int, # index of the first line segment + j: int, # index of the second line segment + temp_pts: np.ndarray, # array to store the intersection point +) -> bool: + """Check if two line segments intersect and find the intersection point using an alternative method. 
+ + Args: + pts1: ndarray, array of points representing the first line segment + pts2: ndarray, array of points representing the second line segment + i: int, index of the first line segment + j: int, index of the second line segment + temp_pts: ndarray, array to store the intersection point + + Returns: + bool: True if the line segments intersect, False otherwise + """ + a = np.empty((2,), dtype=np.float32) + b = np.empty((2,), dtype=np.float32) + c = np.empty((2,), dtype=np.float32) + d = np.empty((2,), dtype=np.float32) + + a[0] = pts1[2 * i] + a[1] = pts1[2 * i + 1] + + b[0] = pts1[2 * ((i + 1) % 4)] + b[1] = pts1[2 * ((i + 1) % 4) + 1] + + c[0] = pts2[2 * j] + c[1] = pts2[2 * j + 1] + + d[0] = pts2[2 * ((j + 1) % 4)] + d[1] = pts2[2 * ((j + 1) % 4) + 1] + + area_abc = trangle_area(a, b, c) + area_abd = trangle_area(a, b, d) + + if area_abc * area_abd >= 0: + return False + + area_cda = trangle_area(c, d, a) + area_cdb = area_cda + area_abc - area_abd + + if area_cda * area_cdb >= 0: + return False + t = area_cda / (area_abd - area_abc) + + dx = t * (b[0] - a[0]) + dy = t * (b[1] - a[1]) + temp_pts[0] = a[0] + dx + temp_pts[1] = a[1] + dy + return True + + +@numba.jit(nopython=True, inline="always") +def point_in_quadrilateral( + pt_x: float, # x coordinate of the point + pt_y: float, # y coordinate of the point + corners: np.ndarray, # corners of the quadrilateral, shape (8,) +) -> bool: + """Check if a point is inside a quadrilateral. + + Args: + pt_x: float, x coordinate of the point + pt_y: float, y coordinate of the point + corners: ndarray, shape (8,), corners of the quadrilateral + + Returns: + bool: True if the point is inside the quadrilateral, False otherwise + """ + ab0 = corners[2] - corners[0] + ab1 = corners[3] - corners[1] + + ad0 = corners[6] - corners[0] + ad1 = corners[7] - corners[1] + + ap0 = pt_x - corners[0] + ap1 = pt_y - corners[1] + + abab = ab0 * ab0 + ab1 * ab1 + abap = ab0 * ap0 + ab1 * ap1 + adad = ad0 * ad0 + ad1 * ad1 + adap = ad0 * ap0 + ad1 * ap1 + + return abab >= abap and abap >= 0 and adad >= adap and adap >= 0 + + +@numba.jit(nopython=True, inline="always") +def quadrilateral_intersection( + pts1: np.ndarray, # shape: (8,) + pts2: np.ndarray, # shape: (8,) + int_pts: np.ndarray, # shape: (16,) +) -> int: + """Compute the intersection points between two quadrilaterals. + + Args: + pts1: Array of points representing the first quadrilateral, shape (8,). + pts2: Array of points representing the second quadrilateral, shape (8,). + int_pts: Array to store the intersection points, shape (16,). + + Returns: + int: Number of intersection points. + """ + num_of_inter = 0 + for i in range(4): + if point_in_quadrilateral(pts1[2 * i], pts1[2 * i + 1], pts2): + int_pts[num_of_inter * 2] = pts1[2 * i] + int_pts[num_of_inter * 2 + 1] = pts1[2 * i + 1] + num_of_inter += 1 + if point_in_quadrilateral(pts2[2 * i], pts2[2 * i + 1], pts1): + int_pts[num_of_inter * 2] = pts2[2 * i] + int_pts[num_of_inter * 2 + 1] = pts2[2 * i + 1] + num_of_inter += 1 + temp_pts = np.empty((2,), dtype=np.float32) + for i in range(4): + for j in range(4): + has_pts = line_segment_intersection(pts1, pts2, i, j, temp_pts) + if has_pts: + int_pts[num_of_inter * 2] = temp_pts[0] + int_pts[num_of_inter * 2 + 1] = temp_pts[1] + num_of_inter += 1 + + return num_of_inter + + +@numba.jit(nopython=True, inline="always") +def rbbox_to_corners( + corners: np.ndarray, # shape: (8,) + rbbox: np.ndarray, # shape: (5,) +) -> None: + """Convert a rotated bounding box to its corner points. 
+ + Args: + corners (ndarray): Array to store the corner points, shape (8,). + rbbox (ndarray): Array representing the rotated bounding box, shape (5,). + The rotated bounding box is represented by (center_x, center_y, width, height, angle). + + Returns: + None + """ + # generate clockwise corners and rotate it clockwise + angle = rbbox[4] + a_cos = math.cos(angle) + a_sin = math.sin(angle) + center_x = rbbox[0] + center_y = rbbox[1] + x_d = rbbox[2] + y_d = rbbox[3] + corners_x = np.empty((4,), dtype=np.float32) + corners_y = np.empty((4,), dtype=np.float32) + corners_x[0] = -x_d / 2 + corners_x[1] = -x_d / 2 + corners_x[2] = x_d / 2 + corners_x[3] = x_d / 2 + corners_y[0] = -y_d / 2 + corners_y[1] = y_d / 2 + corners_y[2] = y_d / 2 + corners_y[3] = -y_d / 2 + for i in range(4): + corners[2 * i] = a_cos * corners_x[i] + a_sin * corners_y[i] + center_x + corners[2 * i + 1] = -a_sin * corners_x[i] + a_cos * corners_y[i] + center_y + + +@numba.jit(nopython=True, inline="always") +def inter( + rbbox1: np.ndarray, # shape: (5,) + rbbox2: np.ndarray, # shape: (5,) +) -> float: # The intersection area of the two rotated bounding boxes. + """Calculate the intersection area of two rotated bounding boxes. + + Args: + rbbox1 (ndarray): Array representing the first rotated bounding box. + The rotated bounding box is represented by (center_x, center_y, width, height, angle). + rbbox2 (ndarray): Array representing the second rotated bounding box. + The rotated bounding box is represented by (center_x, center_y, width, height, angle). + + Returns: + float: The intersection area of the two rotated bounding boxes. + """ + corners1 = np.empty((8,), dtype=np.float32) + corners2 = np.empty((8,), dtype=np.float32) + intersection_corners = np.empty((16,), dtype=np.float32) + + rbbox_to_corners(corners1, rbbox1) + rbbox_to_corners(corners2, rbbox2) + + num_intersection = quadrilateral_intersection(corners1, corners2, intersection_corners) + sort_vertex_in_convex_polygon(intersection_corners, num_intersection) + + return area(intersection_corners, num_intersection) + + +@numba.jit(nopython=True, inline="always") +def dev_rotate_iou_eval( + rbox1: np.ndarray, # shape: (5,) + rbox2: np.ndarray, # shape: (5,) + criterion: int = -1, # IoU criterion to use. Defaults to -1. +) -> float: # The IoU of the two rotated bounding boxes. + """Calculate the IoU of two rotated bounding boxes. + + Args: + rbox1 (ndarray): Array representing the first rotated bounding box. + The rotated bounding box is represented by (center_x, center_y, width, height, angle). + rbox2 (ndarray): Array representing the second rotated bounding box. + The rotated bounding box is represented by (center_x, center_y, width, height, angle). + criterion (int): The method to calculate the IoU. + -1: Calculate the IoU. + 0: Calculate the IoU with first box as the reference. + 1: Calculate the IoU with second box as the reference. + + Returns: + float: The IoU of the two rotated bounding boxes. + """ + area1 = rbox1[2] * rbox1[3] + area2 = rbox2[2] * rbox2[3] + area_inter = inter(rbox1, rbox2) + if criterion == -1: + return area_inter / (area1 + area2 - area_inter) + if criterion == 0: + return area_inter / area1 + if criterion == 1: + return area_inter / area2 + return area_inter + + +@numba.jit(nopython=True, inline="always") +def rotate_iou_eval_cpu( + boxes: np.ndarray, # shape: (n, 5) + query_boxes: np.ndarray, # shape: (k, 5) + criterion: int = -1, # IoU criterion to use. Defaults to -1. 
+) -> np.ndarray: # shape: (n, k) + """Compute the rotated box IoU between two sets of boxes on CPU. + + Args: + boxes (ndarray): Array of shape (n, 5) representing n rotated boxes. + Each box is represented by (center_x, center_y, width, height, angle). + query_boxes (ndarray): Array of shape (k, 5) representing k query rotated boxes. + Each query box is represented by (center_x, center_y, width, height, angle). + criterion (int, optional): IoU criterion to use. Defaults to -1. + + Returns: + ndarray: Array of shape (n, k) representing the IoU between each pair of boxes. + """ + n = boxes.shape[0] + k = query_boxes.shape[0] + iou = np.zeros((n, k), dtype=np.float32) + if n == 0 or k == 0: + return iou + + for i in range(n): + for j in range(k): + iou[i, j] = dev_rotate_iou_eval(boxes[i], query_boxes[j], criterion) + + return iou diff --git a/src/otx/core/model/detection_3d.py b/src/otx/core/model/detection_3d.py new file mode 100644 index 00000000000..caa0d14090f --- /dev/null +++ b/src/otx/core/model/detection_3d.py @@ -0,0 +1,297 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""Class definition for 3d object detection model entity used in OTX.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import numpy as np +import torch +from torchvision.ops import box_convert + +from otx.algo.utils.mmengine_utils import load_checkpoint +from otx.core.data.dataset.utils.kitti_utils import class2angle +from otx.core.data.entity.base import ImageInfo +from otx.core.data.entity.object_detection_3d import Det3DBatchDataEntity, Det3DBatchPredEntity +from otx.core.metrics import MetricInput +from otx.core.metrics.average_precision_3d import KittiMetric +from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable, OTXModel +from otx.core.types.export import TaskLevelExportParameters + +if TYPE_CHECKING: + from lightning.pytorch.cli import LRSchedulerCallable, OptimizerCallable + from torch import nn + + from otx.core.metrics import MetricCallable + from otx.core.schedulers import LRSchedulerListCallable + from otx.core.types.label import LabelInfoTypes + + +class OTX3DDetectionModel(OTXModel[Det3DBatchDataEntity, Det3DBatchPredEntity]): + """Base class for the 3d detection models used in OTX.""" + + mean: tuple[float, float, float] + std: tuple[float, float, float] + load_from: str | None + + def __init__( + self, + label_info: LabelInfoTypes, + model_name: str, + input_size: tuple[int, int], + optimizer: OptimizerCallable = DefaultOptimizerCallable, + scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, + metric: MetricCallable = KittiMetric, + torch_compile: bool = False, + score_threshold: float = 0.1, + ) -> None: + """Initialize the 3d detection model.""" + self.model_name = model_name + self.score_threshold = score_threshold + super().__init__( + label_info=label_info, + input_size=input_size, + optimizer=optimizer, + scheduler=scheduler, + metric=metric, + torch_compile=torch_compile, + ) + + def _create_model(self) -> nn.Module: + """Creates the model.""" + detector = self._build_model(num_classes=self.label_info.num_classes) + if hasattr(detector, "init_weights"): + detector.init_weights() + self.classification_layers = self.get_classification_layers(prefix="model.") + if self.load_from is not None: + load_checkpoint(detector, self.load_from, map_location="cpu") + return detector + + @property + def _export_parameters(self) -> TaskLevelExportParameters: + """Defines 
parameters required to export a particular model implementation.""" + return super()._export_parameters.wrap( + model_type="ssd", + task_type="detection", + ) + + def _convert_pred_entity_to_compute_metric( + self, + preds: Det3DBatchPredEntity, + inputs: Det3DBatchDataEntity, + ) -> MetricInput: + """Converts the prediction entity to the format required for computing metrics. + + Args: + preds (Det3DBatchPredEntity): Prediction entity. + inputs (Det3DBatchDataEntity): Input data entity. + """ + boxes = preds.boxes_3d + # bbox 2d decoding + xywh_2d = box_convert(preds.boxes, "xyxy", "cxcywh") + + xs3d = boxes[:, :, 0:1] + ys3d = boxes[:, :, 1:2] + xs2d = xywh_2d[:, :, 0:1] + ys2d = xywh_2d[:, :, 1:2] + + batch = len(boxes) + labels = preds.labels.view(batch, -1, 1) + scores = preds.scores.view(batch, -1, 1) + xs2d = xs2d.view(batch, -1, 1) + ys2d = ys2d.view(batch, -1, 1) + xs3d = xs3d.view(batch, -1, 1) + ys3d = ys3d.view(batch, -1, 1) + + detections = ( + torch.cat( + [ + labels, + scores, + xs2d, + ys2d, + preds.size_2d, + preds.depth[:, :, 0:1], + preds.heading_angle, + preds.size_3d, + xs3d, + ys3d, + torch.exp(-preds.depth[:, :, 1:2]), + ], + dim=2, + ) + .detach() + .cpu() + .numpy() + ) + + img_sizes = np.array([img_info.ori_shape for img_info in inputs.imgs_info]) + calib_matrix = [p2.detach().cpu().numpy() for p2 in inputs.calib_matrix] + result_list = self._decode_detections_for_kitti_format( + detections, + img_sizes, + calib_matrix, + class_names=self.label_info.label_names, + threshold=self.score_threshold, + ) + + return { + "preds": result_list, + "target": inputs.original_kitti_format, # type: ignore[dict-item] + } + + @staticmethod + def _decode_detections_for_kitti_format( + dets: np.ndarray, + img_size: np.ndarray, + calib_matrix: list[np.ndarray], + class_names: list[str], + threshold: float = 0.2, + ) -> list[dict[str, np.ndarray]]: + """Decode the detection results for KITTI format.""" + + def _get_heading_angle(heading: np.ndarray) -> np.ndarray: + """Get heading angle from the prediction.""" + heading_bin, heading_res = heading[0:12], heading[12:24] + cls = np.argmax(heading_bin) + res = heading_res[cls] + return class2angle(cls, res, to_label_format=True) + + def _alpha2ry(calib_matrix: np.ndarray, alpha: np.ndarray, u: np.ndarray) -> np.ndarray: + """Get rotation_y by alpha + theta - 180.""" + cu = calib_matrix[0, 2] + fu = calib_matrix[0, 0] + + ry = alpha + np.arctan2(u - cu, fu) + + if ry > np.pi: + ry -= 2 * np.pi + if ry < -np.pi: + ry += 2 * np.pi + + return ry + + def _img_to_rect(calib_matrix: np.ndarray, u: np.ndarray, v: np.ndarray, depth_rect: np.ndarray) -> np.ndarray: + """Transform image coordinates to the rectangle coordinates.""" + cu = calib_matrix[0, 2] + cv = calib_matrix[1, 2] + fu = calib_matrix[0, 0] + fv = calib_matrix[1, 1] + tx = calib_matrix[0, 3] / (-fu) + ty = calib_matrix[1, 3] / (-fv) + + x = ((u - cu) * depth_rect) / fu + tx + y = ((v - cv) * depth_rect) / fv + ty + return np.concatenate((x.reshape(-1, 1), y.reshape(-1, 1), depth_rect.reshape(-1, 1)), axis=1) + + results = [] + for i in range(dets.shape[0]): # batch + names = [] + alphas = [] + bboxes = [] + dimensions = [] + locations = [] + rotation_y = [] + scores = [] + + for j in range(dets.shape[1]): # max_dets + cls_id = int(dets[i, j, 0]) + score = dets[i, j, 1] + if score < threshold: + continue + + # 2d bboxs decoding + x = dets[i, j, 2] * img_size[i][0] + y = dets[i, j, 3] * img_size[i][1] + w = dets[i, j, 4] * img_size[i][0] + h = dets[i, j, 5] * img_size[i][1] + bbox = 
[x - w / 2, y - h / 2, x + w / 2, y + h / 2] + + # 3d bboxs decoding + # depth decoding + depth = dets[i, j, 6] + + # dimensions decoding + dimension = dets[i, j, 31:34] + + # positions decoding + x3d = dets[i, j, 34] * img_size[i][0] + y3d = dets[i, j, 35] * img_size[i][1] + location = _img_to_rect(calib_matrix[i], x3d, y3d, depth).reshape(-1) + location[1] += dimension[0] / 2 + + # heading angle decoding + alpha = dets[i, j, 7:31] + alpha = _get_heading_angle(dets[i, j, 7:31]) + ry = _alpha2ry(calib_matrix[i], alpha, x) + + names.append(class_names[cls_id]) + alphas.append(alpha) + bboxes.append(bbox) + dimensions.append(np.array([dimension[2], dimension[0], dimension[1]])) + locations.append(location) + rotation_y.append(ry) + scores.append(score) + + results.append( + { + "name": np.array(names), + "alpha": np.array(alphas), + "bbox": np.array(bboxes).reshape(-1, 4), + "dimensions": np.array(dimensions).reshape(-1, 3), + "location": np.array(locations).reshape(-1, 3), + "rotation_y": np.array(rotation_y), + "score": np.array(scores), + }, + ) + + return results + + def get_dummy_input(self, batch_size: int = 1) -> Det3DBatchDataEntity: + """Returns a dummy input for 3d object detection model.""" + if self.input_size is None: + msg = f"Input size attribute is not set for {self.__class__}" + raise ValueError(msg) + + images = [torch.rand(3, *self.input_size) for _ in range(batch_size)] + calib_matrix = [torch.rand(3, 4) for _ in range(batch_size)] + infos = [] + for i, img in enumerate(images): + infos.append( + ImageInfo( + img_idx=i, + img_shape=img.shape, + ori_shape=img.shape, + ), + ) + return Det3DBatchDataEntity( + batch_size, + images, + infos, + boxes=[], + labels=[], + calib_matrix=calib_matrix, + boxes_3d=[], + size_2d=[], + size_3d=[], + depth=[], + heading_angle=[], + original_kitti_format=[], + ) + + def get_classification_layers(self, prefix: str = "model.") -> dict[str, dict[str, int]]: + """Get final classification layer information for incremental learning case.""" + sample_model_dict = self._build_model(num_classes=5).state_dict() + incremental_model_dict = self._build_model(num_classes=6).state_dict() + + classification_layers = {} + for key in sample_model_dict: + if sample_model_dict[key].shape != incremental_model_dict[key].shape: + sample_model_dim = sample_model_dict[key].shape[0] + incremental_model_dim = incremental_model_dict[key].shape[0] + stride = incremental_model_dim - sample_model_dim + num_extra_classes = 6 * sample_model_dim - 5 * incremental_model_dim + classification_layers[prefix + key] = {"stride": stride, "num_extra_classes": num_extra_classes} + return classification_layers diff --git a/src/otx/core/types/task.py b/src/otx/core/types/task.py index ed5b893020a..ddfc10f33ab 100644 --- a/src/otx/core/types/task.py +++ b/src/otx/core/types/task.py @@ -31,6 +31,7 @@ class OTXTaskType(str, Enum): ROTATED_DETECTION = "ROTATED_DETECTION" DETECTION_SEMI_SL = "DETECTION_SEMI_SL" KEYPOINT_DETECTION = "KEYPOINT_DETECTION" + OBJECT_DETECTION_3D = "OBJECT_DETECTION_3D" # Segmentation INSTANCE_SEGMENTATION = "INSTANCE_SEGMENTATION" diff --git a/src/otx/engine/utils/auto_configurator.py b/src/otx/engine/utils/auto_configurator.py index 16fb530610c..79459b66f9a 100644 --- a/src/otx/engine/utils/auto_configurator.py +++ b/src/otx/engine/utils/auto_configurator.py @@ -48,6 +48,7 @@ OTXTaskType.VISUAL_PROMPTING: RECIPE_PATH / "visual_prompting" / "sam_tiny_vit.yaml", OTXTaskType.ZERO_SHOT_VISUAL_PROMPTING: RECIPE_PATH / "zero_shot_visual_prompting" / 
"sam_tiny_vit.yaml", OTXTaskType.KEYPOINT_DETECTION: RECIPE_PATH / "keypoint_detection" / "rtmpose_tiny.yaml", + OTXTaskType.OBJECT_DETECTION_3D: RECIPE_PATH / "object_detection_3d" / "monodetr3d.yaml", } TASK_PER_DATA_FORMAT = { diff --git a/src/otx/recipe/_base_/data/object_detection_3d.yaml b/src/otx/recipe/_base_/data/object_detection_3d.yaml new file mode 100644 index 00000000000..a7c773f1bcf --- /dev/null +++ b/src/otx/recipe/_base_/data/object_detection_3d.yaml @@ -0,0 +1,52 @@ +task: OBJECT_DETECTION_3D +input_size: + - 384 + - 1280 +mem_cache_size: 1GB +mem_cache_img_max_size: null +image_color_channel: RGB +stack_images: true +data_format: kitti3d +unannotated_items_ratio: 0.0 +train_subset: + subset_name: train + transform_lib_type: TORCHVISION + batch_size: 8 + num_workers: 4 + to_tv_image: false + transforms: + - class_path: torchvision.transforms.v2.Normalize + init_args: + mean: [123.675, 116.28, 103.53] + std: [58.395, 57.12, 57.375] + + sampler: + class_path: torch.utils.data.RandomSampler + +val_subset: + subset_name: val + transform_lib_type: TORCHVISION + batch_size: 16 + num_workers: 4 + to_tv_image: false + transforms: + - class_path: torchvision.transforms.v2.Normalize + init_args: + mean: [123.675, 116.28, 103.53] + std: [58.395, 57.12, 57.375] + sampler: + class_path: torch.utils.data.RandomSampler + +test_subset: + subset_name: test + transform_lib_type: TORCHVISION + batch_size: 16 + num_workers: 4 + to_tv_image: false + transforms: + - class_path: torchvision.transforms.v2.Normalize + init_args: + mean: [123.675, 116.28, 103.53] + std: [58.395, 57.12, 57.375] + sampler: + class_path: torch.utils.data.RandomSampler diff --git a/src/otx/recipe/object_detection_3d/monodetr3d.yaml b/src/otx/recipe/object_detection_3d/monodetr3d.yaml new file mode 100644 index 00000000000..032c71ffbf8 --- /dev/null +++ b/src/otx/recipe/object_detection_3d/monodetr3d.yaml @@ -0,0 +1,44 @@ +model: + class_path: otx.algo.object_detection_3d.monodetr3d.MonoDETR3D + init_args: + label_info: 17 + model_name: monodetr_50 + input_size: + - 384 + - 1280 + + optimizer: + class_path: torch.optim.AdamW + init_args: + lr: 0.0001 + betas: [0.9, 0.999] + weight_decay: 0.0001 + + scheduler: + class_path: lightning.pytorch.cli.ReduceLROnPlateau + init_args: + mode: max + factor: 0.1 + patience: 13 + monitor: val/mAP_bbox_2d + +engine: + task: OBJECT_DETECTION_3D + device: auto + +callback_monitor: val/mAP_bbox_3d + +data: ../_base_/data/object_detection_3d.yaml + +precision: 32 # MonoDETR do not support fp16 training +overrides: + callbacks: + - class_path: otx.algo.callbacks.adaptive_early_stopping.EarlyStoppingWithWarmup + init_args: + monitor: null + mode: max + patience: 15 + check_on_train_epoch_end: false + min_delta: 0.001 + warmup_iters: 30 + warmup_epochs: 30 diff --git a/tests/assets/kitti3d/calib/test/000023.txt b/tests/assets/kitti3d/calib/test/000023.txt new file mode 100755 index 00000000000..f8a223dbf17 --- /dev/null +++ b/tests/assets/kitti3d/calib/test/000023.txt @@ -0,0 +1,8 @@ +P0: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 0.000000000000e+00 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 0.000000000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 0.000000000000e+00 +P1: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 -3.875744000000e+02 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 0.000000000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 0.000000000000e+00 +P2: 7.215377000000e+02 0.000000000000e+00 
6.095593000000e+02 4.485728000000e+01 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 2.163791000000e-01 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 2.745884000000e-03 +P3: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 -3.395242000000e+02 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 2.199936000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 2.729905000000e-03 +R0_rect: 9.999239000000e-01 9.837760000000e-03 -7.445048000000e-03 -9.869795000000e-03 9.999421000000e-01 -4.278459000000e-03 7.402527000000e-03 4.351614000000e-03 9.999631000000e-01 +Tr_velo_to_cam: 7.533745000000e-03 -9.999714000000e-01 -6.166020000000e-04 -4.069766000000e-03 1.480249000000e-02 7.280733000000e-04 -9.998902000000e-01 -7.631618000000e-02 9.998621000000e-01 7.523790000000e-03 1.480755000000e-02 -2.717806000000e-01 +Tr_imu_to_velo: 9.999976000000e-01 7.553071000000e-04 -2.035826000000e-03 -8.086759000000e-01 -7.854027000000e-04 9.998898000000e-01 -1.482298000000e-02 3.195559000000e-01 2.024406000000e-03 1.482454000000e-02 9.998881000000e-01 -7.997231000000e-01 + diff --git a/tests/assets/kitti3d/calib/test/000025.txt b/tests/assets/kitti3d/calib/test/000025.txt new file mode 100755 index 00000000000..f8a223dbf17 --- /dev/null +++ b/tests/assets/kitti3d/calib/test/000025.txt @@ -0,0 +1,8 @@ +P0: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 0.000000000000e+00 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 0.000000000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 0.000000000000e+00 +P1: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 -3.875744000000e+02 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 0.000000000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 0.000000000000e+00 +P2: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 4.485728000000e+01 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 2.163791000000e-01 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 2.745884000000e-03 +P3: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 -3.395242000000e+02 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 2.199936000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 2.729905000000e-03 +R0_rect: 9.999239000000e-01 9.837760000000e-03 -7.445048000000e-03 -9.869795000000e-03 9.999421000000e-01 -4.278459000000e-03 7.402527000000e-03 4.351614000000e-03 9.999631000000e-01 +Tr_velo_to_cam: 7.533745000000e-03 -9.999714000000e-01 -6.166020000000e-04 -4.069766000000e-03 1.480249000000e-02 7.280733000000e-04 -9.998902000000e-01 -7.631618000000e-02 9.998621000000e-01 7.523790000000e-03 1.480755000000e-02 -2.717806000000e-01 +Tr_imu_to_velo: 9.999976000000e-01 7.553071000000e-04 -2.035826000000e-03 -8.086759000000e-01 -7.854027000000e-04 9.998898000000e-01 -1.482298000000e-02 3.195559000000e-01 2.024406000000e-03 1.482454000000e-02 9.998881000000e-01 -7.997231000000e-01 + diff --git a/tests/assets/kitti3d/calib/test/000037.txt b/tests/assets/kitti3d/calib/test/000037.txt new file mode 100755 index 00000000000..dd653b08224 --- /dev/null +++ b/tests/assets/kitti3d/calib/test/000037.txt @@ -0,0 +1,8 @@ +P0: 7.188560000000e+02 0.000000000000e+00 6.071928000000e+02 0.000000000000e+00 0.000000000000e+00 7.188560000000e+02 1.852157000000e+02 0.000000000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 0.000000000000e+00 +P1: 7.188560000000e+02 0.000000000000e+00 6.071928000000e+02 -3.861448000000e+02 0.000000000000e+00 7.188560000000e+02 
1.852157000000e+02 0.000000000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 0.000000000000e+00 +P2: 7.188560000000e+02 0.000000000000e+00 6.071928000000e+02 4.538225000000e+01 0.000000000000e+00 7.188560000000e+02 1.852157000000e+02 -1.130887000000e-01 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 3.779761000000e-03 +P3: 7.188560000000e+02 0.000000000000e+00 6.071928000000e+02 -3.372877000000e+02 0.000000000000e+00 7.188560000000e+02 1.852157000000e+02 2.369057000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 4.915215000000e-03 +R0_rect: 9.999454000000e-01 7.259129000000e-03 -7.519551000000e-03 -7.292213000000e-03 9.999638000000e-01 -4.381729000000e-03 7.487471000000e-03 4.436324000000e-03 9.999621000000e-01 +Tr_velo_to_cam: 7.967514000000e-03 -9.999679000000e-01 -8.462264000000e-04 -1.377769000000e-02 -2.771053000000e-03 8.241710000000e-04 -9.999958000000e-01 -5.542117000000e-02 9.999644000000e-01 7.969825000000e-03 -2.764397000000e-03 -2.918589000000e-01 +Tr_imu_to_velo: 9.999976000000e-01 7.553071000000e-04 -2.035826000000e-03 -8.086759000000e-01 -7.854027000000e-04 9.998898000000e-01 -1.482298000000e-02 3.195559000000e-01 2.024406000000e-03 1.482454000000e-02 9.998881000000e-01 -7.997231000000e-01 + diff --git a/tests/assets/kitti3d/calib/train/000003.txt b/tests/assets/kitti3d/calib/train/000003.txt new file mode 100755 index 00000000000..f8a223dbf17 --- /dev/null +++ b/tests/assets/kitti3d/calib/train/000003.txt @@ -0,0 +1,8 @@ +P0: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 0.000000000000e+00 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 0.000000000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 0.000000000000e+00 +P1: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 -3.875744000000e+02 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 0.000000000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 0.000000000000e+00 +P2: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 4.485728000000e+01 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 2.163791000000e-01 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 2.745884000000e-03 +P3: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 -3.395242000000e+02 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 2.199936000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 2.729905000000e-03 +R0_rect: 9.999239000000e-01 9.837760000000e-03 -7.445048000000e-03 -9.869795000000e-03 9.999421000000e-01 -4.278459000000e-03 7.402527000000e-03 4.351614000000e-03 9.999631000000e-01 +Tr_velo_to_cam: 7.533745000000e-03 -9.999714000000e-01 -6.166020000000e-04 -4.069766000000e-03 1.480249000000e-02 7.280733000000e-04 -9.998902000000e-01 -7.631618000000e-02 9.998621000000e-01 7.523790000000e-03 1.480755000000e-02 -2.717806000000e-01 +Tr_imu_to_velo: 9.999976000000e-01 7.553071000000e-04 -2.035826000000e-03 -8.086759000000e-01 -7.854027000000e-04 9.998898000000e-01 -1.482298000000e-02 3.195559000000e-01 2.024406000000e-03 1.482454000000e-02 9.998881000000e-01 -7.997231000000e-01 + diff --git a/tests/assets/kitti3d/calib/train/000011.txt b/tests/assets/kitti3d/calib/train/000011.txt new file mode 100755 index 00000000000..f8a223dbf17 --- /dev/null +++ b/tests/assets/kitti3d/calib/train/000011.txt @@ -0,0 +1,8 @@ +P0: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 0.000000000000e+00 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 0.000000000000e+00 0.000000000000e+00 
0.000000000000e+00 1.000000000000e+00 0.000000000000e+00 +P1: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 -3.875744000000e+02 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 0.000000000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 0.000000000000e+00 +P2: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 4.485728000000e+01 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 2.163791000000e-01 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 2.745884000000e-03 +P3: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 -3.395242000000e+02 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 2.199936000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 2.729905000000e-03 +R0_rect: 9.999239000000e-01 9.837760000000e-03 -7.445048000000e-03 -9.869795000000e-03 9.999421000000e-01 -4.278459000000e-03 7.402527000000e-03 4.351614000000e-03 9.999631000000e-01 +Tr_velo_to_cam: 7.533745000000e-03 -9.999714000000e-01 -6.166020000000e-04 -4.069766000000e-03 1.480249000000e-02 7.280733000000e-04 -9.998902000000e-01 -7.631618000000e-02 9.998621000000e-01 7.523790000000e-03 1.480755000000e-02 -2.717806000000e-01 +Tr_imu_to_velo: 9.999976000000e-01 7.553071000000e-04 -2.035826000000e-03 -8.086759000000e-01 -7.854027000000e-04 9.998898000000e-01 -1.482298000000e-02 3.195559000000e-01 2.024406000000e-03 1.482454000000e-02 9.998881000000e-01 -7.997231000000e-01 + diff --git a/tests/assets/kitti3d/calib/train/000036.txt b/tests/assets/kitti3d/calib/train/000036.txt new file mode 100755 index 00000000000..f8a223dbf17 --- /dev/null +++ b/tests/assets/kitti3d/calib/train/000036.txt @@ -0,0 +1,8 @@ +P0: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 0.000000000000e+00 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 0.000000000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 0.000000000000e+00 +P1: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 -3.875744000000e+02 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 0.000000000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 0.000000000000e+00 +P2: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 4.485728000000e+01 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 2.163791000000e-01 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 2.745884000000e-03 +P3: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 -3.395242000000e+02 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 2.199936000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 2.729905000000e-03 +R0_rect: 9.999239000000e-01 9.837760000000e-03 -7.445048000000e-03 -9.869795000000e-03 9.999421000000e-01 -4.278459000000e-03 7.402527000000e-03 4.351614000000e-03 9.999631000000e-01 +Tr_velo_to_cam: 7.533745000000e-03 -9.999714000000e-01 -6.166020000000e-04 -4.069766000000e-03 1.480249000000e-02 7.280733000000e-04 -9.998902000000e-01 -7.631618000000e-02 9.998621000000e-01 7.523790000000e-03 1.480755000000e-02 -2.717806000000e-01 +Tr_imu_to_velo: 9.999976000000e-01 7.553071000000e-04 -2.035826000000e-03 -8.086759000000e-01 -7.854027000000e-04 9.998898000000e-01 -1.482298000000e-02 3.195559000000e-01 2.024406000000e-03 1.482454000000e-02 9.998881000000e-01 -7.997231000000e-01 + diff --git a/tests/assets/kitti3d/calib/train/000046.txt b/tests/assets/kitti3d/calib/train/000046.txt new file mode 100755 index 00000000000..f8a223dbf17 --- /dev/null +++ b/tests/assets/kitti3d/calib/train/000046.txt @@ -0,0 +1,8 @@ +P0: 
7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 0.000000000000e+00 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 0.000000000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 0.000000000000e+00
+P1: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 -3.875744000000e+02 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 0.000000000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 0.000000000000e+00
+P2: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 4.485728000000e+01 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 2.163791000000e-01 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 2.745884000000e-03
+P3: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 -3.395242000000e+02 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 2.199936000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 2.729905000000e-03
+R0_rect: 9.999239000000e-01 9.837760000000e-03 -7.445048000000e-03 -9.869795000000e-03 9.999421000000e-01 -4.278459000000e-03 7.402527000000e-03 4.351614000000e-03 9.999631000000e-01
+Tr_velo_to_cam: 7.533745000000e-03 -9.999714000000e-01 -6.166020000000e-04 -4.069766000000e-03 1.480249000000e-02 7.280733000000e-04 -9.998902000000e-01 -7.631618000000e-02 9.998621000000e-01 7.523790000000e-03 1.480755000000e-02 -2.717806000000e-01
+Tr_imu_to_velo: 9.999976000000e-01 7.553071000000e-04 -2.035826000000e-03 -8.086759000000e-01 -7.854027000000e-04 9.998898000000e-01 -1.482298000000e-02 3.195559000000e-01 2.024406000000e-03 1.482454000000e-02 9.998881000000e-01 -7.997231000000e-01
+
diff --git a/tests/assets/kitti3d/calib/train/000055.txt b/tests/assets/kitti3d/calib/train/000055.txt
new file mode 100755
index 00000000000..f8a223dbf17
--- /dev/null
+++ b/tests/assets/kitti3d/calib/train/000055.txt
@@ -0,0 +1,8 @@
+P0: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 0.000000000000e+00 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 0.000000000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 0.000000000000e+00
+P1: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 -3.875744000000e+02 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 0.000000000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 0.000000000000e+00
+P2: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 4.485728000000e+01 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 2.163791000000e-01 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 2.745884000000e-03
+P3: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 -3.395242000000e+02 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 2.199936000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 2.729905000000e-03
+R0_rect: 9.999239000000e-01 9.837760000000e-03 -7.445048000000e-03 -9.869795000000e-03 9.999421000000e-01 -4.278459000000e-03 7.402527000000e-03 4.351614000000e-03 9.999631000000e-01
+Tr_velo_to_cam: 7.533745000000e-03 -9.999714000000e-01 -6.166020000000e-04 -4.069766000000e-03 1.480249000000e-02 7.280733000000e-04 -9.998902000000e-01 -7.631618000000e-02 9.998621000000e-01 7.523790000000e-03 1.480755000000e-02 -2.717806000000e-01
+Tr_imu_to_velo: 9.999976000000e-01 7.553071000000e-04 -2.035826000000e-03 -8.086759000000e-01 -7.854027000000e-04 9.998898000000e-01 -1.482298000000e-02 3.195559000000e-01 2.024406000000e-03 1.482454000000e-02 9.998881000000e-01 -7.997231000000e-01
+
diff --git a/tests/assets/kitti3d/calib/val/000023.txt b/tests/assets/kitti3d/calib/val/000023.txt
new file mode 100755
index 00000000000..f8a223dbf17
--- /dev/null
+++ b/tests/assets/kitti3d/calib/val/000023.txt
@@ -0,0 +1,8 @@
+P0: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 0.000000000000e+00 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 0.000000000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 0.000000000000e+00
+P1: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 -3.875744000000e+02 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 0.000000000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 0.000000000000e+00
+P2: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 4.485728000000e+01 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 2.163791000000e-01 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 2.745884000000e-03
+P3: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 -3.395242000000e+02 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 2.199936000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 2.729905000000e-03
+R0_rect: 9.999239000000e-01 9.837760000000e-03 -9.869795000000e-03 9.999421000000e-01 -4.278459000000e-03 7.402527000000e-03 4.351614000000e-03 9.999631000000e-01
+Tr_velo_to_cam: 7.533745000000e-03 -9.999714000000e-01 -6.166020000000e-04 -4.069766000000e-03 1.480249000000e-02 7.280733000000e-04 -9.998902000000e-01 -7.631618000000e-02 9.998621000000e-01 7.523790000000e-03 1.480755000000e-02 -2.717806000000e-01
+Tr_imu_to_velo: 9.999976000000e-01 7.553071000000e-04 -2.035826000000e-03 -8.086759000000e-01 -7.854027000000e-04 9.998898000000e-01 -1.482298000000e-02 3.195559000000e-01 2.024406000000e-03 1.482454000000e-02 9.998881000000e-01 -7.997231000000e-01
+
diff --git a/tests/assets/kitti3d/calib/val/000025.txt b/tests/assets/kitti3d/calib/val/000025.txt
new file mode 100755
index 00000000000..f8a223dbf17
--- /dev/null
+++ b/tests/assets/kitti3d/calib/val/000025.txt
@@ -0,0 +1,8 @@
+P0: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 0.000000000000e+00 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 0.000000000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 0.000000000000e+00
+P1: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 -3.875744000000e+02 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 0.000000000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 0.000000000000e+00
+P2: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 4.485728000000e+01 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 2.163791000000e-01 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 2.745884000000e-03
+P3: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 -3.395242000000e+02 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 2.199936000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 2.729905000000e-03
+R0_rect: 9.999239000000e-01 9.837760000000e-03 -7.445048000000e-03 -9.869795000000e-03 9.999421000000e-01 -4.278459000000e-03 7.402527000000e-03 4.351614000000e-03 9.999631000000e-01
+Tr_velo_to_cam: 7.533745000000e-03 -9.999714000000e-01 -6.166020000000e-04 -4.069766000000e-03 1.480249000000e-02 7.280733000000e-04 -9.998902000000e-01 -7.631618000000e-02 9.998621000000e-01 7.523790000000e-03 1.480755000000e-02 -2.717806000000e-01
+Tr_imu_to_velo: 9.999976000000e-01 7.553071000000e-04 -2.035826000000e-03 -8.086759000000e-01 -7.854027000000e-04 9.998898000000e-01 -1.482298000000e-02 3.195559000000e-01 2.024406000000e-03 1.482454000000e-02 9.998881000000e-01 -7.997231000000e-01
+
diff --git a/tests/assets/kitti3d/calib/val/000037.txt b/tests/assets/kitti3d/calib/val/000037.txt
new file mode 100755
index 00000000000..dd653b08224
--- /dev/null
+++ b/tests/assets/kitti3d/calib/val/000037.txt
@@ -0,0 +1,8 @@
+P0: 7.188560000000e+02 0.000000000000e+00 6.071928000000e+02 0.000000000000e+00 0.000000000000e+00 7.188560000000e+02 1.852157000000e+02 0.000000000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 0.000000000000e+00
+P1: 7.188560000000e+02 0.000000000000e+00 6.071928000000e+02 -3.861448000000e+02 0.000000000000e+00 7.188560000000e+02 1.852157000000e+02 0.000000000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 0.000000000000e+00
+P2: 7.188560000000e+02 0.000000000000e+00 6.071928000000e+02 4.538225000000e+01 0.000000000000e+00 7.188560000000e+02 1.852157000000e+02 -1.130887000000e-01 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 3.779761000000e-03
+P3: 7.188560000000e+02 0.000000000000e+00 6.071928000000e+02 -3.372877000000e+02 0.000000000000e+00 7.188560000000e+02 1.852157000000e+02 2.369057000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 4.915215000000e-03
+R0_rect: 9.999454000000e-01 7.259129000000e-03 -7.519551000000e-03 -7.292213000000e-03 9.999638000000e-01 -4.381729000000e-03 7.487471000000e-03 4.436324000000e-03 9.999621000000e-01
+Tr_velo_to_cam: 7.967514000000e-03 -9.999679000000e-01 -8.462264000000e-04 -1.377769000000e-02 -2.771053000000e-03 8.241710000000e-04 -9.999958000000e-01 -5.542117000000e-02 9.999644000000e-01 7.969825000000e-03 -2.764397000000e-03 -2.918589000000e-01
+Tr_imu_to_velo: 9.999976000000e-01 7.553071000000e-04 -2.035826000000e-03 -8.086759000000e-01 -7.854027000000e-04 9.998898000000e-01 -1.482298000000e-02 3.195559000000e-01 2.024406000000e-03 1.482454000000e-02 9.998881000000e-01 -7.997231000000e-01
+
diff --git a/tests/assets/kitti3d/image_2/test/000023.png b/tests/assets/kitti3d/image_2/test/000023.png
new file mode 100755
index 00000000000..416119c3ba8
Binary files /dev/null and b/tests/assets/kitti3d/image_2/test/000023.png differ
diff --git a/tests/assets/kitti3d/image_2/test/000025.png b/tests/assets/kitti3d/image_2/test/000025.png
new file mode 100755
index 00000000000..b9003e24f2f
Binary files /dev/null and b/tests/assets/kitti3d/image_2/test/000025.png differ
diff --git a/tests/assets/kitti3d/image_2/test/000037.png b/tests/assets/kitti3d/image_2/test/000037.png
new file mode 100755
index 00000000000..deb48e43602
Binary files /dev/null and b/tests/assets/kitti3d/image_2/test/000037.png differ
diff --git a/tests/assets/kitti3d/image_2/train/000003.png b/tests/assets/kitti3d/image_2/train/000003.png
new file mode 100755
index 00000000000..206f5703776
Binary files /dev/null and b/tests/assets/kitti3d/image_2/train/000003.png differ
diff --git a/tests/assets/kitti3d/image_2/train/000011.png b/tests/assets/kitti3d/image_2/train/000011.png
new file mode 100755
index 00000000000..2afc9262872
Binary files /dev/null and b/tests/assets/kitti3d/image_2/train/000011.png differ
diff --git a/tests/assets/kitti3d/image_2/train/000036.png b/tests/assets/kitti3d/image_2/train/000036.png
new file mode 100755
index 00000000000..0b76bea0c4f
Binary files /dev/null and b/tests/assets/kitti3d/image_2/train/000036.png differ
diff --git a/tests/assets/kitti3d/image_2/train/000046.png b/tests/assets/kitti3d/image_2/train/000046.png
new file mode 100755
index 00000000000..c61025ab22f
Binary files /dev/null and b/tests/assets/kitti3d/image_2/train/000046.png differ
diff --git a/tests/assets/kitti3d/image_2/train/000055.png b/tests/assets/kitti3d/image_2/train/000055.png
new file mode 100755
index 00000000000..512d9092818
Binary files /dev/null and b/tests/assets/kitti3d/image_2/train/000055.png differ
diff --git a/tests/assets/kitti3d/image_2/val/000023.png b/tests/assets/kitti3d/image_2/val/000023.png
new file mode 100755
index 00000000000..416119c3ba8
Binary files /dev/null and b/tests/assets/kitti3d/image_2/val/000023.png differ
diff --git a/tests/assets/kitti3d/image_2/val/000025.png b/tests/assets/kitti3d/image_2/val/000025.png
new file mode 100755
index 00000000000..b9003e24f2f
Binary files /dev/null and b/tests/assets/kitti3d/image_2/val/000025.png differ
diff --git a/tests/assets/kitti3d/image_2/val/000037.png b/tests/assets/kitti3d/image_2/val/000037.png
new file mode 100755
index 00000000000..deb48e43602
Binary files /dev/null and b/tests/assets/kitti3d/image_2/val/000037.png differ
diff --git a/tests/assets/kitti3d/label_2/test/000023.txt b/tests/assets/kitti3d/label_2/test/000023.txt
new file mode 100644
index 00000000000..515e01f2abf
--- /dev/null
+++ b/tests/assets/kitti3d/label_2/test/000023.txt
@@ -0,0 +1 @@
+Car 0.00 0 1.86 372.95 182.64 412.21 205.68 1.67 1.87 3.69 -16.57 2.43 55.08 1.57
diff --git a/tests/assets/kitti3d/label_2/test/000025.txt b/tests/assets/kitti3d/label_2/test/000025.txt
new file mode 100644
index 00000000000..70b2887dd78
--- /dev/null
+++ b/tests/assets/kitti3d/label_2/test/000025.txt
@@ -0,0 +1,5 @@
+Car 0.94 3 -2.10 896.11 218.17 1241.00 374.00 1.39 1.44 3.08 2.43 1.68 3.14 -1.49
+Car 0.00 0 -1.29 351.84 183.19 537.77 308.64 1.47 1.60 3.66 -2.21 1.63 10.42 -1.49
+Car 0.00 0 1.75 562.48 173.46 618.49 217.36 1.70 1.63 4.08 -0.78 1.75 30.18 1.72
+Car 0.00 0 -1.69 724.21 178.91 805.39 249.94 1.59 1.59 2.47 3.64 1.75 17.48 -1.49
+Car 0.00 1 -1.62 720.81 187.01 779.98 236.22 1.37 1.59 3.22 4.23 1.83 22.30 -1.44
diff --git a/tests/assets/kitti3d/label_2/test/000037.txt b/tests/assets/kitti3d/label_2/test/000037.txt
new file mode 100644
index 00000000000..49043194d9f
--- /dev/null
+++ b/tests/assets/kitti3d/label_2/test/000037.txt
@@ -0,0 +1,2 @@
+Car 0.00 0 -1.57 555.85 173.64 628.69 240.05 1.60 1.76 3.84 -0.45 1.32 19.43 -1.59
+Car 0.00 2 -1.46 473.07 183.18 535.73 226.04 1.30 1.61 4.39 -3.46 1.24 24.49 -1.60
diff --git a/tests/assets/kitti3d/label_2/train/000003.txt b/tests/assets/kitti3d/label_2/train/000003.txt
new file mode 100644
index 00000000000..58ea56c4482
--- /dev/null
+++ b/tests/assets/kitti3d/label_2/train/000003.txt
@@ -0,0 +1 @@
+Car 0.00 0 1.55 614.24 181.78 727.31 284.77 1.57 1.73 4.15 1.00 1.75 13.22 1.62
diff --git a/tests/assets/kitti3d/label_2/train/000011.txt b/tests/assets/kitti3d/label_2/train/000011.txt
new file mode 100644
index 00000000000..a9c93631daa
--- /dev/null
+++ b/tests/assets/kitti3d/label_2/train/000011.txt
@@ -0,0 +1,2 @@
+Car 0.00 0 1.74 444.29 171.04 504.95 225.82 1.86 1.57 3.83 -4.95 1.83 26.64 1.55
+Car 0.98 0 2.42 0.00 217.12 85.92 374.00 1.50 1.46 3.70 -5.12 1.85 4.13 1.56
diff --git a/tests/assets/kitti3d/label_2/train/000036.txt b/tests/assets/kitti3d/label_2/train/000036.txt
new file mode 100644
index 00000000000..267b6774cc2
--- /dev/null
+++ b/tests/assets/kitti3d/label_2/train/000036.txt
@@ -0,0 +1,7 @@
+Car 0.00 0 -1.58 553.16 178.73 693.67 311.88 1.55 1.63 3.32 0.11 1.64 10.13 -1.57
+Car 0.00 0 1.89 341.05 194.66 390.28 218.73 1.37 1.82 3.96 -15.54 2.81 46.01 1.56
+Car 0.00 0 2.03 286.23 196.48 337.13 220.85 1.38 1.79 2.94 -18.46 2.88 44.69 1.64
+Car 0.00 0 1.82 412.76 190.23 455.28 212.54 1.40 1.75 4.75 -12.25 2.66 50.40 1.58
+Car 0.00 1 2.67 960.19 183.87 1104.42 229.69 1.59 1.73 4.26 15.49 2.03 26.68 -3.09
+Car 0.39 3 2.53 1154.43 178.14 1241.00 222.29 1.55 1.76 3.59 22.52 1.76 26.55 -3.06
+Car 0.97 3 -2.06 1123.12 213.78 1241.00 374.00 1.30 1.64 3.71 4.52 1.63 4.08 -1.26
diff --git a/tests/assets/kitti3d/label_2/train/000046.txt b/tests/assets/kitti3d/label_2/train/000046.txt
new file mode 100644
index 00000000000..59ccea26494
--- /dev/null
+++ b/tests/assets/kitti3d/label_2/train/000046.txt
@@ -0,0 +1,6 @@
+Car 0.00 0 -1.56 578.47 176.02 625.98 221.24 1.54 1.63 3.67 -0.32 1.68 26.76 -1.58
+Car 0.00 2 -1.58 597.59 168.75 634.85 207.61 1.77 1.69 4.32 0.26 1.61 35.33 -1.57
+Car 0.02 0 -1.89 769.21 151.69 1087.61 374.00 1.74 1.62 4.13 2.87 1.61 7.64 -1.54
+Car 0.00 1 -1.77 706.06 174.92 807.35 254.28 1.53 1.46 3.26 3.03 1.59 15.65 -1.59
+Car 0.00 0 -1.67 660.18 171.93 703.11 210.02 1.60 1.56 3.45 3.13 1.59 32.30 -1.57
+Car 0.00 2 -1.68 646.40 174.21 684.26 204.82 1.52 1.53 4.07 2.85 1.62 38.20 -1.61
diff --git a/tests/assets/kitti3d/label_2/train/000055.txt b/tests/assets/kitti3d/label_2/train/000055.txt
new file mode 100644
index 00000000000..80b020cac95
--- /dev/null
+++ b/tests/assets/kitti3d/label_2/train/000055.txt
@@ -0,0 +1,4 @@
+Car 0.00 1 2.24 31.61 193.32 129.71 230.77 1.48 1.35 3.93 -23.47 2.44 32.12 1.62
+Car 0.00 1 2.14 104.42 188.01 188.34 220.56 1.50 1.62 4.08 -23.45 2.30 36.62 1.58
+Car 0.00 1 2.08 183.23 187.33 245.86 217.54 1.65 1.57 3.82 -23.27 2.53 42.56 1.58
+Car 0.00 1 2.04 227.36 189.17 281.98 211.96 1.37 1.36 4.44 -23.40 2.48 47.64 1.59
diff --git a/tests/assets/kitti3d/label_2/val/000023.txt b/tests/assets/kitti3d/label_2/val/000023.txt
new file mode 100644
index 00000000000..515e01f2abf
--- /dev/null
+++ b/tests/assets/kitti3d/label_2/val/000023.txt
@@ -0,0 +1 @@
+Car 0.00 0 1.86 372.95 182.64 412.21 205.68 1.67 1.87 3.69 -16.57 2.43 55.08 1.57
diff --git a/tests/assets/kitti3d/label_2/val/000025.txt b/tests/assets/kitti3d/label_2/val/000025.txt
new file mode 100644
index 00000000000..70b2887dd78
--- /dev/null
+++ b/tests/assets/kitti3d/label_2/val/000025.txt
@@ -0,0 +1,5 @@
+Car 0.94 3 -2.10 896.11 218.17 1241.00 374.00 1.39 1.44 3.08 2.43 1.68 3.14 -1.49
+Car 0.00 0 -1.29 351.84 183.19 537.77 308.64 1.47 1.60 3.66 -2.21 1.63 10.42 -1.49
+Car 0.00 0 1.75 562.48 173.46 618.49 217.36 1.70 1.63 4.08 -0.78 1.75 30.18 1.72
+Car 0.00 0 -1.69 724.21 178.91 805.39 249.94 1.59 1.59 2.47 3.64 1.75 17.48 -1.49
+Car 0.00 1 -1.62 720.81 187.01 779.98 236.22 1.37 1.59 3.22 4.23 1.83 22.30 -1.44
diff --git a/tests/assets/kitti3d/label_2/val/000037.txt b/tests/assets/kitti3d/label_2/val/000037.txt
new file mode 100644
index 00000000000..49043194d9f
--- /dev/null
+++ b/tests/assets/kitti3d/label_2/val/000037.txt
@@ -0,0 +1,2 @@
+Car 0.00 0 -1.57 555.85 173.64 628.69 240.05 1.60 1.76 3.84 -0.45 1.32 19.43 -1.59
+Car 0.00 2 -1.46 473.07 183.18 535.73 226.04 1.30 1.61 4.39 -3.46 1.24 24.49 -1.60
diff --git a/tests/integration/cli/test_cli.py b/tests/integration/cli/test_cli.py
index 7c3c9d2a959..f571dc2ed2c 100644
--- a/tests/integration/cli/test_cli.py
+++ b/tests/integration/cli/test_cli.py
@@ -195,6 +195,10 @@ def test_otx_e2e(
         print("Inference and explain are not supported for keypoint detection")
         return
 
+    if "monodetr3d" in recipe:
+        print("Inference and explain are not supported for object detection 3d")
+        return
+
     # 4) infer of the exported models
     ov_output_dir = tmp_path_test / "outputs" / "OPENVINO"
     ov_files = list(ov_output_dir.rglob("exported*.xml"))
diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py
index 0802ab8485f..cffbbcaa7ac 100644
--- a/tests/integration/conftest.py
+++ b/tests/integration/conftest.py
@@ -75,6 +75,8 @@ def get_task_list(task: str) -> list[OTXTaskType]:
         tasks = [OTXTaskType.ANOMALY_SEGMENTATION]
     elif task == "keypoint_detection":
         tasks = [OTXTaskType.KEYPOINT_DETECTION]
+    elif task == "object_detection_3d":
+        tasks = [OTXTaskType.OBJECT_DETECTION_3D]
     else:
         tasks = [OTXTaskType(task.upper())]
     return tasks
@@ -142,6 +144,7 @@ def fxt_target_dataset_per_task() -> dict:
         "anomaly_detection": "tests/assets/anomaly_hazelnut",
         "anomaly_segmentation": "tests/assets/anomaly_hazelnut",
         "keypoint_detection": "tests/assets/car_tree_bug_keypoint",
+        "object_detection_3d": "tests/assets/kitti3d",
         "tiling_detection": "tests/assets/tiling_small_objects",
     }

@@ -164,4 +167,5 @@ def fxt_cli_override_command_per_task() -> dict:
         "anomaly_detection": [],
         "anomaly_segmentation": [],
         "keypoint_detection": [],
+        "object_detection_3d": [],
     }
diff --git a/tests/perf/test_object_detection_3d.py b/tests/perf/test_object_detection_3d.py
new file mode 100644
index 00000000000..74a5bb43ca9
--- /dev/null
+++ b/tests/perf/test_object_detection_3d.py
@@ -0,0 +1,79 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+"""OTX 3D detection performance benchmark tests."""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import ClassVar
+
+import pytest
+
+from .benchmark import Benchmark
+from .conftest import PerfTestBase
+
+
+class TestPerfObjectDetection3D(PerfTestBase):
+    """Benchmark object detection 3D."""
+
+    MODEL_TEST_CASES = [  # noqa: RUF012
+        Benchmark.Model(task="object_detection_3d", name="monodetr3d", category="balance"),
+    ]
+
+    DATASET_TEST_CASES: ClassVar = [
+        Benchmark.Dataset(
+            name="kitti_medium_pedestrian_cyclist",
+            path=Path("object_detection_3d/medium_pedestrian_cyclist"),
+            group="medium",
+            num_repeat=5,
+            extra_overrides={},
+        ),
+        Benchmark.Dataset(
+            name="kitti_large_car",
+            path=Path("object_detection_3d/large_car"),
+            group="large",
+            num_repeat=5,
+            extra_overrides={},
+        ),
+    ]
+
+    BENCHMARK_CRITERIA = [  # noqa: RUF012
+        Benchmark.Criterion(name="train/epoch", summary="max", compare="<", margin=0.1),
+        Benchmark.Criterion(name="train/e2e_time", summary="max", compare="<", margin=0.1),
+        Benchmark.Criterion(name="val/accuracy", summary="max", compare=">", margin=0.1),
+        Benchmark.Criterion(name="test/accuracy", summary="max", compare=">", margin=0.1),
+        Benchmark.Criterion(name="export/accuracy", summary="max", compare=">", margin=0.1),
+        Benchmark.Criterion(name="optimize/accuracy", summary="max", compare=">", margin=0.1),
+        Benchmark.Criterion(name="train/iter_time", summary="mean", compare="<", margin=0.1),
+        Benchmark.Criterion(name="test/iter_time", summary="mean", compare="<", margin=0.1),
+        Benchmark.Criterion(name="export/iter_time", summary="mean", compare="<", margin=0.1),
+        Benchmark.Criterion(name="optimize/iter_time", summary="mean", compare="<", margin=0.1),
+        Benchmark.Criterion(name="test(train)/e2e_time", summary="max", compare=">", margin=0.1),
+        Benchmark.Criterion(name="test(export)/e2e_time", summary="max", compare=">", margin=0.1),
+        Benchmark.Criterion(name="test(optimize)/e2e_time", summary="max", compare=">", margin=0.1),
+    ]
+
+    @pytest.mark.parametrize(
+        "fxt_model",
+        MODEL_TEST_CASES,
+        ids=lambda model: model.name,
+        indirect=True,
+    )
+    @pytest.mark.parametrize(
+        "fxt_dataset",
+        DATASET_TEST_CASES,
+        ids=lambda dataset: dataset.name,
+        indirect=True,
+    )
+    def test_perf(
+        self,
+        fxt_model: Benchmark.Model,
+        fxt_dataset: Benchmark.Dataset,
+        fxt_benchmark: Benchmark,
+    ):
+        self._test_perf(
+            model=fxt_model,
+            dataset=fxt_dataset,
+            benchmark=fxt_benchmark,
+            criteria=self.BENCHMARK_CRITERIA,
+        )
diff --git a/tox.ini b/tox.ini
index 74e20c98e7b..7aa0fa1ad5b 100644
--- a/tox.ini
+++ b/tox.ini
@@ -27,6 +27,7 @@ task =
     anomaly_classification: "anomaly_classification"
     anomaly_detection: "anomaly_detection"
     anomaly_segmentation: "anomaly_segmentation"
+    object_detection_3d: "object_detection_3d"
 passenv =
     ftp_proxy
     HTTP_PROXY
@@ -55,7 +56,7 @@ commands =
     {posargs}


-[testenv:integration-test-{all, action, classification, multi_cls_classification, multi_label_classification, hlabel_classification, detection, rotated_detection, keypoint_detection, instance_segmentation, semantic_segmentation, visual_prompting_all, visual_prompting, zero_shot_visual_prompting, anomaly, anomaly_classification, anomaly_detection, anomaly_segmentation}]
+[testenv:integration-test-{all, action, classification, multi_cls_classification, multi_label_classification, hlabel_classification, detection, rotated_detection, keypoint_detection, instance_segmentation, semantic_segmentation, visual_prompting_all, visual_prompting, zero_shot_visual_prompting, anomaly, anomaly_classification, anomaly_detection, anomaly_segmentation, object_detection_3d}]
 setenv =
     CUBLAS_WORKSPACE_CONFIG=:4096:8
 deps =
@@ -64,7 +65,7 @@ commands =
     python -m pytest tests/integration -ra --showlocals --csv={toxworkdir}/{envname}.csv --task {[testenv]task} --open-subprocess {posargs}


-[testenv:e2e-test-{all, action, classification, detection, rotated_detection, keypoint_detection, instance_segmentation, semantic_segmentation, visual_prompting, anomaly}]
+[testenv:e2e-test-{all, action, classification, detection, rotated_detection, keypoint_detection, instance_segmentation, semantic_segmentation, visual_prompting, anomaly, object_detection_3d}]
 setenv =
     CUBLAS_WORKSPACE_CONFIG=:4096:8
 deps =