diff --git a/src/otx/algo/detection/heads/rtdetr_decoder.py b/src/otx/algo/detection/heads/rtdetr_decoder.py
index d60be84f12d..0c39358ed6d 100644
--- a/src/otx/algo/detection/heads/rtdetr_decoder.py
+++ b/src/otx/algo/detection/heads/rtdetr_decoder.py
@@ -236,8 +236,7 @@ def forward(
         value = self.value_proj(value)
         if value_mask is not None:
             value = value.masked_fill(value_mask[..., None], float(0))
-            # value_mask = value_mask.astype(value.dtype).unsqueeze(-1)
-            # value3 = value * value_mask.unsqueeze(-1)
+
         value = value.reshape(bs, len_v, self.num_heads, self.head_dim)
 
         sampling_offsets = self.sampling_offsets(query).reshape(
diff --git a/src/otx/algo/object_detection_3d/backbones/monodetr_resnet.py b/src/otx/algo/object_detection_3d/backbones/monodetr_resnet.py
index a10c3c9dfd0..c2a5a0700e9 100644
--- a/src/otx/algo/object_detection_3d/backbones/monodetr_resnet.py
+++ b/src/otx/algo/object_detection_3d/backbones/monodetr_resnet.py
@@ -33,7 +33,7 @@ def __init__(
             num_pos_feats (int): Number of positional features.
             temperature (int): Temperature scaling factor.
             normalize (bool): Flag indicating whether to normalize the position embeddings.
-            scale (Optional[float]): Scaling factor for the position embeddings. If None, default value is used.
+            scale (float | None): Scaling factor for the position embeddings. If None, default value is used.
         """
         super().__init__()
         self.num_pos_feats = num_pos_feats
@@ -132,7 +132,7 @@ def __init__(
 
         Args:
             backbone (nn.Module): The backbone module.
-            position_embedding (Union[PositionEmbeddingSine]): The position embedding module.
+            position_embedding (PositionEmbeddingSine): The position embedding module.
         """
         super().__init__(backbone, position_embedding)
         self.strides = backbone.strides
diff --git a/src/otx/algo/object_detection_3d/detectors/monodetr.py b/src/otx/algo/object_detection_3d/detectors/monodetr.py
index d2cb4691b57..00ff404c718 100644
--- a/src/otx/algo/object_detection_3d/detectors/monodetr.py
+++ b/src/otx/algo/object_detection_3d/detectors/monodetr.py
@@ -149,12 +149,17 @@ def forward(
         """Forward method of the MonoDETR model.
 
         Args:
-            images (list[Tensor]): images for each sample
-            calibs (Tensor): camera matrices for each sample
-            img_sizes (Tensor): image sizes for each sample
-            targets (list[dict[Tensor]): ground truth boxes and labels for each
-                sample
+            images (Tensor): images for each sample.
+            calibs (Tensor): camera matrices for each sample.
+            img_sizes (Tensor): image sizes for each sample.
+            targets (list[dict[str, Tensor]]): ground truth boxes and labels for each
+                sample. Defaults to None.
             mode (str): The mode of operation. Defaults to "predict".
+
+        Returns:
+            dict[str, Tensor]: A dictionary of tensors. If mode is "loss", the
+                tensors are the loss values. If mode is "predict", the tensors are
+                the logits.
         """
         features, pos = self.backbone(images)
 
diff --git a/src/otx/algo/object_detection_3d/heads/depth_predictor.py b/src/otx/algo/object_detection_3d/heads/depth_predictor.py
index 4e5037c96d8..87827144b21 100644
--- a/src/otx/algo/object_detection_3d/heads/depth_predictor.py
+++ b/src/otx/algo/object_detection_3d/heads/depth_predictor.py
@@ -32,6 +32,8 @@ def __init__(
             depth_min (float): The minimum depth value.
             depth_max (float): The maximum depth value.
             hidden_dim (int): The dimension of the hidden layer.
+            activation (Callable[..., nn.Module], optional): The activation function.
+                Defaults to nn.ReLU.
         """
         super().__init__()
         self.depth_max = depth_max
diff --git a/src/otx/algo/object_detection_3d/losses/ddn_loss.py b/src/otx/algo/object_detection_3d/losses/ddn_loss.py
index e3a4238be03..671033a347a 100644
--- a/src/otx/algo/object_detection_3d/losses/ddn_loss.py
+++ b/src/otx/algo/object_detection_3d/losses/ddn_loss.py
@@ -22,13 +22,13 @@ def compute_fg_mask(
     """Compute foreground mask for images.
 
     Args:
-        gt_boxes2d [torch.Tensor(B, N, 4)]: 2D box labels
-        shape [Tuple[int, int]]: Foreground mask desired shape
-        downsample_factor [int]: Downsample factor for image
-        device [torch.device]: Foreground mask desired device
+        gt_boxes2d (torch.Tensor): 2D box labels.
+        shape (Tuple[int, int]): Foreground mask desired shape.
+        downsample_factor (int): Downsample factor for image.
+        device (torch.device): Foreground mask desired device.
 
     Returns:
-        fg_mask [torch.Tensor(shape)]: Foreground mask
+        fg_mask (torch.Tensor): Foreground mask of the requested shape.
     """
     if device is None:
         device = torch.device("cpu")
@@ -58,9 +58,9 @@ def __init__(self, fg_weight: float, bg_weight: float, downsample_factor: int =
         """Initialize fixed foreground/background loss balancer.
 
         Args:
-            fg_weight [float]: Foreground loss weight
-            bg_weight [float]: Background loss weight
-            downsample_factor [int]: Depth map downsample factor
+            fg_weight (float): Foreground loss weight.
+            bg_weight (float): Background loss weight.
+            downsample_factor (int): Depth map downsample factor.
         """
         super().__init__()
         self.fg_weight = fg_weight
@@ -76,12 +76,11 @@ def forward(
         """Forward pass.
 
         Args:
-            loss [torch.Tensor(B, H, W)]: Pixel-wise loss
-            gt_boxes2d [torch.Tensor (B, N, 4)]: 2D box labels for foreground/background balancing
+            loss (torch.Tensor): Pixel-wise loss.
+            gt_boxes2d (torch.Tensor): 2D box labels for foreground/background balancing.
 
         Returns:
-            loss [torch.Tensor(1)]: Total loss after foreground/background balancing
-            tb_dict [dict[float]]: All losses to log in tensorboard
+            loss (torch.Tensor): Total loss after foreground/background balancing.
         """
         # Compute masks
         fg_mask = compute_fg_mask(
@@ -120,13 +119,11 @@ def __init__(
         """Initializes DDNLoss module.
 
         Args:
-            weight [float]: Loss function weight
-            alpha [float]: Alpha value for Focal Loss
-            gamma [float]: Gamma value for Focal Loss
-            disc_cfg [dict]: Depth discretiziation configuration
-            fg_weight [float]: Foreground loss weight
-            bg_weight [float]: Background loss weight
-            downsample_factor [int]: Depth map downsample factor
+            alpha (float): Alpha value for Focal Loss.
+            gamma (float): Gamma value for Focal Loss.
+            fg_weight (float): Foreground loss weight.
+            bg_weight (float): Background loss weight.
+            downsample_factor (int): Depth map downsample factor.
         """
         super().__init__()
         self.balancer = Balancer(downsample_factor=downsample_factor, fg_weight=fg_weight, bg_weight=bg_weight)
@@ -146,10 +143,10 @@ def build_target_depth_from_3dcenter(
         """Builds target depth map from 3D center depth.
 
         Args:
-            depth_logits: torch.Tensor(B, D+1, H, W)]: Predicted depth logits
-            gt_boxes2d [torch.Tensor (B, N, 4)]: 2D box labels for foreground/background balancing
-            gt_center_depth [torch.Tensor(B, N)]: 3D center depth
-            num_gt_per_img: [int]: Number of ground truth boxes per image
+            depth_logits (torch.Tensor): Predicted depth logits.
+            gt_boxes2d (torch.Tensor): 2D box labels for foreground/background balancing.
+            gt_center_depth (torch.Tensor): 3D center depth.
+            num_gt_per_img (int): Number of ground truth boxes per image.
         """
         b, _, h, w = depth_logits.shape
         depth_maps = torch.zeros((b, h, w), device=depth_logits.device, dtype=depth_logits.dtype)
@@ -185,18 +182,18 @@ def bin_depths(
         """Converts depth map into bin indices.
 
         Args:
-            depth_map [torch.Tensor(H, W)]: Depth Map
-            mode [string]: Discretiziation mode (See https://arxiv.org/pdf/2005.13423.pdf for more details)
-                UD: Uniform discretiziation
-                LID: Linear increasing discretiziation
-                SID: Spacing increasing discretiziation
-            depth_min [float]: Minimum depth value
-            depth_max [float]: Maximum depth value
-            num_bins [int]: Number of depth bins
-            target [bool]: Whether the depth bins indices will be used for a target tensor in loss comparison
+            depth_map (torch.Tensor): Depth Map.
+            mode (str): Discretization mode (See https://arxiv.org/pdf/2005.13423.pdf for more details).
+                UD: Uniform discretization.
+                LID: Linear increasing discretization.
+                SID: Spacing increasing discretization.
+            depth_min (float): Minimum depth value.
+            depth_max (float): Maximum depth value.
+            num_bins (int): Number of depth bins.
+            target (bool): Whether the depth bins indices will be used for a target tensor in loss comparison.
 
         Returns:
-            indices [torch.Tensor(H, W)]: Depth bin indices
+            indices (torch.Tensor): Depth bin indices.
         """
         if mode == "UD":
             bin_size = (depth_max - depth_min) / num_bins
@@ -233,13 +230,13 @@ def forward(
         """Gets depth_map loss.
 
         Args:
-            depth_logits: torch.Tensor(B, D+1, H, W)]: Predicted depth logits
-            gt_boxes2d [torch.Tensor (B, N, 4)]: 2D box labels for foreground/background balancing
-            num_gt_per_img: [int]: Number of ground truth boxes per image
-            gt_center_depth: [torch.Tensor(B, N)]: 3D center depth
+            depth_logits (torch.Tensor): Predicted depth logits.
+            gt_boxes2d (torch.Tensor): 2D box labels for foreground/background balancing.
+            num_gt_per_img (int): Number of ground truth boxes per image.
+            gt_center_depth (torch.Tensor): 3D center depth.
 
         Returns:
-            loss [torch.Tensor(1)]: Depth classification network loss
+            loss (torch.Tensor): Depth classification network loss.
         """
         # Bin depth map to create target
         depth_maps = self.build_target_depth_from_3dcenter(depth_logits, gt_boxes2d, gt_center_depth, num_gt_per_img)
diff --git a/src/otx/algo/object_detection_3d/losses/monodetr_loss.py b/src/otx/algo/object_detection_3d/losses/monodetr_loss.py
index ebc98d45a51..0f2d85d0565 100644
--- a/src/otx/algo/object_detection_3d/losses/monodetr_loss.py
+++ b/src/otx/algo/object_detection_3d/losses/monodetr_loss.py
@@ -29,11 +29,10 @@ def __init__(self, num_classes: int, weight_dict: dict, focal_alpha: float, grou
         """MonoDETRCriterion.
 
         Args:
-            num_classes: number of object categories, omitting the special no-object category
-            matcher: module able to compute a matching between targets and proposals
-            weight_dict: dict containing as key the names of the losses and as values their relative weight.
-            focal_alpha: alpha in Focal Loss
-            group_num: number of groups for data parallelism
+            num_classes (int): number of object categories, omitting the special no-object category.
+            weight_dict (dict): dict containing as key the names of the losses and as values their relative weight.
+            focal_alpha (float): alpha in Focal Loss.
+            group_num (int): number of groups for data parallelism.
         """
         super().__init__()
         self.num_classes = num_classes
@@ -47,7 +46,15 @@ def __init__(self, num_classes: int, weight_dict: dict, focal_alpha: float, grou
         self.group_num = group_num
 
     def loss_labels(self, outputs: dict, targets: list, indices: list, num_boxes: int) -> dict[str, Tensor]:
-        """Classification loss."""
+        """Classification loss.
+
+        Args:
+            outputs (dict): dict of tensors, see the output specification of the model for the format.
+            targets (list): list of dicts, such that len(targets) == batch_size.
+                   The expected keys in each dict depends on the losses applied, see each loss' doc.
+            indices (list): list of tuples, such that len(indices) == batch_size.
+            num_boxes (int): number of boxes in the batch.
+        """
         src_logits = outputs["scores"]
 
         idx = self._get_src_permutation_idx(indices)
@@ -76,7 +83,15 @@ def loss_labels(self, outputs: dict, targets: list, indices: list, num_boxes: in
         return {"loss_ce": loss_ce}
 
     def loss_3dcenter(self, outputs: dict, targets: list, indices: list, num_boxes: int) -> dict[str, Tensor]:
-        """Compute the loss for the 3D center prediction."""
+        """Compute the loss for the 3D center prediction.
+
+        Args:
+            outputs (dict): dict of tensors, see the output specification of the model for the format.
+            targets (list): list of dicts, such that len(targets) == batch_size.
+                   The expected keys in each dict depends on the losses applied, see each loss' doc.
+            indices (list): list of tuples, such that len(indices) == batch_size.
+            num_boxes (int): number of boxes in the batch.
+        """
         idx = self._get_src_permutation_idx(indices)
         src_3dcenter = outputs["boxes_3d"][:, :, 0:2][idx]
         target_3dcenter = torch.cat([t["boxes_3d"][:, 0:2][i] for t, (_, i) in zip(targets, indices)], dim=0)
@@ -85,7 +100,15 @@ def loss_3dcenter(self, outputs: dict, targets: list, indices: list, num_boxes:
         return {"loss_center": loss_3dcenter.sum() / num_boxes}
 
     def loss_boxes(self, outputs: dict, targets: list, indices: list, num_boxes: int) -> dict[str, Tensor]:
-        """Compute l1 loss."""
+        """Compute l1 loss.
+
+        Args:
+            outputs (dict): dict of tensors, see the output specification of the model for the format.
+            targets (list): list of dicts, such that len(targets) == batch_size.
+                   The expected keys in each dict depends on the losses applied, see each loss' doc.
+            indices (list): list of tuples, such that len(indices) == batch_size.
+            num_boxes (int): number of boxes in the batch.
+        """
         idx = self._get_src_permutation_idx(indices)
         src_2dboxes = outputs["boxes_3d"][:, :, 2:6][idx]
         target_2dboxes = torch.cat([t["boxes_3d"][:, 2:6][i] for t, (_, i) in zip(targets, indices)], dim=0)
@@ -95,7 +118,15 @@ def loss_boxes(self, outputs: dict, targets: list, indices: list, num_boxes: int
         return {"loss_bbox": loss_bbox.sum() / num_boxes}
 
     def loss_giou(self, outputs: dict, targets: list, indices: list, num_boxes: int) -> dict[str, Tensor]:
-        """Compute the GIoU loss."""
+        """Compute the GIoU loss.
+
+        Args:
+            outputs (dict): dict of tensors, see the output specification of the model for the format.
+            targets (list): list of dicts, such that len(targets) == batch_size.
+                   The expected keys in each dict depends on the losses applied, see each loss' doc.
+            indices (list): list of tuples, such that len(indices) == batch_size.
+            num_boxes (int): number of boxes in the batch.
+        """
         # giou
         idx = self._get_src_permutation_idx(indices)
         src_boxes = outputs["boxes_3d"][idx]
@@ -104,7 +135,15 @@ def loss_giou(self, outputs: dict, targets: list, indices: list, num_boxes: int)
         return {"loss_giou": loss_giou}
 
     def loss_depths(self, outputs: dict, targets: list, indices: list, num_boxes: int) -> dict[str, Tensor]:
-        """Compute the loss for the depth prediction."""
+        """Compute the loss for the depth prediction.
+
+        Args:
+            outputs (dict): dict of tensors, see the output specification of the model for the format.
+            targets (list): list of dicts, such that len(targets) == batch_size.
+                   The expected keys in each dict depends on the losses applied, see each loss' doc.
+            indices (list): list of tuples, such that len(indices) == batch_size.
+            num_boxes (int): number of boxes in the batch.
+        """
         idx = self._get_src_permutation_idx(indices)
 
         src_depths = outputs["depth"][idx]
@@ -117,7 +156,15 @@ def loss_depths(self, outputs: dict, targets: list, indices: list, num_boxes: in
         return {"loss_depth": depth_loss.sum() / num_boxes}
 
     def loss_dims(self, outputs: dict, targets: list, indices: list, num_boxes: int) -> dict[str, Tensor]:
-        """Compute the loss for the dimension prediction."""
+        """Compute the loss for the dimension prediction.
+
+        Args:
+            outputs (dict): dict of tensors, see the output specification of the model for the format.
+            targets (list): list of dicts, such that len(targets) == batch_size.
+                   The expected keys in each dict depends on the losses applied, see each loss' doc.
+            indices (list): list of tuples, such that len(indices) == batch_size.
+            num_boxes (int): number of boxes in the batch.
+        """
         idx = self._get_src_permutation_idx(indices)
         src_dims = outputs["size_3d"][idx]
         target_dims = torch.cat([t["size_3d"][i] for t, (_, i) in zip(targets, indices)], dim=0)
@@ -131,7 +178,15 @@ def loss_dims(self, outputs: dict, targets: list, indices: list, num_boxes: int)
         return {"loss_dim": dim_loss.sum() / num_boxes}
 
     def loss_angles(self, outputs: dict, targets: list, indices: list, num_boxes: int) -> dict[str, Tensor]:
-        """Compute the loss for the angle prediction."""
+        """Compute the loss for the angle prediction.
+
+        Args:
+            outputs (dict): dict of tensors, see the output specification of the model for the format.
+            targets (list): list of dicts, such that len(targets) == batch_size.
+                   The expected keys in each dict depends on the losses applied, see each loss' doc.
+            indices (list): list of tuples, such that len(indices) == batch_size.
+            num_boxes (int): number of boxes in the batch.
+        """
         idx = self._get_src_permutation_idx(indices)
         heading_input = outputs["heading_angle"][idx]
         target_heading_angle = torch.cat([t["heading_angle"][i] for t, (_, i) in zip(targets, indices)], dim=0)
@@ -158,7 +213,15 @@ def loss_angles(self, outputs: dict, targets: list, indices: list, num_boxes: in
         return {"loss_angle": angle_loss.sum() / num_boxes}
 
     def loss_depth_map(self, outputs: dict, targets: list, indices: list, num_boxes: int) -> dict[str, Tensor]:
-        """Depth map loss."""
+        """Depth map loss.
+
+        Args:
+            outputs (dict): dict of tensors, see the output specification of the model for the format.
+            targets (list): list of dicts, such that len(targets) == batch_size.
+                   The expected keys in each dict depends on the losses applied, see each loss' doc.
+            indices (list): list of tuples, such that len(indices) == batch_size.
+            num_boxes (int): number of boxes in the batch.
+        """
         depth_map_logits = outputs["pred_depth_map_logits"]
 
         num_gt_per_img = [len(t["boxes"]) for t in targets]
@@ -174,6 +237,7 @@ def _get_src_permutation_idx(
         self,
         indices: list[tuple[torch.Tensor, torch.Tensor]],
     ) -> tuple[torch.Tensor, torch.Tensor]:
+        """Get the indices necessary to compute the loss."""
         # permute predictions following indices
         batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)])
         src_idx = torch.cat([src for (src, _) in indices])
@@ -183,6 +247,7 @@ def _get_tgt_permutation_idx(
         self,
         indices: list[tuple[torch.Tensor, torch.Tensor]],
     ) -> tuple[torch.Tensor, torch.Tensor]:
+        """Get the indices necessary to compute the loss."""
         # permute targets following indices
         batch_idx = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)])
         tgt_idx = torch.cat([tgt for (_, tgt) in indices])
@@ -210,9 +275,9 @@ def forward(
         """This performs the loss computation.
 
         Args:
-             outputs: dict of tensors, see the output specification of the model for the format
-             targets: list of dicts, such that len(targets) == batch_size.
-                      The expected keys in each dict depends on the losses applied, see each loss' doc
+             outputs (dict): dict of tensors, see the output specification of the model for the format.
+             targets (list): list of dicts, such that len(targets) == batch_size.
+                      The expected keys in each dict depends on the losses applied, see each loss' doc.
         """
         outputs_without_aux = {k: v for k, v in outputs.items() if k != "aux_outputs"}
         group_num = self.group_num if self.training else 1
diff --git a/src/otx/algo/object_detection_3d/monodetr3d.py b/src/otx/algo/object_detection_3d/monodetr3d.py
index f0f95ea714f..18d3c072556 100644
--- a/src/otx/algo/object_detection_3d/monodetr3d.py
+++ b/src/otx/algo/object_detection_3d/monodetr3d.py
@@ -7,19 +7,13 @@
 
 from typing import Any
 
-import numpy as np
 import torch
-from torch import Tensor
-from torchvision.ops import box_convert
 
 from otx.algo.object_detection_3d.backbones.monodetr_resnet import BackboneBuilder
 from otx.algo.object_detection_3d.detectors.monodetr import MonoDETR
 from otx.algo.object_detection_3d.heads.depth_predictor import DepthPredictor
 from otx.algo.object_detection_3d.heads.depthaware_transformer import DepthAwareTransformerBuilder
 from otx.algo.object_detection_3d.losses import MonoDETRCriterion
-from otx.algo.object_detection_3d.utils.utils import box_cxcylrtb_to_xyxy
-from otx.core.data.entity.base import OTXBatchLossEntity
-from otx.core.data.entity.object_detection_3d import Det3DBatchDataEntity, Det3DBatchPredEntity
 from otx.core.exporter.base import OTXModelExporter
 from otx.core.exporter.detection_3d import OTXObjectDetection3DExporter
 from otx.core.model.detection_3d import OTX3DDetectionModel
@@ -30,7 +24,6 @@ class MonoDETR3D(OTX3DDetectionModel):
 
     mean: tuple[float, float, float] = (123.675, 116.28, 103.53)
     std: tuple[float, float, float] = (58.395, 57.12, 57.375)
-    input_size: tuple[int, int] = (384, 1280)  # HxW
     load_from: str | None = None
 
     def _build_model(self, num_classes: int) -> MonoDETR:
@@ -62,73 +55,6 @@ def _build_model(self, num_classes: int) -> MonoDETR:
             init_box=False,
         )
 
-    def _customize_inputs(
-        self,
-        entity: Det3DBatchDataEntity,
-    ) -> dict[str, Any]:
-        # prepare bboxes for the model
-        targets_list = []
-        img_sizes = torch.from_numpy(np.array([img_info.ori_shape for img_info in entity.imgs_info])).to(
-            device=entity.images.device,
-        )
-        key_list = ["labels", "boxes", "depth", "size_3d", "heading_angle", "boxes_3d"]
-        for bz in range(len(entity.imgs_info)):
-            target_dict = {}
-            for key in key_list:
-                target_dict[key] = getattr(entity, key)[bz]
-            targets_list.append(target_dict)
-
-        return {
-            "images": entity.images,
-            "calibs": torch.cat([p2.unsqueeze(0) for p2 in entity.calib_matrix], dim=0),
-            "targets": targets_list,
-            "img_sizes": img_sizes,
-            "mode": "loss" if self.training else "predict",
-        }
-
-    def _customize_outputs(
-        self,
-        outputs: dict[str, torch.Tensor],
-        inputs: Det3DBatchDataEntity,
-    ) -> Det3DBatchPredEntity | OTXBatchLossEntity:
-        if self.training:
-            if not isinstance(outputs, dict):
-                raise TypeError(outputs)
-
-            losses = OTXBatchLossEntity()
-            for k, v in outputs.items():
-                if isinstance(v, list):
-                    losses[k] = sum(v)
-                elif isinstance(v, Tensor):
-                    losses[k] = v
-                else:
-                    msg = "Loss output should be list or torch.tensor but got {type(v)}"
-                    raise TypeError(msg)
-            return losses
-
-        labels, scores, size_3d, heading_angle, boxes_3d, depth = self.extract_dets_from_outputs(outputs)
-        # bbox 2d decoding
-        boxes_2d = box_cxcylrtb_to_xyxy(boxes_3d)
-        xywh_2d = box_convert(boxes_2d, "xyxy", "cxcywh")
-        # size 2d decoding
-        size_2d = xywh_2d[:, :, 2:4]
-
-        return Det3DBatchPredEntity(
-            batch_size=inputs.batch_size,
-            images=inputs.images,
-            imgs_info=inputs.imgs_info,
-            calib_matrix=inputs.calib_matrix,
-            boxes=boxes_2d,
-            labels=labels,
-            boxes_3d=boxes_3d,
-            size_2d=size_2d,
-            size_3d=size_3d,
-            depth=depth,
-            heading_angle=heading_angle,
-            scores=scores,
-            original_kitti_format=[None],
-        )
-
     def configure_optimizers(self) -> tuple[list[torch.optim.Optimizer], list[dict[str, Any]]]:
         """Configure an optimizer and learning-rate schedulers.
 
diff --git a/src/otx/core/data/entity/object_detection_3d.py b/src/otx/core/data/entity/object_detection_3d.py
index 564ea283a60..8be60c089fb 100644
--- a/src/otx/core/data/entity/object_detection_3d.py
+++ b/src/otx/core/data/entity/object_detection_3d.py
@@ -26,11 +26,18 @@
 @register_pytree_node
 @dataclass
 class Det3DDataEntity(OTXDataEntity):
-    """Data entity for detection task.
+    """Data entity for 3d object detection task.
+
+    :param boxes: (tv_tensors.BoundingBoxes) The bounding boxes for the objects in the image.
+    :param calib_matrix: (Tensor) The calibration matrix for the 3D object detection.
+    :param boxes_3d: (Tensor) The 3D bounding boxes for the objects.
+    :param size_2d: (Tensor) The 2D size of the objects.
+    :param size_3d: (Tensor) The 3D size of the objects.
+    :param depth: (Tensor) The depth of the objects.
+    :param heading_angle: (Tensor) The heading angle of the objects.
+    :param labels: (LongTensor) The labels of the objects.
+    :param original_kitti_format: (list[dict[str, Any]] | None) The original KITTI format of the objects, if available.
 
-    :param bboxes: Bbox annotations as top-left-bottom-right
-        (x1, y1, x2, y2) format with absolute coordinate values
-    :param labels: Bbox labels as integer indices
     """
 
     @property
@@ -51,17 +58,24 @@ def task(self) -> OTXTaskType:
 
 @dataclass
 class Det3DPredEntity(OTXPredEntity, Det3DDataEntity):
-    """Data entity to represent the detection model output prediction."""
+    """Data entity to represent the 3d object detection model output prediction."""
 
 
 @dataclass
 class Det3DBatchDataEntity(OTXBatchDataEntity[Det3DDataEntity]):
-    """Data entity for detection task.
-
-    :param bboxes: A list of bbox annotations as top-left-bottom-right
-        (x1, y1, x2, y2) format with absolute coordinate values
-    :param labels: A list of bbox labels as integer indices
-    """  # TODO(Kirill): UPDATE!
+    """Data entity for 3d object detection task.
+
+    :param boxes: (list[tv_tensors.BoundingBoxes]) The bounding boxes for the objects in the image.
+    :param calib_matrix: (list[Tensor]) The calibration matrix for the 3D object detection.
+    :param boxes_3d: (list[Tensor]) The 3D bounding boxes for the objects.
+    :param size_2d: (list[Tensor]) The 2D size of the objects.
+    :param size_3d: (list[Tensor]) The 3D size of the objects.
+    :param depth: (list[Tensor]) The depth of the objects.
+    :param heading_angle: (list[Tensor]) The heading angle of the objects.
+    :param labels: (list[LongTensor]) The labels of the objects.
+    :param original_kitti_format: (list[list[dict[str, Any]] | None]) The original KITTI format of the objects,
+        if available. Needed for validation and KITTI metric.
+    """
 
     images: Tensor
     boxes: list[tv_tensors.BoundingBoxes]
@@ -135,7 +149,7 @@ def pin_memory(self) -> Det3DBatchDataEntity:
 
 @dataclass
 class Det3DBatchPredEntity(OTXBatchPredEntity, Det3DBatchDataEntity):
-    """Data entity to represent model output predictions for detection task."""
+    """Data entity to represent model output predictions for 3d object detection task."""
 
     boxes: tv_tensors.BoundingBoxes
     scores: Tensor
diff --git a/src/otx/core/exporter/base.py b/src/otx/core/exporter/base.py
index 85d77fe4799..cfbc670e58e 100644
--- a/src/otx/core/exporter/base.py
+++ b/src/otx/core/exporter/base.py
@@ -45,6 +45,9 @@ class OTXModelExporter:
         output_names (list[str] | None, optional): Names for model's outputs, which would be
         embedded into resulting model. Note, that order of the output names should be the same,
         as in the target model.
+        input_names (list[str] | None, optional): Names for model's inputs, which would be
+        embedded into resulting model. Note, that order of the input names should be the same,
+        as in the target model.
     """
 
     def __init__(
diff --git a/src/otx/core/metrics/average_precision_3d.py b/src/otx/core/metrics/average_precision_3d.py
index 7b8530ba684..2600200280b 100644
--- a/src/otx/core/metrics/average_precision_3d.py
+++ b/src/otx/core/metrics/average_precision_3d.py
@@ -7,8 +7,10 @@
 
 from typing import TYPE_CHECKING
 
+import torch
 from torch import Tensor
 from torchmetrics import Metric
+from torchmetrics.detection.mean_ap import MeanAveragePrecision
 
 from otx.core.metrics.kitti_3d_eval import get_coco_eval_result
 
@@ -32,6 +34,7 @@ def __init__(
         super().__init__()
 
         self.label_info: LabelInfo = label_info
+        self.mean_ap: MeanAveragePrecision = MeanAveragePrecision(box_format="xyxy", iou_type="bbox")
         self.reset()
 
     def reset(self) -> None:
@@ -42,6 +45,7 @@ def reset(self) -> None:
         super().reset()
         self.preds: list[dict[str, np.array]] = []
         self.targets: list[dict[str, np.array]] = []
+        self.mean_ap.reset()
 
     def update(self, preds: list[dict[str, Tensor]], target: list[dict[str, Tensor]]) -> None:
         """Update total predictions and targets from given batch predicitons and targets."""
@@ -51,13 +55,35 @@ def update(self, preds: list[dict[str, Tensor]], target: list[dict[str, Tensor]]
     def compute(self) -> dict:
         """Compute metrics for 3d object detection."""
         current_classes = self.label_info.label_names
-        map_bbox, map_3d = get_coco_eval_result(
+        preds_for_torchmetrics = self.prepare_inputs_for_map_coco(self.preds)
+        targets_for_torchmetrics = self.prepare_inputs_for_map_coco(self.targets)
+        ap_bbox_coco = self.mean_ap(preds_for_torchmetrics, targets_for_torchmetrics)
+        ap_3d = get_coco_eval_result(
             self.targets,
             self.preds,
             current_classes=[curcls.lower() for curcls in current_classes],
         )
-        # use moderate difficulty as final score. Average across all calsses.
-        return {"mAP_bbox_3d": Tensor([map_3d[:, 1].mean()]), "mAP_bbox_2d": Tensor([map_bbox[:, 1].mean()])}
+        # Average across all classes.
+        return {
+            "AP_3d@0.5": Tensor([ap_3d[0]]),
+            "AP_2d@0.5": ap_bbox_coco["map_50"],
+            "mAP_3d": Tensor([ap_3d.mean()]),
+            "mAP_2d": ap_bbox_coco["map"],
+        }
+
+    def prepare_inputs_for_map_coco(self, targets: list[dict[str, np.array]]) -> list[dict[str, Tensor]]:
+        """Convert stored predictions or targets into per-image dicts for torchmetrics."""
+        return [
+            {
+                "boxes": torch.tensor(target["bbox"]),
+                "scores": torch.tensor(target["score"]) if "score" in target else None,
+                "labels": torch.tensor(
+                    [self.label_info.label_names.index(label) for label in target["name"]],
+                    dtype=torch.long,
+                ),
+            }
+            for target in targets
+        ]
 
 
 def _kitti_metric_measure_callable(label_info: LabelInfo) -> KittiMetric:
diff --git a/src/otx/core/metrics/kitti_3d_eval/eval.py b/src/otx/core/metrics/kitti_3d_eval/eval.py
index 86f634243a4..34144fa4797 100644
--- a/src/otx/core/metrics/kitti_3d_eval/eval.py
+++ b/src/otx/core/metrics/kitti_3d_eval/eval.py
@@ -6,6 +6,7 @@
 
 from __future__ import annotations
 
+import logging
 from typing import Any
 
 import numba
@@ -18,44 +19,11 @@
     from .rotate_iou import rotate_iou_eval_cpu as rotate_iou_eval
 
 
-@numba.jit(nopython=True)
-def get_thresholds(
-    scores: np.ndarray,  # 1D array of confidence scores
-    num_gt: int,  # Number of ground truth objects
-    num_sample_pts: int = 41,  # Number of sample points used to compute recall thresholds
-) -> np.ndarray:  # 1D array of recall thresholds
-    """Compute recall thresholds for a given score array.
-
-    Args:
-        scores (np.ndarray): 1D array of confidence scores.
-        num_gt (int): Number of ground truth objects.
-        num_sample_pts (int, optional): Number of sample points used to
-            compute recall thresholds. Defaults to 41.
-
-    Returns:
-        np.ndarray: 1D array of recall thresholds.
-    """
-    scores.sort()
-    scores = scores[::-1]
-    current_recall = 0.0
-    thresholds = []
-    for i, score in enumerate(scores):
-        l_recall = (i + 1) / num_gt
-        r_recall = (i + 2) / num_gt if i < len(scores) - 1 else l_recall
-        if ((r_recall - current_recall) < (current_recall - l_recall)) and (i < (len(scores) - 1)):
-            continue
-        # recall = l_recall
-        thresholds.append(score)
-        current_recall += 1 / (num_sample_pts - 1.0)
-    return thresholds
-
-
 def clean_data(
     gt_anno: dict,  # ground truth annotations
     dt_anno: dict,  # detection results
     current_class: str,  # the current class name
-    difficulty: int,  # the difficulty level
-) -> tuple:  # (num_valid_gt, ignored_gt, ignored_dt, dc_bboxes)
+) -> tuple:  # (num_valid_gt, ignored_gt, ignored_dt)
     """Filter out the objects that are not in the current class.
 
     Args:
@@ -65,12 +33,12 @@ def clean_data(
         difficulty (int): The difficulty level.
 
     Returns:
-        tuple: The number of valid objects, ignored_gt, ignored_dt, and dc_bboxes.
+        tuple: The number of valid objects, ignored_gt, ignored_dt.
     """
-    min_height = [40, 25, 25]
-    max_occlusion = [0, 1, 2]
-    max_truncation = [0.15, 0.3, 0.5]
-    dc_bboxes, ignored_gt, ignored_dt = [], [], []
+    min_height = 20
+    max_occlusion = 2
+    max_truncation = 0.5
+    ignored_gt, ignored_dt = [], []
     num_gt = len(gt_anno["name"])
     num_dt = len(dt_anno["name"])
     num_valid_gt = 0
@@ -89,11 +57,10 @@ def clean_data(
             valid_class = -1
         ignore = False
         if (
-            (gt_anno["occluded"][i] > max_occlusion[difficulty])
-            or (gt_anno["truncated"][i] > max_truncation[difficulty])
-            or (height <= min_height[difficulty])
-        ):
-            # if gt_anno["difficulty"][i] > difficulty or gt_anno["difficulty"][i] == -1:
+            (gt_anno["occluded"][i] > max_occlusion)
+            or (gt_anno["truncated"][i] > max_truncation)
+            or (height <= min_height)
+        ):  # filter out extreme cases
             ignore = True
         if valid_class == 1 and not ignore:
             ignored_gt.append(0)
@@ -102,59 +69,18 @@ def clean_data(
             ignored_gt.append(1)
         else:
             ignored_gt.append(-1)
-        # for i in range(num_gt):
-        if gt_anno["name"][i] == "dontcare":
-            dc_bboxes.append(gt_anno["bbox"][i])
+
     for i in range(num_dt):
         valid_class = 1 if dt_anno["name"][i].lower() == current_class else -1
         height = abs(dt_anno["bbox"][i, 3] - dt_anno["bbox"][i, 1])
-        if height < min_height[difficulty]:
+        if height < min_height:
             ignored_dt.append(1)
         elif valid_class == 1:
             ignored_dt.append(0)
         else:
             ignored_dt.append(-1)
 
-    return num_valid_gt, ignored_gt, ignored_dt, dc_bboxes
-
-
-@numba.jit(nopython=True)
-def image_box_overlap(
-    boxes: np.ndarray,  # shape: (N, 4)
-    query_boxes: np.ndarray,  # shape: (K, 4)
-    criterion: int = -1,  # default overlap criterion: intersection over union
-) -> np.ndarray:  # shape: (N, K)
-    """Image box overlap.
-
-    Args:
-        boxes (np.ndarray): shape: (N, 4), 2D boxes, (x1, y1, x2, y2)
-        query_boxes (np.ndarray): shape: (K, 4), 2D boxes, (x1, y1, x2, y2)
-        criterion (int, optional): overlap criterion, -1: intersection over union,
-            0: intersection over box area, 1: intersection over query box area. Defaults to -1.
-
-    Returns:
-        np.ndarray: shape: (N, K), overlap between boxes and query_boxes
-    """
-    num_n = boxes.shape[0]
-    num_k = query_boxes.shape[0]
-    overlaps = np.zeros((num_n, num_k), dtype=boxes.dtype)
-    for k in range(num_k):
-        qbox_area = (query_boxes[k, 2] - query_boxes[k, 0]) * (query_boxes[k, 3] - query_boxes[k, 1])
-        for n in range(num_n):
-            iw = min(boxes[n, 2], query_boxes[k, 2]) - max(boxes[n, 0], query_boxes[k, 0])
-            if iw > 0:
-                ih = min(boxes[n, 3], query_boxes[k, 3]) - max(boxes[n, 1], query_boxes[k, 1])
-                if ih > 0:
-                    if criterion == -1:
-                        ua = (boxes[n, 2] - boxes[n, 0]) * (boxes[n, 3] - boxes[n, 1]) + qbox_area - iw * ih
-                    elif criterion == 0:
-                        ua = (boxes[n, 2] - boxes[n, 0]) * (boxes[n, 3] - boxes[n, 1])
-                    elif criterion == 1:
-                        ua = qbox_area
-                    else:
-                        ua = 1.0
-                    overlaps[n, k] = iw * ih / ua
-    return overlaps
+    return num_valid_gt, ignored_gt, ignored_dt
 
 
 @numba.jit(nopython=True)
@@ -184,8 +110,6 @@ def d3_box_overlap_kernel(
     for i in range(n):
         for j in range(k):
             if rinc[i, j] > 0:
-                # iw = (min(boxes[i, 1] + boxes[i, 4], qboxes[j, 1] +
-                #         qboxes[j, 4]) - max(boxes[i, 1], qboxes[j, 1]))
                 iw = min(boxes[i, 1], qboxes[j, 1]) - max(boxes[i, 1] - boxes[i, 4], qboxes[j, 1] - qboxes[j, 4])
 
                 if iw > 0:
@@ -206,14 +130,12 @@ def d3_box_overlap_kernel(
 
 
 @numba.jit(nopython=True)
-def compute_statistics_jit(  # noqa: C901
+def compute_statistics_jit(
     overlaps: np.ndarray,  # shape: (total_dt_num, total_gt_num)
     gt_datas: np.ndarray,  # shape: (total_gt_num, 7)
     dt_datas: np.ndarray,  # shape: (total_dt_num, 7)
     ignored_gt: list[int],  # shape: (total_gt_num)
     ignored_det: list[int],  # shape: (total_dt_num)
-    dc_bboxes: np.ndarray,  # shape: (total_dc_num, 4)
-    metric: int,
     min_overlap: float,
     thresh: float = 0,
     compute_fp: bool = False,
@@ -226,8 +148,6 @@ def compute_statistics_jit(  # noqa: C901
         dt_datas (np.ndarray): Detection data.
         ignored_gt (List[int]): Ignore ground truth indices.
         ignored_det (List[int]): Ignore detection indices.
-        dc_bboxes (np.ndarray): Don't care bboxes.
-        metric (int): Evaluation metric.
         min_overlap (float): Minimum overlap between dt and gt bboxes.
         thresh (float): Detection score threshold. Defaults to 0.
         compute_fp (bool): Whether to compute false positives. Defaults to False.
@@ -238,17 +158,16 @@ def compute_statistics_jit(  # noqa: C901
     det_size = dt_datas.shape[0]
     gt_size = gt_datas.shape[0]
     dt_scores = dt_datas[:, -1]
-    dt_bboxes = dt_datas[:, :4]
 
     assigned_detection = [False] * det_size
-    ignored_threshold = [False] * det_size
+    ignored_obj_by_threshold = [False] * det_size
     if compute_fp:
         for i in range(det_size):
             if dt_scores[i] < thresh:
-                ignored_threshold[i] = True
+                ignored_obj_by_threshold[i] = True
     no_detection = -10000000
     tp, fp, fn, similarity = 0, 0, 0, 0
-    thresholds = np.zeros((gt_size,))
+    tp_scores = np.zeros((gt_size,))
     thresh_idx = 0
     for i in range(gt_size):
         if ignored_gt[i] == -1:
@@ -263,7 +182,7 @@ def compute_statistics_jit(  # noqa: C901
                 continue
             if assigned_detection[j]:
                 continue
-            if ignored_threshold[j]:
+            if ignored_obj_by_threshold[j]:
                 continue
             overlap = overlaps[j, i]
             dt_score = dt_scores[j]
@@ -291,32 +210,21 @@ def compute_statistics_jit(  # noqa: C901
             assigned_detection[det_idx] = True
         elif valid_detection != no_detection:
             tp += 1
-            # thresholds.append(dt_scores[det_idx])
-            thresholds[thresh_idx] = dt_scores[det_idx]
+
+            tp_scores[thresh_idx] = dt_scores[det_idx]
             thresh_idx += 1
 
             assigned_detection[det_idx] = True
     if compute_fp:
         for i in range(det_size):
-            if not (assigned_detection[i] or ignored_det[i] == -1 or ignored_det[i] == 1 or ignored_threshold[i]):
+            if not (
+                assigned_detection[i] or ignored_det[i] == -1 or ignored_det[i] == 1 or ignored_obj_by_threshold[i]
+            ):
                 fp += 1
         nstuff = 0
-        if metric == 0:
-            overlaps_dt_dc = image_box_overlap(dt_bboxes, dc_bboxes, 0)
-            for i in range(dc_bboxes.shape[0]):
-                for j in range(det_size):
-                    if assigned_detection[j]:
-                        continue
-                    if ignored_det[j] == -1 or ignored_det[j] == 1:
-                        continue
-                    if ignored_threshold[j]:
-                        continue
-                    if overlaps_dt_dc[j, i] > min_overlap:
-                        assigned_detection[j] = True
-                        nstuff += 1
         fp -= nstuff
 
-    return tp, fp, fn, similarity, thresholds[:thresh_idx]
+    return tp, fp, fn, similarity, tp_scores[:thresh_idx]
 
 
 @numba.jit(nopython=True)
@@ -346,13 +254,10 @@ def fused_compute_statistics(
     pr: np.ndarray,  # shape: (num_thresholds, 4)
     gt_nums: np.ndarray,  # shape: (num_samples)
     dt_nums: np.ndarray,  # shape: (num_samples)
-    dc_nums: np.ndarray,  # shape: (num_samples)
     gt_datas: np.ndarray,  # shape: (total_gt_num, 7)
     dt_datas: np.ndarray,  # shape: (total_dt_num, 7)
-    dontcares: np.ndarray,  # shape: (total_dc_num, 4)
     ignored_gts: np.ndarray,  # shape: (total_gt_num)
     ignored_dets: np.ndarray,  # shape: (total_dt_num)
-    metric: int,
     min_overlap: float,
     thresholds: np.ndarray,  # shape: (num_thresholds)
 ) -> None:
@@ -371,26 +276,20 @@ def fused_compute_statistics(
             gt_nums[i] is the number of ground truths in i-th sample
         dt_nums (np.ndarray): 1D array of shape (num_samples),
             dt_nums[i] is the number of detections in i-th sample
-        dc_nums (np.ndarray): 1D array of shape (num_samples),
-            dc_nums[i] is the number of dontcare areas in i-th sample
         gt_datas (np.ndarray): 2D array of shape (total_gt_num, 7),
             gt_datas[i] is the i-th ground truth box
         dt_datas (np.ndarray): 2D array of shape (total_dt_num, 7),
             dt_datas[i] is the i-th detection box
-        dontcares (np.ndarray): 2D array of shape (total_dc_num, 4),
-            dontcares[i] is the i-th dontcare area
         ignored_gts (np.ndarray): 1D array of shape (total_gt_num),
             ignored_gts[i] is 1 if the i-th ground truth is ignored, 0 otherwise
         ignored_dets (np.ndarray): 1D array of shape (total_dt_num),
             ignored_dets[i] is 1 if the i-th detection is ignored, 0 otherwise
-        metric (int): Eval type. 0: bbox, 1: bev, 2: 3d
         min_overlap (float): Min overlap
         thresholds (np.ndarray): 1D array of shape (num_thresholds),
             thresholds[i] is the i-th threshold
     """
     gt_num = 0
     dt_num = 0
-    dc_num = 0
     for i in range(gt_nums.shape[0]):
         for t, thresh in enumerate(thresholds):
             overlap = overlaps[dt_num : dt_num + dt_nums[i], gt_num : gt_num + gt_nums[i]]
@@ -398,15 +297,12 @@ def fused_compute_statistics(
             dt_data = dt_datas[dt_num : dt_num + dt_nums[i]]
             ignored_gt = ignored_gts[gt_num : gt_num + gt_nums[i]]
             ignored_det = ignored_dets[dt_num : dt_num + dt_nums[i]]
-            dontcare = dontcares[dc_num : dc_num + dc_nums[i]]
             tp, fp, fn, similarity, _ = compute_statistics_jit(
                 overlap,
                 gt_data,
                 dt_data,
                 ignored_gt,
                 ignored_det,
-                dontcare,
-                metric,
                 min_overlap=min_overlap,
                 thresh=thresh,
                 compute_fp=True,
@@ -418,13 +314,11 @@ def fused_compute_statistics(
                 pr[t, 3] += similarity
         gt_num += gt_nums[i]
         dt_num += dt_nums[i]
-        dc_num += dc_nums[i]
 
 
 def calculate_iou_partly(
     gt_annos: list[dict[str, Any]],
     dt_annos: list[dict[str, Any]],
-    metric: int,
     num_parts: int = 50,
 ) -> tuple[list[np.ndarray], list[np.ndarray], np.ndarray, np.ndarray]:
     """Fast iou algorithm.
@@ -435,7 +329,6 @@ def calculate_iou_partly(
     Args:
         gt_annos: List of dict, must from get_label_annos() in kitti_common.py
         dt_annos: List of dict, must from get_label_annos() in kitti_common.py
-        metric: Eval type. 0: bbox, 1: bev, 2: 3d
         num_parts: Int, a parameter for fast calculate algorithm
 
     Returns:
@@ -478,23 +371,17 @@ def d3_box_overlap(boxes: np.ndarray, qboxes: np.ndarray, criterion: int = -1) -
     for num_part in split_parts:
         gt_annos_part = gt_annos[example_idx : example_idx + num_part]
         dt_annos_part = dt_annos[example_idx : example_idx + num_part]
-        if metric == 0:
-            gt_boxes = np.concatenate([a["bbox"] for a in gt_annos_part], 0)
-            dt_boxes = np.concatenate([a["bbox"] for a in dt_annos_part], 0)
-            overlap_part = image_box_overlap(gt_boxes, dt_boxes)
-        elif metric == 2:
-            loc = np.concatenate([a["location"] for a in gt_annos_part], 0)
-            dims = np.concatenate([a["dimensions"] for a in gt_annos_part], 0)
-            rots = np.concatenate([a["rotation_y"] for a in gt_annos_part], 0)
-            gt_boxes = np.concatenate([loc, dims, rots[..., np.newaxis]], axis=1)
-            loc = np.concatenate([a["location"] for a in dt_annos_part], 0)
-            dims = np.concatenate([a["dimensions"] for a in dt_annos_part], 0)
-            rots = np.concatenate([a["rotation_y"] for a in dt_annos_part], 0)
-            dt_boxes = np.concatenate([loc, dims, rots[..., np.newaxis]], axis=1)
-            overlap_part = d3_box_overlap(gt_boxes, dt_boxes).astype(np.float64)
-        else:
-            msg = "unknown metric"
-            raise ValueError(msg)
+
+        loc = np.concatenate([a["location"] for a in gt_annos_part], 0)
+        dims = np.concatenate([a["dimensions"] for a in gt_annos_part], 0)
+        rots = np.concatenate([a["rotation_y"] for a in gt_annos_part], 0)
+        gt_boxes = np.concatenate([loc, dims, rots[..., np.newaxis]], axis=1)
+        loc = np.concatenate([a["location"] for a in dt_annos_part], 0)
+        dims = np.concatenate([a["dimensions"] for a in dt_annos_part], 0)
+        rots = np.concatenate([a["rotation_y"] for a in dt_annos_part], 0)
+        dt_boxes = np.concatenate([loc, dims, rots[..., np.newaxis]], axis=1)
+        overlap_part = d3_box_overlap(gt_boxes, dt_boxes).astype(np.float64)
+
         parted_overlaps.append(overlap_part)
         example_idx += num_part
     overlaps = []
@@ -520,38 +407,30 @@ def _prepare_data(
     gt_annos: list[dict[str, Any]],
     dt_annos: list[dict[str, Any]],
     current_class: str,
-    difficulty: int,
-) -> tuple[list[np.ndarray], list[np.ndarray], list[np.ndarray], list[np.ndarray], list[np.ndarray], np.ndarray, int]:
+) -> tuple[list[np.ndarray], list[np.ndarray], list[np.ndarray], list[np.ndarray], int]:
     """Prepare data for evaluation.
 
     Args:
         gt_annos (List[Dict[str, Any]]): Ground truth annotations.
         dt_annos (List[Dict[str, Any]]): Detection annotations.
         current_class (str): Current class name.
-        difficulty (int): Difficulty level.
 
     Returns:
         Tuple[List[np.ndarray], List[np.ndarray], List[np.ndarray],
-        List[np.ndarray], List[np.ndarray], np.ndarray, int]:
+        List[np.ndarray], int]:
             gt_datas_list, dt_datas_list, ignored_gts, ignored_dets,
-            dontcares, total_dc_num, total_num_valid_gt
+            total_num_valid_gt
     """
     gt_datas_list = []
     dt_datas_list = []
-    total_dc_num = []
-    ignored_gts, ignored_dets, dontcares = [], [], []
+    ignored_gts, ignored_dets = [], []
     total_num_valid_gt = 0
     for i in range(len(gt_annos)):
-        rets = clean_data(gt_annos[i], dt_annos[i], current_class, difficulty)
-        num_valid_gt, ignored_gt, ignored_det, dc_bboxes = rets
+        rets = clean_data(gt_annos[i], dt_annos[i], current_class)
+        num_valid_gt, ignored_gt, ignored_det = rets
         ignored_gts.append(np.array(ignored_gt, dtype=np.int64))
         ignored_dets.append(np.array(ignored_det, dtype=np.int64))
-        if len(dc_bboxes) == 0:
-            dc_bboxes = np.zeros((0, 4)).astype(np.float64)
-        else:
-            dc_bboxes = np.stack(dc_bboxes, 0).astype(np.float64)
-        total_dc_num.append(dc_bboxes.shape[0])
-        dontcares.append(dc_bboxes)
+
         total_num_valid_gt += num_valid_gt
         gt_datas = np.concatenate([gt_annos[i]["bbox"], gt_annos[i]["alpha"][..., np.newaxis]], 1)
         dt_datas = np.concatenate(
@@ -564,112 +443,94 @@ def _prepare_data(
         )
         gt_datas_list.append(gt_datas)
         dt_datas_list.append(dt_datas)
-    total_dc_num = np.stack(total_dc_num, axis=0)
-    return (gt_datas_list, dt_datas_list, ignored_gts, ignored_dets, dontcares, total_dc_num, total_num_valid_gt)
+
+    return (gt_datas_list, dt_datas_list, ignored_gts, ignored_dets, total_num_valid_gt)
 
 
 def eval_class(
     gt_annos: list[dict[str, Any]],
     dt_annos: list[dict[str, Any]],
     current_classes: list[str],
-    difficultys: list[int],
-    metric: int,
     min_overlaps: np.ndarray,
     num_parts: int = 50,
+    num_samples_pts: int = 41,
 ) -> dict[str, np.ndarray]:
     """Kitti eval. support 2d/bev/3d/aos eval. support 0.5:0.05:0.95 coco AP.
 
     Args:
-        gt_annos: dict, must from get_label_annos() in kitti_common.py
-        dt_annos: dict, must from get_label_annos() in kitti_common.py
-        current_classes: list of label names
-        difficultys: list of int. eval difficulty, 0: easy, 1: normal, 2: hard
-        metric: eval type. 0: bbox, 1: bev, 2: 3d
-        min_overlaps: float, min overlap. format: [num_overlap, metric, class].
-        num_parts: int. a parameter for fast calculate algorithm
+        gt_annos (list[dict]): must come from get_label_annos() in kitti_common.py
+        dt_annos (list[dict]): must come from get_label_annos() in kitti_common.py
+        current_classes (list[str]): label names
+        min_overlaps (np.ndarray): min overlap. format: [num_overlap, class].
+        num_parts (int): a parameter for fast calculate algorithm
+        num_samples_pts (int): number of points for precision-recall curve
 
     Returns:
-        dict of recall, precision and aos
+        dict of recall, precision
     """
     num_examples = len(gt_annos)
     split_parts = get_split_parts(num_examples, num_parts)
 
-    part_calculated = calculate_iou_partly(dt_annos, gt_annos, metric, num_parts)
-    overlaps, parted_overlaps, total_dt_num, total_gt_num = part_calculated
-    num_samples_pts = 41  # TODO(Kirill): why it is 41?
-    # The validation with 1-40 examples are not possible corecctly
+    rets = calculate_iou_partly(dt_annos, gt_annos, num_parts)
+    overlaps, parted_overlaps, total_dt_num, total_gt_num = rets
     num_minoverlap = len(min_overlaps)
     num_class = len(current_classes)
-    num_difficulty = len(difficultys)
-    precision = np.zeros([num_class, num_difficulty, num_minoverlap, num_samples_pts])
-    recall = np.zeros([num_class, num_difficulty, num_minoverlap, num_samples_pts])
-    aos = np.zeros([num_class, num_difficulty, num_minoverlap, num_samples_pts])
+    precision = np.zeros([num_class, num_minoverlap, num_samples_pts])
+    recall = np.zeros([num_class, num_minoverlap, num_samples_pts])
     for m, current_class in enumerate(current_classes):
-        for d, difficulty in enumerate(difficultys):
-            (
-                gt_datas_list,
-                dt_datas_list,
-                ignored_gts,
-                ignored_dets,
-                dontcares,
-                total_dc_num,
-                total_num_valid_gt,
-            ) = _prepare_data(gt_annos, dt_annos, current_class, difficulty)
-            for k, min_overlap in enumerate(min_overlaps[:, metric, m]):
-                thresholdss = []
-                for i in range(len(gt_annos)):
-                    tp, fp, fn, similarity, thresholds = compute_statistics_jit(
-                        overlaps[i],
-                        gt_datas_list[i],
-                        dt_datas_list[i],
-                        ignored_gts[i],
-                        ignored_dets[i],
-                        dontcares[i],
-                        metric,
-                        min_overlap=min_overlap,
-                        thresh=0.0,
-                        compute_fp=False,
-                    )
-                    thresholdss += thresholds.tolist()
-                thresholdss = np.array(thresholdss)
-                thresholds = get_thresholds(thresholdss, total_num_valid_gt)
-                thresholds = np.array(thresholds)
-                pr = np.zeros([len(thresholds), 4])
-                idx = 0
-                for j, num_part in enumerate(split_parts):
-                    gt_datas_part = np.concatenate(gt_datas_list[idx : idx + num_part], 0)
-                    dt_datas_part = np.concatenate(dt_datas_list[idx : idx + num_part], 0)
-                    dc_datas_part = np.concatenate(dontcares[idx : idx + num_part], 0)
-                    ignored_dets_part = np.concatenate(ignored_dets[idx : idx + num_part], 0)
-                    ignored_gts_part = np.concatenate(ignored_gts[idx : idx + num_part], 0)
-                    fused_compute_statistics(
-                        parted_overlaps[j],
-                        pr,
-                        total_gt_num[idx : idx + num_part],
-                        total_dt_num[idx : idx + num_part],
-                        total_dc_num[idx : idx + num_part],
-                        gt_datas_part,
-                        dt_datas_part,
-                        dc_datas_part,
-                        ignored_gts_part,
-                        ignored_dets_part,
-                        metric,
-                        min_overlap=min_overlap,
-                        thresholds=thresholds,
-                    )
-                    idx += num_part
-                for i in range(len(thresholds)):
-                    recall[m, d, k, i] = pr[i, 0] / (pr[i, 0] + pr[i, 2])
-                    precision[m, d, k, i] = pr[i, 0] / (pr[i, 0] + pr[i, 1])
-
-                for i in range(len(thresholds)):
-                    precision[m, d, k, i] = np.max(precision[m, d, k, i:], axis=-1)
-                    recall[m, d, k, i] = np.max(recall[m, d, k, i:], axis=-1)
+        (
+            gt_datas_list,
+            dt_datas_list,
+            ignored_gts,
+            ignored_dets,
+            total_num_valid_gt,
+        ) = _prepare_data(gt_annos, dt_annos, current_class)
+        for k, min_overlap in enumerate(min_overlaps[:, m]):
+            thresholdss = []
+            for i in range(len(gt_annos)):
+                tp, fp, fn, similarity, thresholds = compute_statistics_jit(
+                    overlaps[i],
+                    gt_datas_list[i],
+                    dt_datas_list[i],
+                    ignored_gts[i],
+                    ignored_dets[i],
+                    min_overlap=min_overlap,
+                    thresh=0.0,
+                    compute_fp=False,
+                )
+                thresholdss += thresholds.tolist()
+            if not thresholdss:
+                continue  # no tp -> 0 precision and recall
+            # evenly spaced score thresholds from 0 to the max true-positive score, len(thresholds) == num_samples_pts
+            thresholds = np.linspace(0.0, np.max(thresholdss), num_samples_pts)
+            pr = np.zeros([len(thresholds), 4])
+            idx = 0
+            for j, num_part in enumerate(split_parts):
+                gt_datas_part = np.concatenate(gt_datas_list[idx : idx + num_part], 0)
+                dt_datas_part = np.concatenate(dt_datas_list[idx : idx + num_part], 0)
+                ignored_dets_part = np.concatenate(ignored_dets[idx : idx + num_part], 0)
+                ignored_gts_part = np.concatenate(ignored_gts[idx : idx + num_part], 0)
+                fused_compute_statistics(
+                    parted_overlaps[j],
+                    pr,
+                    total_gt_num[idx : idx + num_part],
+                    total_dt_num[idx : idx + num_part],
+                    gt_datas_part,
+                    dt_datas_part,
+                    ignored_gts_part,
+                    ignored_dets_part,
+                    min_overlap=min_overlap,
+                    thresholds=thresholds,
+                )
+                idx += num_part
+
+            for i in range(len(thresholds)):
+                recall[m, k, i] = pr[i, 0] / (pr[i, 0] + pr[i, 2])
+                precision[m, k, i] = pr[i, 0] / (pr[i, 0] + pr[i, 1])
 
     return {
         "recall": recall,
         "precision": precision,
-        "orientation": aos,
     }
 
 
@@ -678,7 +539,7 @@ def do_eval_cut_version(
     dt_annos: list[dict[str, Any]],
     current_classes: list[str],
     min_overlaps: np.ndarray,
-) -> tuple[np.ndarray, np.ndarray]:
+) -> np.ndarray:
     """Evaluates detections with COCO style AP.
 
     Args:
@@ -688,34 +549,19 @@ def do_eval_cut_version(
         min_overlaps (np.ndarray): Overlap ranges.
 
     Returns:
-        Tuple[float, float]: Bounding box and 3D bounding box AP.
+        np.ndarray: 3D bounding box AP.
     """
-
-    def _get_map(prec: np.ndarray) -> np.ndarray:
-        sums = 0
-        for i in range(0, prec.shape[-1], 4):
-            sums = sums + prec[..., i]
-        return sums / 11 * 100
-
-    # min_overlaps: [num_minoverlap, metric, num_class]
-    difficultys = [0, 1, 2]
-    ret = eval_class(gt_annos, dt_annos, current_classes, difficultys, 0, min_overlaps)
-    # ret: [num_class, num_diff, num_minoverlap, num_sample_points]
-    # get 2d bbox map
-    map_bbox = _get_map(ret["precision"])
-
-    # get 3d bbox map
-    ret = eval_class(gt_annos, dt_annos, current_classes, difficultys, 2, min_overlaps)
-    map_3d = _get_map(ret["precision"])
-
-    return map_bbox, map_3d
+    # min_overlaps: [num_minoverlap, num_class]
+    # get 3D bbox mAP
+    ret = eval_class(gt_annos, dt_annos, current_classes, min_overlaps)
+    return np.mean(ret["precision"], axis=2)
 
 
 def get_coco_eval_result(
     gt_annos: list[dict],
     dt_annos: list[dict],
     current_classes: list[str],
-) -> tuple[np.ndarray, np.ndarray]:
+) -> np.ndarray:
     """Evaluates detections with COCO style AP.
 
     Args:
@@ -724,7 +570,7 @@ def get_coco_eval_result(
         current_classes (list[str]): Classes to evaluate.
 
     Returns:
-        Tuple[np.ndarray, np.ndarray]: Bounding box and 3D bounding box AP.
+        np.ndarray: 3D bounding box AP.
     """
 
     def do_coco_style_eval(
@@ -732,7 +578,7 @@ def do_coco_style_eval(
         dt_annos: list[dict],
         current_classes: list[str],
         overlap_ranges: np.ndarray,
-    ) -> tuple[np.ndarray, np.ndarray]:
+    ) -> np.ndarray:
         """Evaluates detections with COCO style AP.
 
         Args:
@@ -742,39 +588,33 @@ def do_coco_style_eval(
             overlap_ranges (np.ndarray): Overlap ranges.
 
         Returns:
-            Tuple[np.ndarray, np.ndarray]: Bounding box and 3D bounding box AP.
+            np.ndarray: 3D bounding box AP.
         """
         min_overlaps = np.zeros([10, *overlap_ranges.shape[1:]])
 
         for i in range(overlap_ranges.shape[1]):
-            for j in range(overlap_ranges.shape[2]):
-                min_overlaps[:, i, j] = np.linspace(*overlap_ranges[:, i, j][:2], 10)
+            min_overlaps[:, i] = np.linspace(*overlap_ranges[:, i], 10)
+
+        map_3d = do_eval_cut_version(gt_annos, dt_annos, current_classes, min_overlaps)
 
-        map_bbox, map_3d = do_eval_cut_version(gt_annos, dt_annos, current_classes, min_overlaps)
+        result_str = ""
 
-        return map_bbox.mean(-1), map_3d.mean(-1)
+        for i, lbl in enumerate(current_classes):
+            result_str += f"\nclass: {lbl}\n" + "-" * len(f"class: {lbl}") + "\n"
+            for j, overlap in enumerate(min_overlaps):
+                result_str += f"AP@IoU={np.round(overlap[i],2)}: {np.round(map_3d[i][j] * 100, 2)}\n"
+            result_str += "\n"
+        logging.log(msg=result_str, level=logging.INFO)
 
-    iou_range = [0.5, 0.95, 10]
+        return map_3d.mean(0)
+
+    iou_range = [0.5, 0.95]
     if not isinstance(current_classes, (list, tuple)):
         current_classes = [current_classes]
 
-    overlap_ranges = np.zeros([3, 3, len(current_classes)])
+    overlap_ranges = np.zeros([2, len(current_classes)])
     for i in range(len(current_classes)):
         # iou from 0.5 to 0.95
-        overlap_ranges[:, :, i] = np.array(iou_range)[:, np.newaxis]
-    result = ""
-    # check whether alpha is valid
-    map_bbox, map_3d = do_coco_style_eval(gt_annos, dt_annos, current_classes, overlap_ranges)
-
-    for j, curcls in enumerate(current_classes):
-        # map threshold array: [num_minoverlap, metric, class]
-        # map result: [num_class, num_diff, num_minoverlap]
-        o_range = np.array(iou_range)[[0, 2, 1]]
-        o_range[1] = (o_range[2] - o_range[0]) / (o_range[1] - 1)
-        result += f"{curcls} " + "coco AP@{:.2f}:{:.2f}:{:.2f}:\n".format(*o_range)
-        result += f"bbox AP:{map_bbox[j, 0]:.2f}, {map_bbox[j, 1]:.2f}, {map_bbox[j, 2]:.2f}\n"
-        result += f"3d   AP:{map_3d[j, 0]:.2f}, {map_3d[j, 1]:.2f}, {map_3d[j, 2]:.2f}\n"
-
-    print("\n COCO style evaluation results: \n", result)
-
-    return map_bbox, map_3d
+        overlap_ranges[:, i] = np.array(iou_range)
+
+    return do_coco_style_eval(gt_annos, dt_annos, current_classes, overlap_ranges)
diff --git a/src/otx/core/model/detection_3d.py b/src/otx/core/model/detection_3d.py
index fce245e79ac..9c6cacda6c5 100644
--- a/src/otx/core/model/detection_3d.py
+++ b/src/otx/core/model/detection_3d.py
@@ -80,6 +80,73 @@ def _export_parameters(self) -> TaskLevelExportParameters:
             task_type="3d_detection",
         )
 
+    def _customize_inputs(
+        self,
+        entity: Det3DBatchDataEntity,
+    ) -> dict[str, Any]:
+        """Repack a batch entity into the keyword arguments handed to the model."""
+        # Original image shapes, placed on the same device as the image batch.
+        ori_shapes = np.array([info.ori_shape for info in entity.imgs_info])
+        img_sizes = torch.from_numpy(ori_shapes).to(device=entity.images.device)
+
+        # One dict per sample, each holding that sample's entry of every annotation field.
+        target_keys = ["labels", "boxes", "depth", "size_3d", "heading_angle", "boxes_3d"]
+        targets = [
+            {name: getattr(entity, name)[idx] for name in target_keys}
+            for idx in range(len(entity.imgs_info))
+        ]
+
+        return {
+            "images": entity.images,
+            "calibs": torch.cat([calib.unsqueeze(0) for calib in entity.calib_matrix], dim=0),
+            "targets": targets,
+            "img_sizes": img_sizes,
+            "mode": "loss" if self.training else "predict",
+        }
+
+    def _customize_outputs(
+        self,
+        outputs: dict[str, torch.Tensor],
+        inputs: Det3DBatchDataEntity,
+    ) -> Det3DBatchPredEntity | OTXBatchLossEntity:
+        """Return a loss entity while training, otherwise a batch prediction entity."""
+        if self.training:
+            if not isinstance(outputs, dict):
+                raise TypeError(outputs)
+            losses = OTXBatchLossEntity()
+            for k, v in outputs.items():
+                if isinstance(v, list):
+                    losses[k] = sum(v)
+                elif isinstance(v, torch.Tensor):
+                    losses[k] = v
+                else:
+                    # Must be an f-string so the offending type is interpolated into the message.
+                    msg = f"Loss output should be list or torch.tensor but got {type(v)}"
+                    raise TypeError(msg)
+            return losses
+
+        labels, scores, size_3d, heading_angle, boxes_3d, depth = self.extract_dets_from_outputs(outputs)
+        # bbox 2d decoding
+        boxes_2d = box_cxcylrtb_to_xyxy(boxes_3d)
+        xywh_2d = box_convert(boxes_2d, "xyxy", "cxcywh")
+        # size 2d decoding: keep the last two components (w, h per the "cxcywh" format string)
+        size_2d = xywh_2d[:, :, 2:4]
+        return Det3DBatchPredEntity(
+            batch_size=inputs.batch_size,
+            images=inputs.images,
+            imgs_info=inputs.imgs_info,
+            calib_matrix=inputs.calib_matrix,
+            boxes=boxes_2d,
+            labels=labels,
+            boxes_3d=boxes_3d,
+            size_2d=size_2d,
+            size_3d=size_3d,
+            depth=depth,
+            heading_angle=heading_angle,
+            scores=scores,
+            original_kitti_format=[None],
+        )
+
     def _convert_pred_entity_to_compute_metric(
         self,
         preds: Det3DBatchPredEntity,
diff --git a/src/otx/recipe/_base_/data/object_detection_3d.yaml b/src/otx/recipe/_base_/data/object_detection_3d.yaml
index a7c773f1bcf..708c73b6750 100644
--- a/src/otx/recipe/_base_/data/object_detection_3d.yaml
+++ b/src/otx/recipe/_base_/data/object_detection_3d.yaml
@@ -12,7 +12,7 @@ train_subset:
   subset_name: train
   transform_lib_type: TORCHVISION
   batch_size: 8
-  num_workers: 4
+  num_workers: 2
   to_tv_image: false
   transforms:
     - class_path: torchvision.transforms.v2.Normalize
@@ -27,7 +27,7 @@ val_subset:
   subset_name: val
   transform_lib_type: TORCHVISION
   batch_size: 16
-  num_workers: 4
+  num_workers: 2
   to_tv_image: false
   transforms:
     - class_path: torchvision.transforms.v2.Normalize
@@ -41,7 +41,7 @@ test_subset:
   subset_name: test
   transform_lib_type: TORCHVISION
   batch_size: 16
-  num_workers: 4
+  num_workers: 2
   to_tv_image: false
   transforms:
     - class_path: torchvision.transforms.v2.Normalize
diff --git a/src/otx/recipe/object_detection_3d/monodetr3d.yaml b/src/otx/recipe/object_detection_3d/monodetr3d.yaml
index 032c71ffbf8..ec5aaa005eb 100644
--- a/src/otx/recipe/object_detection_3d/monodetr3d.yaml
+++ b/src/otx/recipe/object_detection_3d/monodetr3d.yaml
@@ -20,13 +20,13 @@ model:
         mode: max
         factor: 0.1
         patience: 13
-        monitor: val/mAP_bbox_2d
+        monitor: val/AP_2d@0.5
 
 engine:
   task: OBJECT_DETECTION_3D
   device: auto
 
-callback_monitor: val/mAP_bbox_3d
+callback_monitor: val/AP_3d@0.5
 
 data: ../_base_/data/object_detection_3d.yaml
 
diff --git a/tests/integration/cli/test_export_inference.py b/tests/integration/cli/test_export_inference.py
index 2e556210165..1d455616c4f 100644
--- a/tests/integration/cli/test_export_inference.py
+++ b/tests/integration/cli/test_export_inference.py
@@ -49,7 +49,7 @@ def fxt_local_seed() -> int:
     "zero_shot_visual_prompting": "test/f1-score",
     "action_classification": "test/accuracy",
     "keypoint_detection": "test/PCK",
-    "object_detection_3d": "test/mAP_bbox_3d",
+    "object_detection_3d": "test/AP_3d@0.5",
 }
 
 
diff --git a/tests/perf/test_object_detection_3d.py b/tests/perf/test_object_detection_3d.py
index 2cf247843c9..2fae45c8221 100644
--- a/tests/perf/test_object_detection_3d.py
+++ b/tests/perf/test_object_detection_3d.py
@@ -40,14 +40,14 @@ class TestPerfObjectDetection3D(PerfTestBase):
     BENCHMARK_CRITERIA = [  # noqa: RUF012
         Benchmark.Criterion(name="train/epoch", summary="max", compare="<", margin=0.1),
         Benchmark.Criterion(name="train/e2e_time", summary="max", compare="<", margin=0.1),
-        Benchmark.Criterion(name="val/mAP_bbox_3d", summary="max", compare=">", margin=0.05),
-        Benchmark.Criterion(name="val/mAP_bbox_2d", summary="max", compare=">", margin=0.1),
-        Benchmark.Criterion(name="test/mAP_bbox_3d", summary="max", compare=">", margin=0.05),
-        Benchmark.Criterion(name="test/mAP_bbox_2d", summary="max", compare=">", margin=0.1),
-        Benchmark.Criterion(name="export/mAP_bbox_3d", summary="max", compare=">", margin=0.05),
-        Benchmark.Criterion(name="export/mAP_bbox_2d", summary="max", compare=">", margin=0.1),
-        Benchmark.Criterion(name="optimize/mAP_bbox_3d", summary="max", compare=">", margin=0.05),
-        Benchmark.Criterion(name="optimize/mAP_bbox_2d", summary="max", compare=">", margin=0.1),
+        Benchmark.Criterion(name="val/AP_3d@0.5", summary="max", compare=">", margin=0.05),
+        Benchmark.Criterion(name="val/AP_2d@0.5", summary="max", compare=">", margin=0.1),
+        Benchmark.Criterion(name="test/AP_3d@0.5", summary="max", compare=">", margin=0.05),
+        Benchmark.Criterion(name="test/AP_2d@0.5", summary="max", compare=">", margin=0.1),
+        Benchmark.Criterion(name="export/AP_3d@0.5", summary="max", compare=">", margin=0.05),
+        Benchmark.Criterion(name="export/AP_2d@0.5", summary="max", compare=">", margin=0.1),
+        Benchmark.Criterion(name="optimize/AP_3d@0.5", summary="max", compare=">", margin=0.05),
+        Benchmark.Criterion(name="optimize/AP_2d@0.5", summary="max", compare=">", margin=0.1),
         Benchmark.Criterion(name="train/iter_time", summary="mean", compare="<", margin=0.1),
         Benchmark.Criterion(name="test/iter_time", summary="mean", compare="<", margin=0.1),
         Benchmark.Criterion(name="export/iter_time", summary="mean", compare="<", margin=0.1),