From 302cf52855c0b7537cb593d55718b4d7e164154a Mon Sep 17 00:00:00 2001
From: brimoor <brimoor@umich.edu>
Date: Thu, 26 Dec 2024 17:42:47 -0600
Subject: [PATCH] mask type is only supported by CVAT>=2.3

---
 fiftyone/utils/cvat.py | 102 ++++++++++++++++++++++++++---------------
 1 file changed, 65 insertions(+), 37 deletions(-)

diff --git a/fiftyone/utils/cvat.py b/fiftyone/utils/cvat.py
index edc372e28f..21fde1167c 100644
--- a/fiftyone/utils/cvat.py
+++ b/fiftyone/utils/cvat.py
@@ -1587,21 +1587,21 @@ def from_image_dict(cls, d):
         )
 
 
-class HasCVATBinMask:
+class HasCVATBinaryMask(object):
+    """Mixin for CVAT annotations that store RLE format instance masks."""
+
     @staticmethod
-    def rle_to_binary_image_mask(rle, mask_width, mask_height) -> np.ndarray:
+    def _rle_to_binary_image_mask(rle, mask_width, mask_height):
         mask = np.zeros(mask_width * mask_height, dtype=np.uint8)
         counter = 0
-
         for i, val in enumerate(rle):
             if i % 2 == 1:
                 mask[counter : counter + val] = 1
             counter += val
-
         return mask.reshape(mask_height, mask_width)
 
     @staticmethod
-    def mask_to_cvat_rle(binary_mask: np.ndarray) -> np.array:
+    def _mask_to_cvat_rle(binary_mask):
         counts = []
         for i, (value, elements) in enumerate(
             itertools.groupby(binary_mask.ravel(order="C"))
@@ -5946,7 +5946,7 @@ def _parse_annotation(
                 label = cvat_shape.to_detection()
             elif shape_type == "mask":
                 label_type = "detections"
-                label = cvat_shape.to_instance_detection()
+                label = cvat_shape.to_instance()
             elif shape_type == "polygon":
                 if expected_label_type == "segmentation":
                     # A piece of a segmentation mask
@@ -6457,29 +6457,57 @@ def _create_detection_shapes(
             elif label_type in ("instance", "instances"):
                 if det.has_mask is None:
                     continue
-                x, y, _, _ = det.bounding_box
-                frame_width, frame_height = frame_size
-                mask_height, mask_width = det.mask.shape
-                xtl, ytl = round(x * frame_width), round(y * frame_height)
-                xbr, ybr = xtl + mask_width, ytl + mask_height
-
-                rle = HasCVATBinMask.mask_to_cvat_rle(det.mask)
-                rle.extend(  # Necessary as per CVAT API
-                    [xtl, ytl, xbr - 1, ybr - 1]
-                )
-                curr_shapes.append(
-                    {
-                        "type": "mask",
-                        "occluded": is_occluded,
-                        "z_order": 0,
-                        "points": rle,
-                        "label_id": class_name,
-                        "group": group_id,
-                        "frame": frame_id,
-                        "source": "manual",
-                        "attributes": deepcopy(attributes),
-                    }
-                )
+
+                if self._server_version >= Version("2.3"):
+                    x, y, _, _ = det.bounding_box
+                    frame_width, frame_height = frame_size
+                    mask_height, mask_width = det.mask.shape
+                    xtl, ytl = round(x * frame_width), round(y * frame_height)
+                    xbr, ybr = xtl + mask_width, ytl + mask_height
+
+                    # -1 because CVAT's bottom-right corner is inclusive
+                    rle = HasCVATBinaryMask._mask_to_cvat_rle(det.mask)
+                    rle.extend([xtl, ytl, xbr - 1, ybr - 1])
+
+                    curr_shapes.append(
+                        {
+                            "type": "mask",
+                            "occluded": is_occluded,
+                            "z_order": 0,
+                            "points": rle,
+                            "label_id": class_name,
+                            "group": group_id,
+                            "frame": frame_id,
+                            "source": "manual",
+                            "attributes": deepcopy(attributes),
+                        }
+                    )
+                else:
+                    polygon = det.to_polyline()
+                    for points in polygon.points:
+                        if len(points) < 3:
+                            continue  # CVAT polygons must contain >= 3 points
+
+                        abs_points = HasCVATPoints._to_abs_points(
+                            points, frame_size
+                        )
+                        flattened_points = list(
+                            itertools.chain.from_iterable(abs_points)
+                        )
+
+                        curr_shapes.append(
+                            {
+                                "type": "polygon",
+                                "occluded": is_occluded,
+                                "z_order": 0,
+                                "points": flattened_points,
+                                "label_id": class_name,
+                                "group": group_id,
+                                "frame": frame_id,
+                                "source": "manual",
+                                "attributes": deepcopy(attributes),
+                            }
+                        )
 
             if not curr_shapes:
                 continue
@@ -7128,9 +7156,9 @@ def to_detection(self):
         self._set_attributes(label)
         return label
 
-    def to_instance_detection(self):
-        """Converts this shape to a :class:`fiftyone.core.labels.Detection`.
-        Special case where we also have a mask
+    def to_instance(self):
+        """Converts this shape to a :class:`fiftyone.core.labels.Detection`
+        with an instance mask.
 
         Returns:
             a :class:`fiftyone.core.labels.Detection`
@@ -7138,13 +7166,13 @@ def to_instance_detection(self):
         xtl, ytl, xbr, ybr = self.points[-4:]
         rel = np.array(self.points[:-4], dtype=int)
         frame_width, frame_height = self.frame_size
-        mask_w, mask_h = (
-            round(xbr - xtl) + 1,
-            round(ybr - ytl) + 1,
-        )  # We need to add 1 because cvat uses - 1
-        mask = HasCVATBinMask.rle_to_binary_image_mask(
+
+        # +1 to convert from CVAT indexing
+        mask_w, mask_h = round(xbr - xtl) + 1, round(ybr - ytl) + 1
+        mask = HasCVATBinaryMask._rle_to_binary_image_mask(
             rel, mask_height=mask_h, mask_width=mask_w
         )
+
         bbox = [
             xtl / frame_width,
             ytl / frame_height,