diff --git a/fiftyone/utils/cvat.py b/fiftyone/utils/cvat.py
index 335684ee97..21fde1167c 100644
--- a/fiftyone/utils/cvat.py
+++ b/fiftyone/utils/cvat.py
@@ -1587,6 +1587,31 @@ def from_image_dict(cls, d):
         )
 
 
+class HasCVATBinaryMask(object):
+    """Mixin for CVAT annotations that store RLE format instance masks."""
+
+    @staticmethod
+    def _rle_to_binary_image_mask(rle, mask_width, mask_height):  # run lengths -> uint8 mask
+        mask = np.zeros(mask_width * mask_height, dtype=np.uint8)  # flat buffer, all 0
+        counter = 0  # offset in the flat buffer where the current run starts
+        for i, val in enumerate(rle):
+            if i % 2 == 1:  # odd-indexed runs are the 1-valued ones; even-indexed stay 0
+                mask[counter : counter + val] = 1
+            counter += val  # every run advances the offset, whether it wrote 1s or not
+        return mask.reshape(mask_height, mask_width)  # shape is (mask_height, mask_width)
+
+    @staticmethod
+    def _mask_to_cvat_rle(binary_mask):  # mask array -> list of consecutive run lengths
+        counts = []
+        for i, (value, elements) in enumerate(
+            itertools.groupby(binary_mask.ravel(order="C"))  # runs of equal values, row-major
+        ):
+            if i == 0 and value == 1:
+                counts.append(0)  # mask begins with a 1: emit a zero-length leading 0-run
+            counts.append(len(list(elements)))  # length of this run
+        return counts
+
+
 class HasCVATPoints(object):
     """Mixin for CVAT annotations that store a list of ``(x, y)`` pixel
     coordinates.
@@ -5919,6 +5944,9 @@ def _parse_annotation(
             if shape_type == "rectangle":
                 label_type = "detections"
                 label = cvat_shape.to_detection()
+            elif shape_type == "mask":
+                label_type = "detections"
+                label = cvat_shape.to_instance()
             elif shape_type == "polygon":
                 if expected_label_type == "segmentation":
                     # A piece of a segmentation mask
@@ -6430,24 +6458,23 @@ def _create_detection_shapes(
                 if det.has_mask is None:
                     continue
 
-                polygon = det.to_polyline()
-                for points in polygon.points:
-                    if len(points) < 3:
-                        continue  # CVAT polygons must contain >= 3 points
+                if self._server_version >= Version("2.3"):
+                    x, y, _, _ = det.bounding_box
+                    frame_width, frame_height = frame_size
+                    mask_height, mask_width = det.mask.shape
+                    xtl, ytl = round(x * frame_width), round(y * frame_height)
+                    xbr, ybr = xtl + mask_width, ytl + mask_height
 
-                    abs_points = HasCVATPoints._to_abs_points(
-                        points, frame_size
-                    )
-                    flattened_points = list(
-                        itertools.chain.from_iterable(abs_points)
-                    )
+                    # -1 to convert to CVAT indexing
+                    rle = HasCVATBinaryMask._mask_to_cvat_rle(det.mask)
+                    rle.extend([xtl, ytl, xbr - 1, ybr - 1])
 
                     curr_shapes.append(
                         {
-                            "type": "polygon",
+                            "type": "mask",
                             "occluded": is_occluded,
                             "z_order": 0,
-                            "points": flattened_points,
+                            "points": rle,
                             "label_id": class_name,
                             "group": group_id,
                             "frame": frame_id,
@@ -6455,6 +6482,32 @@ def _create_detection_shapes(
                             "attributes": deepcopy(attributes),
                         }
                     )
+                else:
+                    polygon = det.to_polyline()
+                    for points in polygon.points:
+                        if len(points) < 3:
+                            continue  # CVAT polygons must contain >= 3 points
+
+                        abs_points = HasCVATPoints._to_abs_points(
+                            points, frame_size
+                        )
+                        flattened_points = list(
+                            itertools.chain.from_iterable(abs_points)
+                        )
+
+                        curr_shapes.append(
+                            {
+                                "type": "polygon",
+                                "occluded": is_occluded,
+                                "z_order": 0,
+                                "points": flattened_points,
+                                "label_id": class_name,
+                                "group": group_id,
+                                "frame": frame_id,
+                                "source": "manual",
+                                "attributes": deepcopy(attributes),
+                            }
+                        )
 
             if not curr_shapes:
                 continue
@@ -7103,6 +7156,38 @@ def to_detection(self):
         self._set_attributes(label)
         return label
 
+    def to_instance(self):
+        """Converts this shape to a :class:`fiftyone.core.labels.Detection`
+        with instance mask.
+
+        Returns:
+            a :class:`fiftyone.core.labels.Detection`
+        """
+        xtl, ytl, xbr, ybr = self.points[-4:]  # last four entries; presumably box corners in pixels
+        rel = np.array(self.points[:-4], dtype=int)  # everything before them: the RLE run lengths
+        frame_width, frame_height = self.frame_size
+
+        # +1 to convert from CVAT indexing
+        mask_w, mask_h = round(xbr - xtl) + 1, round(ybr - ytl) + 1
+        mask = HasCVATBinaryMask._rle_to_binary_image_mask(
+            rel, mask_height=mask_h, mask_width=mask_w
+        )
+
+        bbox = [  # each coordinate divided by the matching frame dimension
+            xtl / frame_width,
+            ytl / frame_height,
+            (xbr - xtl) / frame_width,  # NOTE(review): mask_w above adds 1, this width does not — confirm
+            (ybr - ytl) / frame_height,  # NOTE(review): same for the height vs. mask_h — confirm
+        ]
+        label = fol.Detection(
+            label=self.label,
+            bounding_box=bbox,
+            index=self.index,
+            mask=mask,
+        )
+        self._set_attributes(label)  # called on the new label before it is returned
+        return label
+
     def to_polyline(self, closed=False, filled=False):
         """Converts this shape to a :class:`fiftyone.core.labels.Polyline`.