From b2e15b1dd0800c897b909f50a212f1c2378c802c Mon Sep 17 00:00:00 2001 From: "charles.ochoa" Date: Thu, 25 Aug 2022 02:23:00 +0000 Subject: [PATCH] feat: add camera datum augmentations --- dgp/annotations/camera_transforms.py | 867 +++++++++++++++++++++++++++ tests/test_camera_transforms.py | 329 ++++++++++ 2 files changed, 1196 insertions(+) create mode 100644 dgp/annotations/camera_transforms.py create mode 100644 tests/test_camera_transforms.py diff --git a/dgp/annotations/camera_transforms.py b/dgp/annotations/camera_transforms.py new file mode 100644 index 00000000..fa3d884a --- /dev/null +++ b/dgp/annotations/camera_transforms.py @@ -0,0 +1,867 @@ +# Transformations for camera datums in DGP Synchronized Scene Format +# Copyright 2021-2022 Woven Planet. All rights reserved. +import logging +from copy import deepcopy +from typing import Any, Dict, List, Optional, Tuple, Union + +import cv2 +import numpy as np +import PIL +from PIL.ImageTransform import AffineTransform + +from dgp.annotations.bounding_box_2d_annotation import ( + BoundingBox2DAnnotationList, +) +from dgp.annotations.bounding_box_3d_annotation import ( + BoundingBox3DAnnotationList, +) +from dgp.annotations.depth_annotation import DenseDepthAnnotation +from dgp.annotations.key_point_2d_annotation import KeyPoint2DAnnotationList +from dgp.annotations.panoptic_segmentation_2d_annotation import ( + PanopticSegmentation2DAnnotation, +) +from dgp.annotations.semantic_segmentation_2d_annotation import ( + SemanticSegmentation2DAnnotation, +) +from dgp.annotations.transforms import BaseTransform +from dgp.utils.pose import Pose +from dgp.utils.structures.bounding_box_2d import BoundingBox2D + +logger = logging.getLogger(__name__) + +# Some opencv operations can lead to deadlocks when using multiprocess.fork instead of spawn +# we disable opencv multithreading to be safe +cv2.setNumThreads(0) + + +def calc_affine_transform( + theta: float, + scale: float, + flip: bool, + shiftx: float, + shifty: float, + shear: float, + img_shape: Union[Tuple[int, int], Tuple[int, int, int]], +) -> np.ndarray: + """Generates a matrix corresponding to an affine transform for the given inputs. + + Parameters + ---------- + + theta: float + Rotation angle in degrees. + scale: float + Scale factor such as .5 of half size or 2.0 for double size. + flip: bool + If true, perform a left right flip. + shiftx: float + Amount in pixels to shift horizontally. + shifty: float + Amount in pixels to shift vertically. + shear: float + Scale factor for image shear. + img_shape: tuple + Tuple corresponding to img shape ie (h,w,3) or (h,w). + + Returns + ------- + + A: np.ndarray + 3x3 matrix that expresses the requested transformations. + """ + h, w = img_shape[:2] + + # Rotate and scale + # TODO: break scale into scale_y and scale_x? + R = cv2.getRotationMatrix2D((w / 2, h / 2), theta, scale) + R = np.vstack([R, np.array([0, 0, 1.0])]) + + # Shift and shear + if shear != 0: + logger.warning('Shear was set to non zero, shear is not well supported by many downstream operations') + + S = np.array([[1.0, shear, shiftx], [0.0, 1.0, shifty], [0.0, 0.0, 1.0]]) + + # Left/Right flip + F = np.eye(3) + if flip: + F[0][0] = -1 + F[0][-1] = w + + # TODO: expose operation order + A = F @ S @ R + + return A + + +def box_crop_affine_transform( + box_xyxy: Tuple[int, int, int, int], + target_shape: Tuple[int, int], +) -> np.ndarray: + """Generates a matrix that crops a rectangular area from an image and resizes it to target shape. 
+ Note, this preserves the aspect ratio in target shape. + + Parameters + ---------- + box_xyxy: list or tuple + Box corners expressed as x1,y1,x2,y2. + target_shape: tuple + Desired image shape after cropping and resizing. + + Returns + ------- + A: np.ndarray + 3x3 matrix that expresses the requested transformation. + """ + # get box center + x1, y1, x2, y2 = box_xyxy + cx, cy = (x1 + x2) / 2, (y1 + y2) / 2 + w, h = x2 - x1, y2 - y1 + + target_aspect_ratio = target_shape[0] / target_shape[1] + + # keep the box height fixed and adjust the width + new_h = h + new_w = h / target_aspect_ratio + scale = target_shape[0] / new_h + + ax, ay = cx - new_w / 2, cy - new_h / 2 + + A = np.array([[scale, 0, -ax * scale], [0, scale, -ay * scale], [0, 0, 1.0]]) + return A + + +def scale_affine_transform(s: float) -> np.ndarray: + """Generates a matrix performs a unfirom scaling. + + Parameters + ---------- + s: float + scale factor + + Returns + ------- + A: np.ndarray + 3x3 matrix that expresses the requested transformation. + """ + return np.array([[s, 0.0, 0.0], [0.0, s, 0.0], [0.0, 0.0, 1]]) + + +def transform_box_2d(box: BoundingBox2D, A: np.ndarray) -> BoundingBox2D: + """Apply an affine transformation to a 2d box annotation. + + Parameters + ---------- + + box: BoundingBox2DAnnotation + Box to transform. + + A: np.ndarray + 3x3 transformation matrix + + Returns + ------- + box: BoundingBox2DAnnotation + Box annotation with updated positions. + """ + # get the corners of all the boxes + x1, y1, x2, y2 = box.ltrb + + points = np.array([[x1, y1, 1], [x1, y2, 1], [x2, y2, 1], [x2, y1, 1]]) + + new_points = (A[:2, :] @ points.T).T + x1, y1 = new_points.min(axis=0) + x2, y2 = new_points.max(axis=0) + # Note these new points could be outside of the image now. + # TODO: expose option to either clip, remove, or keep these boxes + + box.l = x1 + box.t = y1 + box.w = x2 - x1 + box.h = y2 - y1 + + return box + + +class AffineCameraTransform(BaseTransform): + """Base transform class for 2d geometric camera transformations. + This serves as a base to implement 2d image transforms such as scaling, rotation, left-right flips etc + as affine transforms. Doing so makes it very easy to apply the same transform to 2d box annotations and + semantic segmentation maps. Additionally by implementing transforms as a matrix multiplies, multiple transforms + can be implemented with a single multiply/remap without losing information along the image borders. + """ + def __init__( + self, + A: Optional[np.ndarray] = None, + shape: Optional[Union[Tuple[int, int], Tuple[int, int, int]]] = None, + fix_skew: bool = True, + ) -> None: + """Implements an affine transform to camera datum. + This operates on DGP camera datums (OrderedDict) and returns Camera datums. + + Parameters + ---------- + A: np.ndarray + 3x3 affine transformation matrix + + shape: tuple + Desired image shape after applying transformation + + fix_skew: bool + If true, attempt to remove skew from the operations to comply with camera classes + that do not model it. If using this, you are not guaranteed to be able to recover the inverse + operation by inverting the transformation matrix. + """ + + self.A = A + self.shape = shape + self.fix_skew = fix_skew + + def _calc_A( + self, + input_shape: Union[Tuple[int, int], Tuple[int, int, int]], # pylint : ignore unused + ) -> np.ndarray: + """Calculates transformation matrix as a function of input image shape. + + Parameters + ---------- + input_shape: tuple + Shape of input camera's image, i.e, (h,w,3) or (h,w). 
+
+        Returns
+        -------
+        A: np.ndarray
+            3x3 affine transformation matrix.
+        """
+        return self.A
+
+    def _calc_shape(
+        self, input_shape: Union[Tuple[int, int], Tuple[int, int, int]]
+    ) -> Union[Tuple[int, int], Tuple[int, int, int]]:
+        """Calculates the new shape of the image after transformation as a function of input image shape.
+
+        Parameters
+        ----------
+        input_shape: tuple
+            Shape of input camera's image, i.e., (h,w,3) or (h,w).
+
+        Returns
+        -------
+        shape: tuple
+            New image shape.
+        """
+        return self.shape
+
+    def transform_image(
+        self,
+        img: Union[np.ndarray, PIL.Image.Image],
+        mode: int = cv2.INTER_LINEAR,
+    ) -> Union[np.ndarray, PIL.Image.Image]:
+        """Applies the transformation to an image.
+
+        Parameters
+        ----------
+        img: np.ndarray or PIL.Image.Image
+            Input image expressed as a numpy array of type np.uint8 or np.float32, or a PIL Image.
+
+        mode: int
+            Opencv flag for interpolation mode. When used on masks or label images,
+            this should be set to cv2.INTER_NEAREST, otherwise it defaults to bilinear interpolation.
+            NOTE: when the image is a PIL Image, the corresponding PIL flag is substituted automatically.
+
+        Returns
+        -------
+        new_img: np.ndarray or PIL.Image.Image
+            New transformed image.
+        """
+        h, w = self.shape[:2]
+
+        if isinstance(img, PIL.Image.Image):
+            # Note: PIL transform takes the inverse
+            m = np.linalg.inv(self.A)[:2, :].flatten()
+
+            tx = AffineTransform(m)
+            if mode == cv2.INTER_LINEAR:
+                mode = PIL.Image.BILINEAR
+            elif mode == cv2.INTER_NEAREST:
+                mode = PIL.Image.NEAREST
+
+            new_img = tx.transform((w, h), img, resample=mode)
+        else:
+            new_img = cv2.warpAffine(img, self.A[:2, :], (int(w), int(h)), flags=mode)
+
+        return new_img
+
+    def transform_camera(
+        self,
+        cam_datum: Dict[str, Any],
+    ) -> Tuple[np.ndarray, Pose, Pose]:
+        """Transform the camera intrinsics and extrinsics.
+
+        Parameters
+        ----------
+        cam_datum: Dict[str, Any]
+            A DGP camera datum.
+
+        Returns
+        -------
+        mtxR: np.ndarray
+            New camera intrinsics.
+        new_pose: Pose
+            The new pose (sensor to global).
+        new_ext: Pose
+            The new extrinsics (sensor to local).
+        """
+
+        # Transform the camera matrix
+        h, w = self.shape[:2]
+
+        # Flipping leads to the wrong rotation below, so when a flip is present, we unflip, do everything, and re-flip
+        flip_mat = np.eye(3)
+        flip = False
+        if self.A[0, 0] < 0:
+            flip = True
+            F = np.eye(3)
+            F[0][0] = -1
+            F[0][-1] = w
+            flip_mat = F
+
+        if self.A[1, 1] < 0:
+            flip = True
+            F2 = np.eye(3)
+            F2[1][1] = -1
+            F2[1][-1] = h
+            flip_mat = F2 @ flip_mat
+
+        A = np.linalg.inv(flip_mat) @ self.A
+
+        mtxR = A @ cam_datum['intrinsics']
+
+        # NOTE: some opencv functions expect the camera matrix to be upper triangular, so we have to stuff any rotation
+        # into the extrinsics. We decompose the new camera matrix into an upper triangular camera matrix and a rotation,
+        # and then bake that rotation into the extrinsics matrix.
+        _, mtxR, mtxQ, _, _, _ = cv2.RQDecomp3x3(mtxR)
+
+        # The decomposition may negate the last column, while many applications assume the [2,2] element is 1.0
+        if mtxR[2, 2] < 0:
+            fix = np.eye(3)
+            fix[0, 0] = -1
+            fix[2, 2] = -1
+            mtxR = mtxR @ fix
+            mtxQ = fix.T @ mtxQ  # fix.T == inv(fix)
+
+        # Additionally, the DGP camera class does not model skew, which limits this approach;
+        # we can fix/force skew to be zero at the cost of (potentially) changing our rotation.
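+        # In other words, insert inv(shear_mat) @ shear_mat between the intrinsics and the rotation:
+        #   mtxR @ mtxQ == (mtxR @ inv(shear_mat)) @ (shear_mat @ mtxQ)
+        # so the overall projection is unchanged while the new mtxR has zero skew. shear_mat @ mtxQ is
+        # generally no longer a true rotation, which is why it is re-orthogonalized via SVD below.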
+ if self.fix_skew: + shear_mat = np.eye(3) + shear_mat[0, 1] = mtxR[0, 1] / mtxR[0, 0] # s/fx + mtxR = mtxR @ np.linalg.inv(shear_mat) + mtxQ = shear_mat @ mtxQ + + # This shear_mat may have ruined our rotation (rounding errors or invalid rotation) so re-orthogonalize + u, s, v = np.linalg.svd(mtxQ) + mtxQ = u @ v + + # Flip back + if flip: + mtxR = flip_mat @ mtxR + + new_R = (mtxQ @ cam_datum['pose'].rotation_matrix.T).T + new_pose = Pose().from_rotation_translation(new_R, cam_datum['pose'].tvec) + + new_R = (mtxQ @ cam_datum['extrinsics'].rotation_matrix.T).T + new_ext = Pose().from_rotation_translation(new_R, cam_datum['extrinsics'].tvec) + + # NOTE: distortion parameters are typically independent, so we do not modify them here + + return mtxR, new_pose, new_ext + + def transform_detections_3d( + self, boxes: BoundingBox3DAnnotationList, pose_correction: Pose + ) -> BoundingBox3DAnnotationList: + """Applies trasformation matrix to 3d cuboids + + Parameters + ---------- + boxes: BoundingBox3DAnnotationList + The 3d cuboids for this camera + + pose_correction: Pose + Pose used to correct and change in extrinsics due to rotations + + Returns + ------- + boxes: BoundingBox3DAnnotationList + """ + # Pose correction is only relevant for boxes in camera frame + boxes = deepcopy(boxes) + for b in boxes: + b._pose = pose_correction * b.pose + return boxes + + def transform_detections_2d( + self, + boxes: BoundingBox2DAnnotationList, + ) -> BoundingBox2DAnnotationList: + """Applies transformation matrix to list of bounding boxes: + + Parameters + ---------- + boxes: BoundingBox2DAnnotationList + List of bounding box annotations. + + Returns + ------- + new_boxes: BoundingBox2DAnnotationList + List of transformed bounding box annotations. + """ + new_boxes = deepcopy(boxes) + for box in new_boxes: + box = transform_box_2d(box, self.A) + + return new_boxes + + def transform_semantic_segmentation_2d( + self, + semantic_segmentation_2d: SemanticSegmentation2DAnnotation, + ) -> SemanticSegmentation2DAnnotation: + """Applies transformation to semantic segmentation annotation. + + Parameters + ---------- + semantic_segmentation_2d: SemanticSegmentation2DAnnotation + Semantic segmentation input + + Returns + ------- + new_sem_seg: SemanticSegmentation2DAnnotation + New transformed semantic segmentation annotation. + """ + new_sem_seg = deepcopy(semantic_segmentation_2d) + new_sem_seg._segmentation_image = self.transform_image(new_sem_seg._segmentation_image, mode=cv2.INTER_NEAREST) + return new_sem_seg + + def transform_depth( + self, + depth: DenseDepthAnnotation, + ) -> DenseDepthAnnotation: + """Applies transformation to depth annotation. + + Parameters + ---------- + depth: DenseDepthAnnotation + Depth input + + Returns + ------- + new_depth: DenseDepthAnnotation + New transformed depth annotation. + """ + new_depth = deepcopy(depth) + new_depth._depth = self.transform_image(new_depth._depth, mode=cv2.INTER_LINEAR) + # TODO: do we want to scale depth values by the new focal length? + return new_depth + + def transform_panoptic_segmentation_2d( + self, + panoptic_seg: PanopticSegmentation2DAnnotation, + ) -> PanopticSegmentation2DAnnotation: + """Applies transformation to panoptic segmentation annotation. + + Parameters + ---------- + panoptic_seg: PanopticSegmentation2DAnnotation + Panoptic segmentation input + + Returns + ------- + new_panoptic_seg: PanopticSegmentation2DAnnotation + New transformed panoptic segmentation annotation. 
+        """
+        if panoptic_seg is None:
+            return None
+
+        new_panoptic_seg = deepcopy(panoptic_seg)
+        for panoptic in new_panoptic_seg:
+            panoptic._bitmask = self.transform_image(panoptic.bitmask.astype(np.float32),
+                                                     mode=cv2.INTER_NEAREST).astype(bool)
+
+        # TODO: how to treat zero mass results? i.e., if the bitmask is all zeros after a transformation,
+        # should we delete this mask? Currently we just keep the mask.
+
+        return new_panoptic_seg
+
+    def transform_mask_2d(
+        self,
+        mask: np.ndarray,
+    ) -> np.ndarray:
+        """Transform an image mask.
+
+        Parameters
+        ----------
+        mask: np.ndarray
+            A boolean mask of the same shape as the image that denotes valid pixels.
+
+        Returns
+        -------
+        new_mask: np.ndarray
+            Transformed mask.
+        """
+
+        if mask is None:
+            return None
+
+        new_mask = self.transform_image(mask.astype(np.float32), mode=cv2.INTER_NEAREST).astype(bool)
+
+        return new_mask
+
+    def transform_keypoints_2d(
+        self,
+        keypoints: KeyPoint2DAnnotationList,
+    ) -> KeyPoint2DAnnotationList:
+        """Applies the transformation matrix to a list of keypoints.
+
+        Parameters
+        ----------
+        keypoints: KeyPoint2DAnnotationList
+            List of keypoint annotations.
+
+        Returns
+        -------
+        new_keypoints: KeyPoint2DAnnotationList
+            List of transformed keypoint annotations.
+        """
+        logger.warning('Keypoint transforms have not been tested yet, please use with caution')
+        # TODO: test keypoint 2d
+        if keypoints is None:
+            return None
+
+        new_keypoints = deepcopy(keypoints)
+        for kp in new_keypoints:
+            x, y = kp.point
+            new_pt = self.A[:2, :] @ np.array([x, y, 1])
+            kp.point = np.float32([new_pt[0], new_pt[1]])
+            # TODO: test this and vectorize it, probably don't need a loop here
+
+        return new_keypoints
+
+    def transform_datum(self, cam_datum: Dict[str, Any]) -> Dict[str, Any]:  # pylint: disable=arguments-renamed
+        """Applies the transformation to a camera datum.
+
+        Parameters
+        ----------
+        cam_datum: OrderedDict
+            Camera datum to transform.
+
+        Returns
+        -------
+        new_datum: OrderedDict
+            Camera datum with transformed image and annotations.
+        """
+
+        assert cam_datum['datum_type'] == 'image', 'expected an image datum_type'
+
+        assert 'rgb' in cam_datum, 'datum should contain an image'
+
+        new_datum = cam_datum.copy()
+
+        # We support PIL and raw numpy arrays
+        if isinstance(new_datum['rgb'], PIL.Image.Image):
+            input_shape = new_datum['rgb'].size[::-1]
+        else:
+            input_shape = new_datum['rgb'].shape
+
+        # NOTE: we call this here since in general the transformation matrix can depend on the input shape
+        self.A = self._calc_A(input_shape)
+        self.shape = self._calc_shape(input_shape)
+        if self.shape is None:
+            self.shape = input_shape
+
+        new_datum['rgb'] = self.transform_image(new_datum['rgb'])
+
+        mtx, pose, ext = self.transform_camera(new_datum)
+        if np.abs(mtx[0, 1]) > 1e-3:
+            logger.warning('Input camera matrix had skew, this may not work with downstream applications!')
+
+        new_datum['intrinsics'] = mtx
+        new_datum['pose'] = pose
+        new_datum['extrinsics'] = ext
+
+        # This is not actually part of DGP, but if you define a mask for the image, we can keep track of pixels
+        # that fall outside that mask as a result of these operations.
+        if 'rgb_mask' in new_datum:
+            rgb_mask = new_datum['rgb_mask']
+            rgb_mask = self.transform_mask_2d(rgb_mask)
+            new_datum['rgb_mask'] = rgb_mask
+
+        if 'bounding_box_3d' in new_datum:
+            # Note: the DGP camera class does not model the full camera matrix, just focal length and center;
+            # if using the DGP camera class, do not use transformations that add a skew!
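+            # pose_correction = new_extrinsics^-1 * old_extrinsics maps points from the original camera
+            # frame into the new (possibly rotated) camera frame, so cuboids annotated in the camera
+            # frame stay consistent with the updated extrinsics computed above.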
+ boxes = new_datum['bounding_box_3d'] + pose_correction = new_datum['extrinsics'].inverse() * cam_datum['extrinsics'] + boxes = self.transform_detections_3d(boxes, pose_correction) + new_datum['bounding_box_3d'] = boxes + + if 'bounding_box_2d' in new_datum: + boxes = new_datum['bounding_box_2d'] + boxes = self.transform_detections_2d(boxes, ) + new_datum['bounding_box_2d'] = boxes + # TODO: remove zero w and h boxes + # TODO: clip to image size + # TODO: maybe convert back to int if input is int? + + if 'semantic_segmentation_2d' in new_datum: + sem_seg = new_datum['semantic_segmentation_2d'] + sem_seg = self.transform_semantic_segmentation_2d(sem_seg, ) + new_datum['semantic_segmentation_2d'] = sem_seg + + if 'depth' in new_datum: + depth = new_datum['depth'] + depth = self.transform_depth(depth, ) + new_datum['depth'] = depth + + if 'key_point_2d' in new_datum: + keypoints = new_datum['key_point_2d'] + keypoints = self.transform_keypoints_2d(keypoints, ) + new_datum['key_point_2d'] = keypoints + + if 'instance_segmentation_2d' in new_datum: + instance_seg = new_datum['instance_segmentation_2d'] + instance_seg = self.transform_panoptic_segmentation_2d(instance_seg, ) + new_datum['instance_segmentation_2d'] = instance_seg + + # TODO: verify behavior when Nonetype is passed for each annotation + # TODO: line 2d/3d annotations + # TODO: polygon annotation + # TODO: flow 2d + # TODO: generic feature type + + return new_datum + + +class ScaleAffineTransform(AffineCameraTransform): + def __init__(self, s: float) -> None: + """Scale a camera datum. + + Parameters + ---------- + s: float + Scale factor. + """ + self.s = s + self.A = scale_affine_transform(s) + super().__init__(A=self.A) + + def _calc_shape( + self, + input_shape: Union[Tuple[int, int], Tuple[int, int, int]], + ) -> Tuple[int, int]: + h, w = input_shape[:2] + shape = (int(h * self.s), int(w * self.s)) + return shape + + +class ScaleHeightTransform(AffineCameraTransform): + def __init__(self, h: int) -> None: + """Scale a camera datum to a specific image height. + + Parameters + ---------- + h: float + new height. + """ + assert h > 0 + self.h = h + super().__init__() + + def _calc_A( + self, + input_shape: Union[Tuple[int, int], Tuple[int, int, int]], + ) -> np.ndarray: + """Calculate transformation matrix. See AffineCameraTransform._calc_A""" + h, _ = input_shape[:2] + s = self.h / h + return scale_affine_transform(s) + + def _calc_shape( + self, + input_shape: Union[Tuple[int, int], Tuple[int, int, int]], + ) -> Tuple[int, int]: + h, w = input_shape[:2] + s = self.h / h + shape = (self.h, int(w * s)) + return shape + + +class CropScaleTransform(AffineCameraTransform): + def __init__(self, target_shape: Tuple[int, int], fix_h: bool = True) -> None: + """Extracts a crop from the center of an image and resizes to target_shape. + This attempts to match the aspect ratio of target_shape and does not stretch the crop. + + Parameters + ---------- + target_shape: tuple + Shape after transformation. + fix_h: bool, default=True + If True, fixes the height and modifies the width to maintain the desired aspect ratio. + Otherwise fixes the width and moifies the height. + """ + self.shape = target_shape[:2] + self.fix_h = fix_h + super().__init__(shape=self.shape) + + def _calc_A( + self, + input_shape: Union[Tuple[int, int], Tuple[int, int, int]], + ) -> np.ndarray: + """Calculate transformation matrix. 
See AffineCameraTransform._calc_A."""
+
+        # Get the center crop box
+        h, w = input_shape[:2]
+        H, W = self.shape[:2]
+        aspect_ratio = H / W
+        if self.fix_h:  # leaves h unchanged, crops in x
+            newx = w - h / aspect_ratio
+            box = [newx / 2, 0, w - newx / 2, h]
+        else:
+            newy = h - w * aspect_ratio
+            box = [0, newy / 2, w, h - newy / 2]
+
+        return box_crop_affine_transform(box, self.shape)
+
+
+class CompositeAffineTransform(AffineCameraTransform):
+    def __init__(self, transforms: List[AffineCameraTransform]) -> None:
+        """Squashes multiple affine transformations into a single transformation.
+
+        Parameters
+        ----------
+        transforms: list of AffineCameraTransform
+            List of transformations to be executed from right to left.
+        """
+        self.transforms = transforms
+        super().__init__()
+
+    def _calc_A(
+        self,
+        input_shape: Union[Tuple[int, int], Tuple[int, int, int]],
+    ) -> np.ndarray:
+        """Calculate transformation matrix. See AffineCameraTransform._calc_A."""
+        A = np.eye(3)
+        for tr in reversed(self.transforms):
+            A = tr._calc_A(input_shape) @ A
+            input_shape = tr._calc_shape(input_shape)
+        return A
+
+    def _calc_shape(
+        self,
+        input_shape: Union[Tuple[int, int], Tuple[int, int, int]],
+    ) -> Tuple[int, int]:
+        """Calculate output shape. See AffineCameraTransform._calc_shape."""
+        for tr in reversed(self.transforms):
+            input_shape = tr._calc_shape(input_shape)
+        return input_shape
+
+
+class RandomCropTransform(AffineCameraTransform):
+    def __init__(
+        self,
+        crop_shape: Tuple[int, int],
+    ) -> None:
+        """Extracts random crops of crop_shape.
+
+        Parameters
+        ----------
+        crop_shape: tuple
+            Shape after transformation.
+        """
+        self.shape = crop_shape
+        super().__init__(shape=self.shape)
+
+    def _calc_A(
+        self,
+        input_shape: Union[Tuple[int, int], Tuple[int, int, int]],
+    ) -> np.ndarray:
+        """Calculate transformation matrix. See AffineCameraTransform._calc_A."""
+        # Sample a random crop of size self.shape
+        h, w = input_shape[:2]
+        h_target, w_target = self.shape
+
+        xc = w / 2
+        if w - w_target // 2 > w_target // 2:
+            xc = np.random.randint(w_target // 2, w - w_target // 2)
+
+        yc = h / 2
+        if h - h_target // 2 > h_target // 2:
+            yc = np.random.randint(h_target // 2, h - h_target // 2)
+
+        box = [xc - w_target // 2, yc - h_target // 2, xc + w_target // 2, yc + h_target // 2]
+        return box_crop_affine_transform(box, self.shape)
+
+
+class RandomAffineTransform(AffineCameraTransform):
+    def __init__(self, args: Optional[Dict[str, Dict[str, Union[float, bool]]]] = None) -> None:
+        """Applies a random affine transformation.
+
+        Parameters
+        ----------
+        args: dict of dict
+            Dictionary of augmentation values. Augmentation values are dictionaries with keys
+            'center', 'low', 'high' and 'p'. Augmentation values are sampled from a uniform
+            distribution [center+low, center+high] with probability p, otherwise the central value is returned.
+        """
+        if args is None:
+            args = dict()
+
+        for k in ['theta', 'scale', 'flip', 'shiftx', 'shifty', 'shear']:
+            center = 0.0
+            if k == 'scale':
+                center = 1.0
+
+            if k not in args:
+                args[k] = {'p': 0, 'center': center, 'low': 0, 'high': 0}
+
+        self.args = args
+        super().__init__()
+
+    def _calc_A(
+        self,
+        input_shape: Union[Tuple[int, int], Tuple[int, int, int]],
+    ) -> np.ndarray:
+        """Calculate transformation matrix. See AffineCameraTransform._calc_A."""
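+        # Example args (illustrative only): rotate by up to +/-10 degrees half the time and
+        # left/right flip half the time; unspecified keys default to their central (no-op) values:
+        #   RandomAffineTransform({'theta': {'p': 0.5, 'center': 0.0, 'low': -10, 'high': 10},
+        #                          'flip': {'p': 0.5, 'center': 0.0, 'low': 0.0, 'high': 0.0}})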
+        def maybe_sample(p, center, low, high):
+            if np.random.rand() <= p:
+                return center + ((high - low) * np.random.rand() + low)
+            return center
+
+        theta_args = self.args['theta']
+        theta = maybe_sample(**theta_args)
+
+        scale_args = self.args['scale']
+        scale = maybe_sample(**scale_args)
+
+        flip_args = self.args['flip']
+        flip = np.random.rand() < flip_args['p']
+
+        shiftx_args = self.args['shiftx']
+        shiftx = maybe_sample(**shiftx_args)
+
+        shifty_args = self.args['shifty']
+        shifty = maybe_sample(**shifty_args)
+
+        shear_args = self.args['shear']
+        shear = maybe_sample(**shear_args)
+
+        return calc_affine_transform(theta, scale, flip, shiftx, shifty, shear, input_shape[:2])
+
+    def _calc_shape(
+        self,
+        input_shape: Union[Tuple[int, int], Tuple[int, int, int]],
+    ) -> Tuple[int, int]:
+        """Calculate output shape. See AffineCameraTransform._calc_shape."""
+        # NOTE: this operation intentionally does not modify the output shape. If we zoom out, there will be black borders.
+        return input_shape[:2]
diff --git a/tests/test_camera_transforms.py b/tests/test_camera_transforms.py
new file mode 100644
index 00000000..3199295a
--- /dev/null
+++ b/tests/test_camera_transforms.py
@@ -0,0 +1,329 @@
+# Copyright 2021-2022 Woven Planet. All rights reserved.
+import os
+import unittest
+
+import cv2
+import numpy as np
+
+from dgp.annotations.camera_transforms import (
+    AffineCameraTransform,
+    CompositeAffineTransform,
+    CropScaleTransform,
+    ScaleAffineTransform,
+    ScaleHeightTransform,
+    calc_affine_transform,
+)
+from dgp.datasets.synchronized_dataset import SynchronizedSceneDataset
+from dgp.utils.visualization_utils import visualize_cameras
+from tests import TEST_DATA_DIR
+
+# Flag to render test images
+DEBUG = False
+
+
+def assert_almost_mostly_equal(datum1, datum2, valid_region=None):
+    """Test if two camera datums are the same by comparing their annotations and rgb values.
+    Since we intend to use this to test operations that can remove information (like borders when you rotate),
+    we can compare values in a central region by passing valid_region.
+
+    Parameters
+    ----------
+    datum1: dict(str, any)
+        A camera datum.
+
+    datum2: dict(str, any)
+        Another camera datum.
+
+    valid_region: Tuple[float, float, float, float], default = None
+        An (x1, y1, x2, y2) region on which to compare image values. If None, uses the entire image.
+    """
+
+    # Check that we have the same keys
+    keys1 = set(datum1.keys())
+    keys2 = set(datum2.keys())
+    assert len(keys1 - keys2) == 0
+
+    if 'intrinsics' in keys1:
+        assert np.allclose(datum1['intrinsics'], datum2['intrinsics'])
+        # Validate our assumptions about the form of the intrinsics:
+        # Upper triangular
+        assert np.allclose(datum2['intrinsics'], np.triu(datum2['intrinsics']))
+        # z scale is 1
+        assert np.abs(datum2['intrinsics'][2, 2] - 1) < 1e-3
+        # no skew!
Note: this is actually valid, just not supported well + assert np.abs(datum2['intrinsics'][0, 1]) < 1e-3 + + if 'extrinsics' in keys1: + assert np.allclose( + datum1['extrinsics'].matrix, datum2['extrinsics'].matrix, atol=1e-03 + ), f"{datum1['extrinsics'].rotation_matrix}, {datum2['extrinsics'].rotation_matrix}" + + if 'pose' in keys1: + assert np.allclose(datum1['pose'].matrix, datum2['pose'].matrix, atol=1e-3) + + if 'rgb' in keys1: + rgb1 = np.array(datum1['rgb']) + rgb2 = np.array(datum2['rgb']) + + assert rgb1.shape == rgb2.shape + + if valid_region is None: + h, w = rgb1.shape[:2] + x1, y1, x2, y2 = (0, 0, w, h) + else: + x1, y1, x2, y2 = valid_region + + rgb1 = rgb1[y1:y2, x1:x2] + rgb2 = rgb2[y1:y2, x1:x2] + + if DEBUG: + idx = np.random.randint(1000) + cv2.imwrite(f'rgb_{idx}.jpeg', rgb1) + cv2.imwrite(f'rgb_{idx}_2.jpeg', rgb2) + + # We cannot directly compare two images. One image may have been scaled heavily and therefore blurred + # so we compare peak signal to noise. The threshold here is abitrary and set manually + # TODO: get better threshold + assert cv2.PSNR(rgb1, rgb2) >= 30.0 + + if 'bounding_box_2d' in keys1: + # We cannot easily test bounding box 2d. This is because when we transform the corners + # we then replace the box with the smallest axis aligned box that contains those corners. + # Consider for example rotating an image by 45 degrees. The resulting box in the rotated version + # will be much larger than the original box. When we rotate back by -45, the box will yet again + # be bigger. We can however at least test that center of the box has not changed. + boxes1 = datum1['bounding_box_2d'] + boxes2 = datum2['bounding_box_2d'] + + assert len(boxes1) == len(boxes2) + + for box1, box2 in zip(boxes1, boxes2): + x1, y1, x2, y2 = box1.ltrb + center1 = np.array([(x2 + x1) / 2, (y2 + y1) / 2]) + x1, y1, x2, y2 = box2.ltrb + center2 = np.array([(x2 + x1) / 2, (y2 + y1) / 2]) + assert np.allclose(center1, center2, atol=1e-3) + + if 'bounding_box_3d' in keys1: + boxes1 = datum1['bounding_box_3d'] + boxes2 = datum2['bounding_box_3d'] + + assert len(boxes1) == len(boxes2) + + for box1, box2 in zip(boxes1, boxes2): + assert np.allclose(box1.corners, box2.corners, atol=1e-3), f'{box1.corners}, {box2.corners}' + assert box1.class_id == box2.class_id + + if 'rgb' in keys1 and 'bounding_box_3d' in keys1: + # Render the cuboids on the image, check that both images are similar + rgb1 = visualize_cameras( + [datum1], + {i: '' + for i in range(100)}, + None, + )[0] + + rgb2 = visualize_cameras( + [datum2], + {i: '' + for i in range(100)}, + None, + )[0] + + assert rgb1.shape == rgb2.shape + + if valid_region is None: + h, w = rgb1.shape[:2] + x1, y1, x2, y2 = (0, 0, w, h) + else: + x1, y1, x2, y2 = valid_region + + rgb1 = rgb1[y1:y2, x1:x2] + rgb2 = rgb2[y1:y2, x1:x2] + + if DEBUG: + idx = np.random.randint(1000) + cv2.imwrite(f'box_vis_{idx}.jpeg', rgb1) + cv2.imwrite(f'box_vis_{idx}_2.jpeg', rgb2) + + assert cv2.PSNR(rgb1, rgb2) >= 20.0 + + # TODO: test other annotations + + +class TestTransforms(unittest.TestCase): + """Test camera datum transformations""" + DGP_TEST_DATASET_DIR = os.path.join(TEST_DATA_DIR, "dgp") + + def setUp(self): + # Initialize synchronized dataset + scenes_dataset_json = os.path.join(self.DGP_TEST_DATASET_DIR, "test_scene", "scene_dataset_v1.0.json") + self.dataset = SynchronizedSceneDataset( + scenes_dataset_json, + split='train', + datum_names=[ + 'camera_01', + ], + backward_context=0, + requested_annotations=( + "bounding_box_2d", + 
"bounding_box_3d", + ) + ) + + def test_affine_transform(self): + """Test base class by generating a transform, applying it, and then applying the inverse""" + cam_datum = self.dataset[0][0][0] + + # Initial image size. Note: pil size is w,h not h,w + w, h = cam_datum['rgb'].size + A = calc_affine_transform(theta=45, scale=.9, flip=1, shiftx=10, shifty=-20, shear=0, img_shape=(h, w)) + tr = AffineCameraTransform(A=A, shape=(h, w)) + cam_datum2 = tr(cam_datum) + + if DEBUG: + rgb_viz = visualize_cameras( + [cam_datum2], + {i: '' + for i in range(32)}, + None, + ) + cv2.imwrite('affine_test_intermediate.jpeg', rgb_viz[0]) + + # Test round trip. We should be able to (mostly) recover the initial datum + Ainv = np.linalg.inv(tr.A) + target_shape = (h, w) + tr_inv = AffineCameraTransform(A=Ainv, shape=target_shape) + + cam_datum3 = tr_inv(cam_datum2) + + # Due to border issues, only check rgb values at central region + dw = w // 4 + dh = h // 4 + valid_region = (dw, dh, w - dw, h - dh) + + assert_almost_mostly_equal(cam_datum, cam_datum3, valid_region=valid_region) + + def test_scale_transform(self): + """Test scale transform""" + + cam_datum = self.dataset[0][0][0] + + # Initial image size. Note: pil size is w,h not h,w + w, h = cam_datum['rgb'].size + + s = .5 + tr = ScaleAffineTransform(s) + cam_datum2 = tr(cam_datum) + + w2, h2 = cam_datum2['rgb'].size + assert int(w * s) == w2 + assert int(h * s) == h2 + + # Apply the inverse transform and verify everything is the same + tr_inv = ScaleAffineTransform(1 / s) + cam_datum3 = tr_inv(cam_datum2) + + assert_almost_mostly_equal(cam_datum, cam_datum3) + + def test_scale_height_transform(self): + """Test scale by height transform""" + cam_datum = self.dataset[0][0][0] + _, h = cam_datum['rgb'].size + + s = 2 + hs = int(h * s) + tr = ScaleHeightTransform(hs) + cam_datum2 = tr(cam_datum) + + _, h2 = cam_datum2['rgb'].size + assert hs == h2 + + # Apply the inverse transform and verify everything is the same + hs = int(hs * 1 / s) + tr_inv = ScaleHeightTransform(hs) + cam_datum3 = tr_inv(cam_datum2) + + assert_almost_mostly_equal(cam_datum, cam_datum3) + + def test_crop_scale_transform(self): + """Test the crop transform""" + cam_datum = self.dataset[0][0][0] + w, h = cam_datum['rgb'].size + + target_shape = (h // 2, w // 2) + tr = CropScaleTransform(target_shape=target_shape, fix_h=True) + cam_datum2 = tr(cam_datum) + + w2, h2 = cam_datum2['rgb'].size + assert target_shape == (h2, w2) + + # Apply the inverse transform and verify everything is the same + Ainv = np.linalg.inv(tr.A) + target_shape = (h, w) + tr_inv = AffineCameraTransform(A=Ainv, shape=target_shape) + + cam_datum3 = tr_inv(cam_datum2) + + # Get a region that should be unchanged. There is no way for the inverse + # transform to restore the borders we cropped. So we cannot evaluate them + dw = (w - w2) // 2 + 1 + dh = (h - h2) // 2 + 1 + valid_region = (dw, dh, w - dw, h - dh) + + assert_almost_mostly_equal(cam_datum, cam_datum3, valid_region=valid_region) + + def test_composite_transform(self): + """Test that we can compose transforms correctly. We test that we can get the + same datum by applying multiple transformation consecutively vs all at once. 
+        We also test that applying the inverse works correctly."""
+        cam_datum = self.dataset[0][0][0]
+
+        w, h = cam_datum['rgb'].size
+        A1 = calc_affine_transform(theta=15, scale=1, flip=0, shiftx=0, shifty=0, shear=0, img_shape=(h, w))
+        tr1 = AffineCameraTransform(A1, (h, w))  # rotation
+
+        A2 = calc_affine_transform(theta=0, scale=1, flip=1, shiftx=0, shifty=0, shear=0, img_shape=(h, w))
+        tr2 = AffineCameraTransform(A2, (h, w))  # left right flip
+
+        tr_comp = CompositeAffineTransform(transforms=[tr2, tr1], )
+
+        # The chained transform
+        cam_datum2 = tr2(tr1(cam_datum))
+
+        # The composite transform
+        cam_datum3 = tr_comp(cam_datum)
+
+        # Get a valid region in the center. The composite transform might actually preserve some of the border
+        # information that would otherwise be lost in a sequential operation.
+        dw = w // 4
+        dh = h // 4
+        valid_region = (dw, dh, w - dw, h - dh)
+
+        assert_almost_mostly_equal(cam_datum2, cam_datum3, valid_region=valid_region)
+
+        # Test round trip
+        Ainv = np.linalg.inv(tr_comp.A)
+        target_shape = (h, w)
+        tr_inv = AffineCameraTransform(A=Ainv, shape=target_shape)
+
+        cam_datum4 = tr_inv(cam_datum3)
+
+        dw = w // 4
+        dh = h // 4
+        valid_region = (dw, dh, w - dw, h - dh)
+
+        assert_almost_mostly_equal(cam_datum, cam_datum4, valid_region=valid_region)
+
+        # Finally, if we compose with the inverse, nothing should really happen
+        tr_comp = CompositeAffineTransform(transforms=[tr2, tr1, tr_inv], )
+        cam_datum5 = tr_comp(cam_datum)
+        assert np.allclose(tr_comp.A, np.eye(3))
+        assert_almost_mostly_equal(
+            cam_datum,
+            cam_datum5,
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
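
Usage sketch (illustrative only; the dataset path is a placeholder and the call pattern mirrors the tests above): each transform is a callable that maps a DGP camera datum to a new camera datum with the image, intrinsics, extrinsics and annotations updated together, so transforms can be chained or composed.

    from dgp.annotations.camera_transforms import (
        CompositeAffineTransform,
        RandomAffineTransform,
        ScaleAffineTransform,
    )
    from dgp.datasets.synchronized_dataset import SynchronizedSceneDataset

    dataset = SynchronizedSceneDataset(
        'path/to/scene_dataset_v1.0.json',  # placeholder path
        split='train',
        datum_names=['camera_01'],
        backward_context=0,
        requested_annotations=('bounding_box_2d', 'bounding_box_3d'),
    )
    cam_datum = dataset[0][0][0]

    # Transforms in a CompositeAffineTransform run right to left: downscale by 2x first,
    # then apply a rotation of up to +/-10 degrees half the time.
    aug = CompositeAffineTransform(transforms=[
        RandomAffineTransform({'theta': {'p': 0.5, 'center': 0.0, 'low': -10, 'high': 10}}),
        ScaleAffineTransform(0.5),
    ])
    new_datum = aug(cam_datum)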