diff --git a/app/packages/looker/package.json b/app/packages/looker/package.json index ae997e4b95..a150cccaf1 100644 --- a/app/packages/looker/package.json +++ b/app/packages/looker/package.json @@ -26,6 +26,7 @@ "lodash": "^4.17.21", "lru-cache": "^6.0.0", "mime": "^2.5.2", + "monotone-convex-hull-2d": "^1.0.1", "uuid": "^8.3.2" }, "devDependencies": { diff --git a/app/packages/looker/src/overlays/detection.ts b/app/packages/looker/src/overlays/detection.ts index f95dfaefba..4930771692 100644 --- a/app/packages/looker/src/overlays/detection.ts +++ b/app/packages/looker/src/overlays/detection.ts @@ -21,6 +21,7 @@ export interface DetectionLabel extends RegularLabel { dimensions?: [number, number, number]; location?: [number, number, number]; rotation?: [number, number, number]; + convexHull?: Coordinates[]; } export default class DetectionOverlay< @@ -229,48 +230,32 @@ export default class DetectionOverlay< state: Readonly, color: string ) { - const [tlx, tly, w, h] = this.label.bounding_box; - const [boxCenterX, boxCenterY] = t(state, tlx + w / 2, tly + h / 2); - - const hasRotationAroundZAxis = - this.label.rotation && this.label.rotation[2] !== 0; - - if (hasRotationAroundZAxis) { - // translate to center of box before rotating - ctx.translate(boxCenterX, boxCenterY); - // modifies current transformation matrix so that all subsequent drawings are rotated - ctx.rotate(-this.label.rotation[2]); - // translate back to undo the translation into the center of the box - ctx.translate(-boxCenterX, -boxCenterY); - } + const convexHull = this.label.convexHull; const previousAlpha = ctx.globalAlpha; - ctx.beginPath(); // use double stoke width to make the box more visible ctx.lineWidth = state.strokeWidth * 2; ctx.fillStyle = color; ctx.strokeStyle = color; - ctx.moveTo(...t(state, tlx, tly)); - ctx.lineTo(...t(state, tlx + w, tly)); - ctx.lineTo(...t(state, tlx + w, tly + h)); - ctx.lineTo(...t(state, tlx, tly + h)); + + ctx.beginPath(); + + // draw a polyline that defines the convex hull of the projected corners and fill it + ctx.moveTo(...t(state, convexHull[0][0], convexHull[0][1])); + for (let i = 1; i < convexHull.length; i++) { + ctx.lineTo(...t(state, convexHull[i][0], convexHull[i][1])); + } + ctx.closePath(); ctx.stroke(); // fill with some transparency - ctx.globalAlpha = state.options.alpha * 0.5; - ctx.fillRect(...t(state, tlx, tly), w, h); + ctx.globalAlpha = state.options.alpha * 0.3; + ctx.fill(); // restore previous alpha ctx.globalAlpha = previousAlpha; - - if (hasRotationAroundZAxis) { - // undo rotation to reset current transformation matrix - ctx.translate(boxCenterX, boxCenterY); - ctx.rotate(this.label.rotation[2]); - ctx.translate(-boxCenterX, -boxCenterY); - } } private strokeRect( diff --git a/app/packages/looker/src/state.ts b/app/packages/looker/src/state.ts index b350bfcca6..ae6d57a67e 100644 --- a/app/packages/looker/src/state.ts +++ b/app/packages/looker/src/state.ts @@ -65,8 +65,9 @@ export type OrthogrpahicProjectionMetadata = { filepath: string; height: number; width: number; - min_bound: [number, number]; - max_bound: [number, number]; + min_bound: [number, number, number]; + max_bound: [number, number, number]; + normal: [number, number, number]; }; export type GenericLabel = { diff --git a/app/packages/looker/src/worker/label-3d-projection-utils.test.ts b/app/packages/looker/src/worker/label-3d-projection-utils.test.ts new file mode 100644 index 0000000000..7ade666097 --- /dev/null +++ b/app/packages/looker/src/worker/label-3d-projection-utils.test.ts @@ -0,0 
+1,22 @@ +import { describe, expect, it } from "vitest"; +import { Vec3, projectTo2D } from "./label-3d-projection-utils"; + +describe("projectTo2D", () => { + it("should project a point to the xz plane", () => { + const point: Vec3 = [1, 2, 3]; + const projectedPoint = projectTo2D(point, "xz"); + expect(projectedPoint).toEqual([1, 3]); + }); + + it("should project a point to the xy plane", () => { + const point: Vec3 = [1, 2, 3]; + const projectedPoint = projectTo2D(point, "xy"); + expect(projectedPoint).toEqual([1, 2]); + }); + + it("should project a point to the yz plane", () => { + const point: Vec3 = [1, 2, 3]; + const projectedPoint = projectTo2D(point, "yz"); + expect(projectedPoint).toEqual([2, 3]); + }); +}); diff --git a/app/packages/looker/src/worker/label-3d-projection-utils.ts b/app/packages/looker/src/worker/label-3d-projection-utils.ts new file mode 100644 index 0000000000..1e6e2b9cfa --- /dev/null +++ b/app/packages/looker/src/worker/label-3d-projection-utils.ts @@ -0,0 +1,83 @@ +import { Euler, Vector3 } from "three"; + +export type Vec3 = [number, number, number]; +export type Vec2 = [number, number]; + +export interface BoundingBox3D { + dimensions: Vec3; + location: Vec3; + rotation: Vec3; // rotation angles in radians +} + +export interface BoundingBox2D { + tlx: number; // top-left corner of the bounding box, x + tly: number; // top-left corner of the bounding box, y + width: number; // width of the bounding box + height: number; // height of the bounding box +} + +export const rotatePoint = (point: Vec3, rotation: Vec3): Vec3 => { + const threePoint = new Vector3(...point); + const threeRotation = new Euler(...rotation); + + return threePoint.applyEuler(threeRotation).toArray() as Vec3; +}; + +export const projectTo2D = (point: Vec3, plane: "xz" | "xy" | "yz"): Vec2 => { + switch (plane) { + case "xz": + return [point[0], point[2]]; + case "xy": + return [point[0], point[1]]; + case "yz": + return [point[1], point[2]]; + } +}; + +export const getProjectedCorners = ( + box: BoundingBox3D, + plane: "xz" | "xy" | "yz" +) => { + const { dimensions, location, rotation } = box; + const [dx, dy, dz] = dimensions; + const halfDimensions = [dx / 2, dy / 2, dz / 2] as Vec3; + + // Generate the 8 corners of the 3D bounding box + const corners: Vec3[] = [ + // left bottom back + [-halfDimensions[0], -halfDimensions[1], -halfDimensions[2]], + // left bottom front + [-halfDimensions[0], -halfDimensions[1], halfDimensions[2]], + // left top back + [-halfDimensions[0], halfDimensions[1], -halfDimensions[2]], + // left top front + [-halfDimensions[0], halfDimensions[1], halfDimensions[2]], + // right bottom back + [halfDimensions[0], -halfDimensions[1], -halfDimensions[2]], + // right bottom front + [halfDimensions[0], -halfDimensions[1], halfDimensions[2]], + // right top back + [halfDimensions[0], halfDimensions[1], -halfDimensions[2]], + // right top front + [halfDimensions[0], halfDimensions[1], halfDimensions[2]], + ]; + + // rotate first, then translate + const transformedCorners = corners.map((corner) => { + const rotated = rotatePoint(corner, rotation); + return [ + rotated[0] + location[0], + rotated[1] + location[1], + rotated[2] + location[2], + ] as Vec3; + }); + + // project the 3D points to 2D based on the specified plane + const projectedCorners: Vec2[] = transformedCorners.map((corner) => + projectTo2D(corner, plane) + ); + + return { projectedCorners }; +}; diff --git a/app/packages/looker/src/worker/label-3d-transformation.ts 
b/app/packages/looker/src/worker/label-3d-transformation.ts deleted file mode 100644 index d5fd7a95dc..0000000000 --- a/app/packages/looker/src/worker/label-3d-transformation.ts +++ /dev/null @@ -1,59 +0,0 @@ -export const getTransformedCoordinates = ( - location, - dimensions, - scalingFactors, - orthographicProjectionParams, - { round = true } -) => { - // location of centroid of box - const [x, y] = location; - - const [lx, ly] = dimensions; - - const [_, __, xminCartesian, xmaxCartesian, yminCartesian, ymaxCartesian] = - orthographicProjectionParams; - - const canvasXMin = - scalingFactors.xScale * (x - lx / 2 + (xmaxCartesian - xminCartesian) / 2); - const canvasYMin = - scalingFactors.yScale * (y - ly / 2 + (ymaxCartesian - yminCartesian) / 2); - - const canvasXMax = - scalingFactors.xScale * (x + lx / 2 + (xmaxCartesian - xminCartesian) / 2); - const canvasYMax = - scalingFactors.yScale * (y + ly / 2 + (ymaxCartesian - yminCartesian) / 2); - - if (round) { - return [ - Math.round(canvasXMin), - Math.round(canvasXMax), - Math.round(canvasYMin), - Math.round(canvasYMax), - ]; - } - - return [canvasXMin, canvasXMax, canvasYMin, canvasYMax]; -}; - -export const applyRotation = (x, y, z, rotX, rotY, rotZ) => { - const cosx = Math.cos(rotX); - const cosy = Math.cos(rotY); - const cosz = Math.cos(rotZ); - const sinx = Math.sin(rotX); - const siny = Math.sin(rotY); - const sinz = Math.sin(rotZ); - - // Apply rotation in x-axis - const y1 = y * cosx - z * sinx; - const z1 = y * sinx + z * cosx; - - // Apply rotation in y-axis - const x2 = x * cosy - z1 * siny; - const z2 = x * siny + z1 * cosy; - - // Apply rotation in z-axis - const x3 = x2 * cosz - y1 * sinz; - const y3 = x2 * sinz + y1 * cosz; - - return [x3, y3, z2]; -}; diff --git a/app/packages/looker/src/worker/threed-label-processor.ts b/app/packages/looker/src/worker/threed-label-processor.ts index dc11c0f074..eaf19eafd6 100644 --- a/app/packages/looker/src/worker/threed-label-processor.ts +++ b/app/packages/looker/src/worker/threed-label-processor.ts @@ -1,7 +1,12 @@ import { DETECTIONS, getCls, Schema } from "@fiftyone/utilities"; +import ch from "monotone-convex-hull-2d"; import { POINTCLOUD_OVERLAY_PADDING } from "../constants"; import { DetectionLabel } from "../overlays/detection"; import { OrthogrpahicProjectionMetadata, Sample } from "../state"; +import { + BoundingBox3D, + getProjectedCorners, +} from "./label-3d-projection-utils"; import { mapId } from "./shared"; type DetectionsLabel = { @@ -10,47 +15,8 @@ type DetectionsLabel = { type ThreeDLabel = DetectionsLabel | DetectionLabel; -type LabelId = string; - const COLLECTION_TYPES = new Set(["Detections"]); -const scalingFactorCache: Record< - LabelId, - { - scalingFactor?: { xScale: number; yScale: number }; - } -> = {}; - -/** - * Get scaling parameters from pointcloud bound range. - * - * Cache results of this function because it is called for every label in a sample. 
- */ -const getScalingFactorForLabel = ( - labelId: LabelId, - width: number, - height: number, - xmin: number, - xmax: number, - ymin: number, - ymax: number -) => { - if (scalingFactorCache[labelId]?.scalingFactor) { - return scalingFactorCache[labelId].scalingFactor; - } - - if (!scalingFactorCache[labelId]) { - scalingFactorCache[labelId] = {}; - } - - scalingFactorCache[labelId].scalingFactor = { - xScale: width / (xmax - xmin), - yScale: height / (ymax - ymin), - }; - - return scalingFactorCache[labelId].scalingFactor; -}; - // cache between sample id and inferred projection params const inferredParamsCache: Record< Sample["id"], @@ -103,6 +69,7 @@ const getInferredParamsForUndefinedProjection = ( inferredParamsCache[sample.id] = { width: minX === Infinity ? 512 : maxX - minX + POINTCLOUD_OVERLAY_PADDING, height: minY === Infinity ? 512 : maxY - minY + POINTCLOUD_OVERLAY_PADDING, + normal: [0, 0, 1], min_bound: [ minX === Infinity ? -POINTCLOUD_OVERLAY_PADDING @@ -110,6 +77,7 @@ const getInferredParamsForUndefinedProjection = ( minY === Infinity ? -POINTCLOUD_OVERLAY_PADDING : minY - POINTCLOUD_OVERLAY_PADDING, + 0, ], max_bound: [ maxX === Infinity @@ -118,6 +86,7 @@ const getInferredParamsForUndefinedProjection = ( maxY === Infinity ? POINTCLOUD_OVERLAY_PADDING : maxY + POINTCLOUD_OVERLAY_PADDING, + 0, ], } as OrthogrpahicProjectionMetadata; @@ -139,35 +108,67 @@ const PainterFactory3D = ( * Impute bounding box parameters. */ Detection: (label: DetectionLabel) => { - const { - width: canvasWidth, - height: canvasHeight, - min_bound, - max_bound, - } = orthographicProjectionParams; - const [xmin, ymin] = min_bound; - const [xmax, ymax] = max_bound; - - const [x, y, z] = label.location; // centroid of bounding box - const [lx, ly, lz] = label.dimensions; // length of bounding box in each dimension - - const { xScale, yScale } = getScalingFactorForLabel( - label._id, - canvasWidth, - canvasHeight, - xmin, - xmax, - ymin, - ymax - ); + const { min_bound, max_bound, normal } = orthographicProjectionParams; + const [xmin, ymin, zmin] = min_bound; + const [xmax, ymax, zmax] = max_bound; + + const [lx, ly, lz] = label.location; // centroid of bounding box + const [dx, dy, dz] = label.dimensions; // length of bounding box in each dimension + const [rx, ry, rz] = label.rotation ?? [0, 0, 0]; // rotation of bounding box + + const [nx, ny, nz] = normal ?? 
[0, 0, 1]; + + const box: BoundingBox3D = { + dimensions: [dx, dy, dz], + location: [lx, ly, lz], + rotation: [rx, ry, rz], + }; + + let projectionPlane: "xy" | "xz" | "yz" = "xy"; + + if (nx === 1 || nx === -1) { + // project on yz plane + projectionPlane = "yz"; + } else if (ny === 1 || ny === -1) { + // project on xz plane + projectionPlane = "xz"; + } else if (nz === 1 || nz === -1) { + // project on xy plane + projectionPlane = "xy"; + } + + const { projectedCorners } = getProjectedCorners(box, projectionPlane); + + const xRange = xmax - xmin; + const yRange = ymax - ymin; + const zRange = zmax - zmin; + + const newProjectedCorners = projectedCorners.map(([x, y]) => { + let px, py; + + // todo: need to account for negative / positive normals + switch (projectionPlane) { + case "xy": + px = (x - xmin) / xRange; + py = (ymax - y) / yRange; + break; + case "xz": + px = (x - xmin) / xRange; + py = (zmax - y) / zRange; + break; + case "yz": + px = (y - ymin) / yRange; + py = (zmax - x) / zRange; + break; + } + return [px, py]; + }); - const tlx = (xScale * (x - lx / 2 - xmin)) / canvasWidth; // top left x, normalized to [0, 1] - const tly = (yScale * (-y - ly / 2 + ymax)) / canvasHeight; // top left y, normalized to [0, 1] + const convexHullIndices = ch(newProjectedCorners); - const boxWidth = (lx * xScale) / canvasWidth; // width of projected bounding box, normalized to [0, 1] - const boxHeight = (ly * yScale) / canvasHeight; // height of projected bounding box, normalized to [0, 1] + const convexHull = convexHullIndices.map((i) => newProjectedCorners[i]); - label.bounding_box = [tlx, tly, boxWidth, boxHeight]; + label.convexHull = convexHull; }, }); diff --git a/app/yarn.lock b/app/yarn.lock index d36348f2a7..fb3e5fe8b6 100644 --- a/app/yarn.lock +++ b/app/yarn.lock @@ -2602,6 +2602,7 @@ __metadata: lodash: ^4.17.21 lru-cache: ^6.0.0 mime: ^2.5.2 + monotone-convex-hull-2d: ^1.0.1 prettier: ^2.7.1 typescript: ^4.7.4 typescript-plugin-css-modules: ^5.0.2 @@ -14024,6 +14025,15 @@ __metadata: languageName: node linkType: hard +"monotone-convex-hull-2d@npm:^1.0.1": + version: 1.0.1 + resolution: "monotone-convex-hull-2d@npm:1.0.1" + dependencies: + robust-orientation: ^1.1.3 + checksum: 2d788534b29ab568387e2da43057e3fa9912fbac5e73a9e1bd78fae15951258c66d2e4655cdf2df4db7a944f1db619828030ba4824ac5fe794edefd8e8377440 + languageName: node + linkType: hard + "mouse-change@npm:^1.4.0": version: 1.4.0 resolution: "mouse-change@npm:1.4.0" @@ -16442,6 +16452,42 @@ __metadata: languageName: node linkType: hard +"robust-orientation@npm:^1.1.3": + version: 1.2.1 + resolution: "robust-orientation@npm:1.2.1" + dependencies: + robust-scale: ^1.0.2 + robust-subtract: ^1.0.0 + robust-sum: ^1.0.0 + two-product: ^1.0.2 + checksum: 83b87300009716d96cf17af27b2c787bb7cabe00e82b6740ff4777a601babfcf132b3ec3d10cb1a91886423aa51863026d3befd58058af3b90be98abbda0056e + languageName: node + linkType: hard + +"robust-scale@npm:^1.0.2": + version: 1.0.2 + resolution: "robust-scale@npm:1.0.2" + dependencies: + two-product: ^1.0.2 + two-sum: ^1.0.0 + checksum: 4217f15c94bc803c0c78f6011507102cb603a4e9f71721d44e155c17c1fbe989382c8a150d20e23ca51164077395dab698498b9650d2377cc0a69902d73d0a1c + languageName: node + linkType: hard + +"robust-subtract@npm:^1.0.0": + version: 1.0.0 + resolution: "robust-subtract@npm:1.0.0" + checksum: e9dcc39a1a802d4a34d338844d9382ad7e49f58c5d01ce0d66cd18d6477069475af11a80fba0c0e158211c2b272c1c05950e78cbfc29ea7005f4ecc9e9f9d492 + languageName: node + linkType: hard + +"robust-sum@npm:^1.0.0": + 
version: 1.0.0 + resolution: "robust-sum@npm:1.0.0" + checksum: b9f32829ba3d6fd9cffeee440e1fb93a7d42f264540bd631abf13d0e8737f3a15a16a15764fa8a2fe86d3db6a1970361cf7ad2ed536c858b59e45f6f493a454b + languageName: node + linkType: hard + "rollup-plugin-external-globals@npm:^0.6.1": version: 0.6.1 resolution: "rollup-plugin-external-globals@npm:0.6.1" @@ -18120,6 +18166,20 @@ __metadata: languageName: node linkType: hard +"two-product@npm:^1.0.2": + version: 1.0.2 + resolution: "two-product@npm:1.0.2" + checksum: b289814957df58b91c910c944e7e247aa01a0a70e8fafdf58f01baf7fa1f96c06dc1cbb6cdafb39525e9a5ac0a9566875f1a76a02ef1f736f26e56fca2f0c847 + languageName: node + linkType: hard + +"two-sum@npm:^1.0.0": + version: 1.0.0 + resolution: "two-sum@npm:1.0.0" + checksum: 2c6a995b555233b989f473a5d039bd237d75f4824b9b54dc9d9ab28157f3e412b37156acbb48b322c817a26f3cc85e3da281c9aed4b06e892d2d27ae88db7d32 + languageName: node + linkType: hard + "type-check@npm:^0.4.0, type-check@npm:~0.4.0": version: 0.4.0 resolution: "type-check@npm:0.4.0" diff --git a/docs/source/images/dataset_zoo/quickstart-3d.png b/docs/source/images/dataset_zoo/quickstart-3d.png new file mode 100644 index 0000000000..a75807e654 Binary files /dev/null and b/docs/source/images/dataset_zoo/quickstart-3d.png differ diff --git a/docs/source/images/datasets/quickstart-3d.gif b/docs/source/images/datasets/quickstart-3d.gif new file mode 100644 index 0000000000..ff08b236a7 Binary files /dev/null and b/docs/source/images/datasets/quickstart-3d.gif differ diff --git a/docs/source/images/datasets/quickstart-groups.gif b/docs/source/images/datasets/quickstart-groups.gif new file mode 100644 index 0000000000..3d04fdab2a Binary files /dev/null and b/docs/source/images/datasets/quickstart-groups.gif differ diff --git a/docs/source/images/datasets/quickstart-video.gif b/docs/source/images/datasets/quickstart-video.gif new file mode 100644 index 0000000000..b76d257eae Binary files /dev/null and b/docs/source/images/datasets/quickstart-video.gif differ diff --git a/docs/source/images/datasets/quickstart.gif b/docs/source/images/datasets/quickstart.gif new file mode 100644 index 0000000000..6bdf4284b8 Binary files /dev/null and b/docs/source/images/datasets/quickstart.gif differ diff --git a/docs/source/user_guide/dataset_zoo/datasets.rst b/docs/source/user_guide/dataset_zoo/datasets.rst index 29cc2a10f9..a41a260c28 100644 --- a/docs/source/user_guide/dataset_zoo/datasets.rst +++ b/docs/source/user_guide/dataset_zoo/datasets.rst @@ -78,6 +78,8 @@ This page lists all of the datasets available in the Dataset Zoo. +--------------------------------------------------------------------+---------------------------------------------------------------------------+ | :ref:`Quickstart Groups ` | image, point-cloud, quickstart | +--------------------------------------------------------------------+---------------------------------------------------------------------------+ + | :ref:`Quickstart 3D ` | 3d, point-cloud, mesh, quickstart | + +--------------------------------------------------------------------+---------------------------------------------------------------------------+ | :ref:`Sama-COCO ` | image, detection, segmentation | +--------------------------------------------------------------------+---------------------------------------------------------------------------+ | :ref:`UCF101 ` | video, action-recognition | @@ -3450,6 +3452,57 @@ annotation data. :alt: quickstart-groups :align: center +.. 
_dataset-zoo-quickstart-3d: + +Quickstart 3D +------------- + +A small 3D dataset with meshes, point clouds, and oriented bounding boxes. + +The dataset consists of 200 3D mesh samples from the test split of the +`ModelNet40 `_ dataset, with point +clouds generated using a Poisson disk sampling method, and oriented +bounding boxes generated based on the convex hull. + +Objects have been rescaled and recentered from the original dataset. + +**Details** + +- Dataset name: ``quickstart-3d`` +- Dataset size: 215.7 MB +- Tags: ``3d, point-cloud, mesh, quickstart`` +- Supported splits: ``N/A`` +- ZooDataset class: + :class:`Quickstart3DDataset ` + +**Example usage** + +.. tabs:: + + .. group-tab:: Python + + .. code-block:: python + :linenos: + + import fiftyone as fo + import fiftyone.zoo as foz + + dataset = foz.load_zoo_dataset("quickstart-3d") + + session = fo.launch_app(dataset) + + .. group-tab:: CLI + + .. code-block:: shell + + fiftyone zoo datasets load quickstart-3d + + fiftyone app launch quickstart-3d + +.. image:: /images/dataset_zoo/quickstart-3d.png + :alt: quickstart-3d + :align: center + .. _dataset-zoo-sama-coco: Sama-COCO diff --git a/docs/source/user_guide/using_datasets.rst b/docs/source/user_guide/using_datasets.rst index 7710490ae1..8748e852fa 100644 --- a/docs/source/user_guide/using_datasets.rst +++ b/docs/source/user_guide/using_datasets.rst @@ -59,6 +59,67 @@ have a bad time: dataset4 = fo.load_dataset("my_fourth_dataset") # DoesNotExistError: Dataset 'my_fourth_dataset' not found +.. _dataset-media-type: + +Dataset media type +------------------ + +The media type of a dataset is determined by the +:ref:`media type ` of the |Sample| objects that it contains. + +The :meth:`media_type ` property of a +dataset is set based on the first sample added to it: + +.. code-block:: python + :linenos: + + import fiftyone as fo + + dataset = fo.Dataset() + + print(dataset.media_type) + # None + + sample = fo.Sample(filepath="/path/to/image.png") + dataset.add_sample(sample) + + print(dataset.media_type) + # "image" + +Note that datasets are homogeneous; they must contain samples of the same media +type (except for :ref:`grouped datasets `): + +.. code-block:: python + :linenos: + + sample = fo.Sample(filepath="/path/to/video.mp4") + dataset.add_sample(sample) + # MediaTypeError: Sample media type 'video' does not match dataset media type 'image' + +The following media types are available: + +.. table:: + :widths: 25, 75 + + +---------------+---------------------------------------------------+ + | Media type | Description | + +===============+===================================================+ + | `image` | Datasets that contain | + | | :ref:`images ` | + +---------------+---------------------------------------------------+ + | `video` | Datasets that contain | + | | :ref:`videos ` | + +---------------+---------------------------------------------------+ + | `3d` | Datasets that contain | + | | :ref:`3D scenes <3d-datasets>` | + +---------------+---------------------------------------------------+ + | `point-cloud` | Datasets that contain | + | | :ref:`point clouds ` | + +---------------+---------------------------------------------------+ + | `group` | Datasets that contain | + | | :ref:`grouped data slices ` | + +---------------+---------------------------------------------------+ + .. 
_dataset-persistence: Dataset persistence @@ -108,50 +169,6 @@ shell and run the command again: you'll see that the `my_second_dataset` and `2020.08.04.12.36.29` datasets have been deleted because they were not persistent. -.. _dataset-media-type: - -Dataset media type ------------------- - -The media type of a dataset is determined by the -:ref:`media type ` of the |Sample| objects that it contains. - -The :meth:`media_type ` property of a -dataset is set based on the first sample added to it: - -.. code-block:: python - :linenos: - - import fiftyone as fo - - dataset = fo.Dataset() - - print(dataset.media_type) - # None - - dataset.add_sample(fo.Sample(filepath="/path/to/image.png")) - - print(dataset.media_type) - # "image" - -Datasets are homogeneous; they must contain samples of the same media type -(except for :ref:`grouped datasets `): - -.. code-block:: python - :linenos: - - dataset.add_sample(fo.Sample(filepath="/path/to/video.mp4")) - # MediaTypeError: Sample media type 'video' does not match dataset media type 'image' - -The following media types are possible: - -- `image`: if the dataset contains images -- `video`: if the dataset contains :ref:`videos ` -- `3d`: if the dataset contains :ref:`3D scenes <3d-datasets>` -- `point-cloud`: if the dataset contains - :ref:`point clouds ` -- `group`: if the dataset contains :ref:`grouped data slices ` - .. _dataset-version: Dataset version @@ -4092,6 +4109,63 @@ future sessions and manipulated as usual: }>, }> +.. _image-datasets: + +Image datasets +______________ + +Any |Sample| whose `filepath` is a file with MIME type `image/*` is recognized +as an image sample, and datasets composed of image samples have media type +`image`: + +.. code-block:: python + :linenos: + + import fiftyone as fo + + sample = fo.Sample(filepath="/path/to/image.png") + + dataset = fo.Dataset() + dataset.add_sample(sample) + + print(dataset.media_type) # image + print(sample) + +.. code-block:: text + + + +Example image dataset +--------------------- + +To get started exploring image datasets, try loading the +:ref:`quickstart dataset ` from the zoo: + +.. code:: python + :linenos: + + import fiftyone as fo + import fiftyone.zoo as foz + + dataset = foz.load_zoo_dataset("quickstart") + + print(dataset.count("ground_truth.detections")) # 1232 + print(dataset.count("predictions.detections")) # 5620 + print(dataset.count_values("ground_truth.detections.label")) + # {'dog': 15, 'airplane': 24, 'dining table': 15, 'hot dog': 5, ...} + + session = fo.launch_app(dataset) + +.. image:: /images/datasets/quickstart.gif + :alt: quickstart + :align: center + .. _video-datasets: Video datasets @@ -4300,7 +4374,7 @@ Example video dataset --------------------- To get started exploring video datasets, try loading the -:ref:`quickstart-video ` dataset from the zoo: +:ref:`quickstart-video dataset ` from the zoo: .. code:: python :linenos: import fiftyone as fo import fiftyone.zoo as foz dataset = foz.load_zoo_dataset("quickstart-video") - print(dataset) - print(dataset.count("frames")) # 1279 print(dataset.count("frames.detections.detections")) # 11345 print(dataset.count_values("frames.detections.detections.label")) session = fo.launch_app(dataset) -.. 
code-block:: text - - Name: quickstart-video - Media type: video - Num samples: 10 - Persistent: False - Tags: [] - Sample fields: - id: fiftyone.core.fields.ObjectIdField - filepath: fiftyone.core.fields.StringField - tags: fiftyone.core.fields.ListField(fiftyone.core.fields.StringField) - metadata: fiftyone.core.fields.EmbeddedDocumentField(fiftyone.core.metadata.VideoMetadata) - Frame fields: - id: fiftyone.core.fields.ObjectIdField - frame_number: fiftyone.core.fields.FrameNumberField - detections: fiftyone.core.fields.EmbeddedDocumentField(fiftyone.core.labels.Detections) +.. image:: /images/datasets/quickstart-video.gif + :alt: quickstart-video + :align: center .. _3d-datasets: @@ -4394,6 +4453,23 @@ serializes the scene into an FO3D file. print(dataset.media_type) # 3d +To modify an existing scene, load it via +:meth:`Scene.from_fo3d() `, perform any +necessary updates, and then rewrite it to disk: + +.. code-block:: python + :linenos: + + import fiftyone as fo + + scene = fo.Scene.from_fo3d("/path/to/scene.fo3d") + + for node in scene.traverse(): + if isinstance(node, fo.SphereGeometry): + node.visible = False + + scene.write("/path/to/scene.fo3d") + .. _3d-meshes: 3D meshes @@ -4606,19 +4682,45 @@ to generate orthographic projection images of each scene: import fiftyone.zoo as foz # Load an example 3D dataset - dataset = ( - foz.load_zoo_dataset("quickstart-groups") - .select_group_slices("pcd") - .clone() + dataset = foz.load_zoo_dataset("quickstart-3d") + + # This dataset already has orthographic projections populated, but let's + # recompute them to demonstrate the idea + fou3d.compute_orthographic_projection_images( + dataset, + (-1, 512), # (width, height) of each image; -1 means aspect-preserving + bounds=((-50, -50, -50), (50, 50, 50)), + projection_normal=(0, -1, 0), + output_dir="/tmp/quickstart-3d-proj", + shading_mode="height", ) + session = fo.launch_app(dataset) + +Note that the method also supports :ref:`grouped datasets ` that +contain 3D slice(s): + +.. code-block:: python + :linenos: + + import fiftyone as fo + import fiftyone.utils.utils3d as fou3d + import fiftyone.zoo as foz + + # Load an example group dataset that contains a 3D slice + dataset = foz.load_zoo_dataset("quickstart-groups") + # Populate orthographic projections fou3d.compute_orthographic_projection_images(dataset, (-1, 512), "/tmp/proj") + dataset.group_slice = "pcd" session = fo.launch_app(dataset) .. note:: + Orthographic projection images currently only include point clouds, not + meshes or 3D shapes. + If a scene contains multiple :ref:`point clouds <3d-point-clouds>`, you can control which point cloud to project by initializing it with `flag_for_projection=True`. @@ -4632,15 +4734,34 @@ Refer to the :func:`compute_orthographic_projection_images() ` documentation for available parameters to customize the projections. -.. _example-3d-dataset: +.. _example-3d-datasets: -Example 3D dataset ------------------- +Example 3D datasets +------------------- To get started exploring 3D datasets, try loading the -:ref:`quickstart-groups ` dataset from the zoo -and :ref:`clone ` the point cloud slice into a -standalone dataset: +:ref:`quickstart-3d dataset ` from the zoo: + +.. code:: python + :linenos: + + import fiftyone as fo + import fiftyone.zoo as foz + + dataset = foz.load_zoo_dataset("quickstart-3d") + + print(dataset.count_values("ground_truth.label")) + # {'bottle': 5, 'stairs': 5, 'keyboard': 5, 'car': 5, ...} + + session = fo.launch_app(dataset) + +.. 
image:: /images/datasets/quickstart-3d.gif + :alt: quickstart-3d + :align: center + +Also check out the +:ref:`quickstart-groups dataset `, which +contains a point cloud slice: .. code:: python :linenos: @@ -4649,38 +4770,21 @@ standalone dataset: import fiftyone.utils.utils3d as fou3d import fiftyone.zoo as foz - dataset = ( - foz.load_zoo_dataset("quickstart-groups") - .select_group_slices("pcd") - .clone() - ) + dataset = foz.load_zoo_dataset("quickstart-groups") # Populate orthographic projections fou3d.compute_orthographic_projection_images(dataset, (-1, 512), "/tmp/proj") - print(dataset) - print(dataset.count("ground_truth.detections")) # 1100 print(dataset.count_values("ground_truth.detections.label")) # {'Pedestrian': 133, 'Car': 774, ...} + dataset.group_slice = "pcd" session = fo.launch_app(dataset) -.. code-block:: text - - Name: 2024.04.13.15.21.08 - Media type: 3d - Num samples: 200 - Persistent: False - Tags: [] - Sample fields: - id: fiftyone.core.fields.ObjectIdField - filepath: fiftyone.core.fields.StringField - tags: fiftyone.core.fields.ListField(fiftyone.core.fields.StringField) - metadata: fiftyone.core.fields.EmbeddedDocumentField(fiftyone.core.metadata.Metadata) - group: fiftyone.core.fields.EmbeddedDocumentField(fiftyone.core.groups.Group) - ground_truth: fiftyone.core.fields.EmbeddedDocumentField(fiftyone.core.labels.Detections) - orthographic_projection_metadata: fiftyone.core.fields.EmbeddedDocumentField(fiftyone.utils.utils3d.OrthographicProjectionMetadata) +.. image:: /images/datasets/quickstart-groups.gif + :alt: quickstart-groups + :align: center .. _point-cloud-datasets: diff --git a/fiftyone/core/threed/scene_3d.py b/fiftyone/core/threed/scene_3d.py index 82d94ded6f..1eec4993b1 100644 --- a/fiftyone/core/threed/scene_3d.py +++ b/fiftyone/core/threed/scene_3d.py @@ -113,6 +113,29 @@ def _from_dict(d: dict): class Scene(Object3D): """Represents a scene graph which contains a hierarchy of 3D objects. + Example usage:: + + import fiftyone as fo + + scene = fo.Scene() + + obj_mesh = fo.ObjMesh( + "obj_mesh_name", "/path/to/mesh.obj", mtl_path="/path/to/mesh.mtl" + ) + gltf_mesh = fo.GltfMesh("gltf_mesh_name", "/path/to/mesh.gltf") + pcd = fo.PointCloud("pcd_name", "/path/to/points.pcd") + + scene.add(obj_mesh) + scene.add(gltf_mesh) + scene.add(pcd) + + scene.write("/path/to/scene.fo3d") + + sample = fo.Sample("/path/to/scene.fo3d") + + dataset = fo.Dataset() + dataset.add_sample(sample) + Args: camera (None): the default camera of the scene. If ``None``, a default :class:`fiftyone.core.threed.PerspectiveCamera` is created with @@ -122,27 +145,6 @@ class Scene(Object3D): directional lights placed at different angles around the scene background (None): the background for the scene. 
May be a color, image, or a skybox - - Usage:: - - scene = Scene() - - obj_mesh = ObjMesh( - "obj_mesh_name", "/path/to/obj", mtl_path="/path/to/mtl" - ) - gltf_mesh = GltfMesh("gltf_mesh_name", "/path/to/gltf") - pcd = PointCloud("pcd_name", "/path/to/pcd") - - scene.add(obj_mesh) - scene.add(gltf_mesh) - scene.add(pcd) - - scene.write("/path/to/scene.fo3d") - - dataset = fo.Dataset() - dataset.add_sample(fo.Sample("/path/to/scene.fo3d")) - - assert dataset.media_type == "3d" """ def __init__( @@ -161,7 +163,6 @@ def __init__( self.background = background def __repr__(self): - """Return a string representation of the scene.""" nodes_summary = self.get_scene_summary() repr_str = "fo3d scene with " asset_detected = False @@ -249,9 +250,8 @@ def traverse(self, include_self=False): Args: include_self: whether to include the current node in the traversal - Yields: - :class:`Object3D` - + Returns: + a generator that yields :class:`Object3D` instances """ if include_self: yield self @@ -313,8 +313,12 @@ def get_scene_summary(self): } def get_asset_paths(self): - """Collect all asset paths in the scene. Asset paths aren't resolved to - absolute paths. + """Returns a list of all asset paths in the scene. + + Note that any relative asset paths are not resolved to absolute paths. + + Returns: + a list of asset paths """ asset_paths = list( itertools.chain.from_iterable( @@ -379,7 +383,14 @@ def _from_fo3d_dict(dict_data: dict): @staticmethod def from_fo3d(path: str): - """Load a scene from a ``.fo3d`` file.""" + """Loads a scene from an FO3D file. + + Args: + path: the path to an ``.fo3d`` file + + Returns: + a :class:`Scene` + """ if not path.endswith(".fo3d"): raise ValueError("Scene must be loaded from a .fo3d file") diff --git a/fiftyone/utils/utils3d.py b/fiftyone/utils/utils3d.py index ec5d696def..a9ed6b894b 100644 --- a/fiftyone/utils/utils3d.py +++ b/fiftyone/utils/utils3d.py @@ -426,6 +426,8 @@ class OrthographicProjectionMetadata(DynamicEmbeddedDocument, fol._HasMedia): plane max_bound (None): the ``[xmax, ymax]`` of the image in the projection plane + normal (None): the normal vector of the plane onto which the projection + was performed. If not specified, ``[0, 0, 1]`` is assumed width: the width of the image, in pixels height: the height of the image, in pixels """ @@ -435,6 +437,7 @@ class OrthographicProjectionMetadata(DynamicEmbeddedDocument, fol._HasMedia): filepath = fof.StringField() min_bound = fof.ListField(fof.FloatField()) max_bound = fof.ListField(fof.FloatField()) + normal = fof.ListField(fof.FloatField(), default=None) width = fof.IntField() height = fof.IntField() @@ -532,6 +535,8 @@ def compute_orthographic_projection_images( subsampling_rate=None, projection_normal=None, bounds=None, + padding=None, + overwrite=False, skip_failures=False, progress=None, ): @@ -597,6 +602,12 @@ to generate each map. Either element of the tuple or any/all of its values can be None, in which case a tight crop of the point cloud along the missing dimension(s) are used + padding (None): a relative padding(s) in ``[0, 1]`` to apply to the + field of view bounds prior to projection. Can either be a single + value to apply in all directions or a ``[padx, pady, padz]``. 
For + example, pass ``padding=0.25`` with no ``bounds`` to project onto + a tight crop of each point cloud with 25% padding around it + overwrite (False): whether to overwrite existing images skip_failures (False): whether to gracefully continue without raising an error if a projection fails progress (None): whether to render a progress bar (True/False), use the @@ -622,7 +633,7 @@ out_samples = [] filename_maker = fou.UniqueFilenameMaker( - output_dir=output_dir, rel_dir=rel_dir + output_dir=output_dir, rel_dir=rel_dir, ignore_existing=overwrite ) for sample in view.iter_samples(autosave=True, progress=progress): @@ -644,6 +655,7 @@ subsampling_rate=subsampling_rate, projection_normal=projection_normal, bounds=bounds, + padding=padding, ) except Exception as e: if not skip_failures: @@ -677,6 +689,7 @@ subsampling_rate=None, projection_normal=None, bounds=None, + padding=None, ): """Generates an orthographic projection image for the given PCD file onto the specified plane (default xy plane). @@ -711,6 +724,11 @@ the projected plane. Either element of the tuple or any/all of its values can be None, in which case a tight crop of the point cloud along the missing dimension(s) are used + padding (None): a relative padding(s) in ``[0, 1]`` to apply to the + field of view bounds prior to projection. Can either be a single + value to apply in all directions or a ``[padx, pady, padz]``. For + example, pass ``padding=0.25`` with no ``bounds`` to project onto + a tight crop of the point cloud with 25% padding around it Returns: a tuple of @@ -728,6 +746,7 @@ filepath, size=size, bounds=bounds, + padding=padding, projection_normal=projection_normal, subsampling_rate=subsampling_rate, ) @@ -825,6 +844,7 @@ def _parse_point_cloud( filepath, size=None, bounds=None, + padding=None, projection_normal=None, subsampling_rate=None, ): @@ -857,6 +877,9 @@ ].as_matrix() pc = pc.rotate(R, center=[0, 0, 0]) + if projection_normal is None: + projection_normal = [0, 0, 1] + if bounds is None: min_bound, max_bound = None, None else: @@ -870,19 +893,24 @@ _max_bound = pc.get_max_bound() max_bound = _fill_none(max_bound, _max_bound) + min_bound = np.asarray(min_bound, dtype=float) + max_bound = np.asarray(max_bound, dtype=float) + + if padding is not None: + pad = 0.5 * np.asarray(padding) * (max_bound - min_bound) + min_bound -= pad + max_bound += pad + # Ensure bbox will not have 0 volume by adding a small value if max_bound - # and min_bound are close to each other - delta = np.isclose( - np.asarray(max_bound) - np.asarray(min_bound), 0 - ) * np.repeat(0.000001, 3) - max_bound += delta + # and min_bound are close to each other + max_bound += 1e-6 * np.isclose(max_bound - min_bound, 0) bbox = o3d.geometry.AxisAlignedBoundingBox( min_bound=min_bound, max_bound=max_bound ) - # crop bounds and translate so that min bound is at the origin - pc = pc.crop(bbox).translate((-min_bound[0], -min_bound[1], -min_bound[2])) + # Crop bounds and translate so that min bound is at the origin + pc = pc.crop(bbox).translate(-min_bound) if subsampling_rate is not None and subsampling_rate > 0: pc = pc.uniform_down_sample(subsampling_rate) @@ -896,8 +924,9 @@ width, height = None, None metadata = OrthographicProjectionMetadata( - min_bound=min_bound, 
- max_bound=max_bound, + min_bound=list(min_bound), + max_bound=list(max_bound), + normal=list(projection_normal), width=width, height=height, ) diff --git a/fiftyone/zoo/datasets/base.py b/fiftyone/zoo/datasets/base.py index deeb3a3ccd..259105f4ee 100644 --- a/fiftyone/zoo/datasets/base.py +++ b/fiftyone/zoo/datasets/base.py @@ -3179,6 +3179,64 @@ def _patch_if_necessary(self, dataset_dir, _): etau.delete_dir(scratch_dir) +class Quickstart3DDataset(FiftyOneDataset): + """A small 3D dataset with meshes, point clouds, and oriented bounding + boxes. + + The dataset consists of 200 3D mesh samples from the test split of the + `ModelNet40 `_ dataset, with point + clouds generated using a Poisson disk sampling method, and oriented + bounding boxes generated based on the convex hull. + + Objects have been rescaled and recentered from the original dataset. + + Example usage:: + + import fiftyone as fo + import fiftyone.zoo as foz + + dataset = foz.load_zoo_dataset("quickstart-3d") + + session = fo.launch_app(dataset) + + Dataset size + 215.7 MB + """ + + _GDRIVE_ID = "1EnQ2-gGDktEd8pAWwdXNK-FeHUFTFl5K" + _ARCHIVE_NAME = "quickstart-3d.zip" + _DIR_IN_ARCHIVE = "quickstart-3d" + + @property + def name(self): + return "quickstart-3d" + + @property + def tags(self): + return ("3d", "point-cloud", "mesh", "quickstart") + + @property + def supported_splits(self): + return None + + def _download_and_prepare(self, dataset_dir, scratch_dir, _): + _download_and_extract_archive( + self._GDRIVE_ID, + self._ARCHIVE_NAME, + self._DIR_IN_ARCHIVE, + dataset_dir, + scratch_dir, + ) + + logger.info("Parsing dataset metadata") + dataset_type = fot.FiftyOneDataset() + importer = foud.FiftyOneDatasetImporter + num_samples = importer._get_num_samples(dataset_dir) + logger.info("Found %d samples", num_samples) + + return dataset_type, num_samples, None + + class UCF101Dataset(FiftyOneDataset): """UCF101 is an action recognition data set of realistic action videos, collected from YouTube, having 101 action categories. This data set is an @@ -3289,6 +3347,7 @@ def _download_and_prepare(self, dataset_dir, scratch_dir, split): "quickstart-geo": QuickstartGeoDataset, "quickstart-video": QuickstartVideoDataset, "quickstart-groups": QuickstartGroupsDataset, + "quickstart-3d": Quickstart3DDataset, "sama-coco": SamaCOCODataset, "ucf101": UCF101Dataset, } diff --git a/tests/unittests/utils3d_tests.py b/tests/unittests/utils3d_tests.py index fbe517ae65..e27b49ced0 100644 --- a/tests/unittests/utils3d_tests.py +++ b/tests/unittests/utils3d_tests.py @@ -285,6 +285,7 @@ def test_orthographic_projection_metadata_field(self): metadata.filepath = "test_path" metadata.min_bound = (1, 2, 3) metadata.max_bound = (4, 5, 6) + metadata.normal = (0, 0, 1) metadata.width = 100 metadata.height = 100 @@ -304,6 +305,7 @@ def test_orthographic_projection_metadata_field(self): # tuples after deserialized are converted into np arrays self.assertTrue(np.array_equal(field["min_bound"], (1, 2, 3))) self.assertTrue(np.array_equal(field["max_bound"], (4, 5, 6))) + self.assertTrue(np.array_equal(field["normal"], (0, 0, 1))) self.assertEqual(field["width"], 100) self.assertEqual(field["height"], 100)
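The core of this patch's new projection path: ``threed-label-processor.ts`` builds the 8 corners of each oriented 3D box via ``getProjectedCorners()``, rotates and translates them, drops the axis indicated by the projection normal, and stores the 2D convex hull, which ``detection.ts`` then draws as a closed, filled polyline. Below is a minimal Python sketch of the same math, using ``scipy.spatial.ConvexHull`` as a stand-in for the ``monotone-convex-hull-2d`` package and a fixed x-then-y-then-z rotation as a simplified stand-in for three.js's ``Euler``; the helper names are illustrative, not part of the patch:

.. code-block:: python

    import numpy as np
    from scipy.spatial import ConvexHull

    def rotation_matrix(rx, ry, rz):
        # rotate about x, then y, then z (one common Euler convention; the
        # worker itself defers to three.js's Euler for the exact order)
        cx, sx = np.cos(rx), np.sin(rx)
        cy, sy = np.cos(ry), np.sin(ry)
        cz, sz = np.cos(rz), np.sin(rz)
        Rx = np.array([[1, 0, 0], [0, cx, -sx], [0, sx, cx]])
        Ry = np.array([[cy, 0, sy], [0, 1, 0], [-sy, 0, cy]])
        Rz = np.array([[cz, -sz, 0], [sz, cz, 0], [0, 0, 1]])
        return Rz @ Ry @ Rx

    def project_box_convex_hull(location, dimensions, rotation, plane="xy"):
        # the 8 corners of the box, as signed half-dimension offsets
        half = np.asarray(dimensions) / 2.0
        signs = np.array(
            [[sx, sy, sz] for sx in (-1, 1) for sy in (-1, 1) for sz in (-1, 1)]
        )
        corners = signs * half

        # rotate first, then translate to the box centroid
        corners = corners @ rotation_matrix(*rotation).T + np.asarray(location)

        # drop the axis perpendicular to the projection plane
        axes = {"xy": [0, 1], "xz": [0, 2], "yz": [1, 2]}[plane]
        corners2d = corners[:, axes]

        # hull vertices, in order, ready to draw as a closed polyline
        return corners2d[ConvexHull(corners2d).vertices]

    # a box in general orientation projects to a hexagonal silhouette, which
    # a single rotated rectangle (the previous approach) can only approximate
    print(project_box_convex_hull([0, 0, 0], [2, 1, 1], [0.4, 0.3, 0.2]))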
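The ``padding`` option threaded through to ``_parse_point_cloud`` is relative to the extent of the field of view along each axis, not an absolute distance, and half of the pad is applied to each side. A quick numeric check of the ``padding=0.25`` behavior described in the docstrings, using hypothetical bounds:

.. code-block:: python

    import numpy as np

    min_bound = np.array([0.0, 0.0, 0.0])
    max_bound = np.array([10.0, 4.0, 2.0])

    # same math as _parse_point_cloud: half the relative padding times the
    # extent along each axis, applied to both sides
    padding = 0.25
    pad = 0.5 * np.asarray(padding) * (max_bound - min_bound)

    print(min_bound - pad)  # [-1.25 -0.5  -0.25]
    print(max_bound + pad)  # [11.25  4.5   2.25]

Each extent therefore grows by 25% overall (12.5% per side), matching the "25% padding around it" wording in the docstrings.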