From a609e93ed4f4e1ed5541151099dc67af94b0d98d Mon Sep 17 00:00:00 2001 From: Yifan Lareina WU Date: Thu, 10 Aug 2023 13:27:04 +0800 Subject: [PATCH] [Docs] Documentation regarding 3d pose (#2599) --- docs/en/guide_to_framework.md | 33 +++++++++++------ docs/en/migration.md | 10 ++++++ docs/zh_cn/guide_to_framework.md | 36 ++++++++++++------- docs/zh_cn/migration.md | 4 +++ mmpose/codecs/image_pose_lifting.py | 22 ++++++------ mmpose/codecs/motionbert_label.py | 10 +++--- mmpose/codecs/video_pose_lifting.py | 22 ++++++------ mmpose/datasets/transforms/formatting.py | 2 +- .../motion_regression_head.py | 8 ++--- .../temporal_regression_head.py | 8 ++--- tests/test_codecs/test_image_pose_lifting.py | 4 +-- tests/test_codecs/test_motionbert_label.py | 2 +- tests/test_codecs/test_video_pose_lifting.py | 10 +++--- 13 files changed, 104 insertions(+), 67 deletions(-) diff --git a/docs/en/guide_to_framework.md b/docs/en/guide_to_framework.md index 5f743d5bf7..953aa589b3 100644 --- a/docs/en/guide_to_framework.md +++ b/docs/en/guide_to_framework.md @@ -198,6 +198,14 @@ test_dataloader = val_dataloader To use custom dataset in MMPose, we recommend converting the annotations into a supported format (e.g. COCO or MPII) and directly using our implementation of the corresponding dataset. If this is not applicable, you may need to implement your own dataset class. +More details about using custom datasets can be found in [Customize Datasets](./advanced_guides/customize_datasets.md). + +```{note} +If you wish to inherit from the `BaseDataset` provided by [MMEngine](https://github.com/open-mmlab/mmengine), please refer to this [document](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/basedataset.html) for details. +``` + +#### 2D Dataset + Most 2D keypoint datasets in MMPose **organize the annotations in a COCO-like style**. 
Thus we provide a base class [BaseCocoStyleDataset](https://github.com/open-mmlab/mmpose/blob/main/mmpose/datasets/datasets/base/base_coco_style_dataset.py) for these datasets. We recommend that users subclass [BaseCocoStyleDataset](https://github.com/open-mmlab/mmpose/blob/main/mmpose/datasets/datasets/base/base_coco_style_dataset.py) and override the methods as needed (usually `__init__()` and `_load_annotations()`) to extend to a new custom 2D keypoint dataset. ```{note} @@ -278,11 +286,9 @@ class CrowdPoseDataset(BaseCocoStyleDataset): For COCO-style datasets, we only need to inherit from [BaseCocoStyleDataset](https://github.com/open-mmlab/mmpose/blob/main/mmpose/datasets/datasets/base/base_coco_style_dataset.py) and specify `METAINFO`, then the dataset class is ready to use. -More details about using custom datasets can be found in [Customize Datasets](./advanced_guides/customize_datasets.md). +#### 3D Dataset -```{note} -If you wish to inherit from the `BaseDataset` provided by [MMEngine](https://github.com/open-mmlab/mmengine). Please refer to this [documents](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/basedataset.html) for details. -``` +We provide a base class [BaseMocapDataset](https://github.com/open-mmlab/mmpose/blob/main/mmpose/datasets/datasets/base/base_mocap_dataset.py) for 3D datasets. We recommend that users subclass [BaseMocapDataset](https://github.com/open-mmlab/mmpose/blob/main/mmpose/datasets/datasets/base/base_mocap_dataset.py) and override the methods as needed (usually `__init__()` and `_load_annotations()`) to extend to a new custom 3D keypoint dataset. ### Pipeline @@ -310,7 +316,7 @@ test_pipeline = [ In a keypoint detection task, data will be transformed among three scale spaces: -- **Original Image Space**: the space where the images are stored. The sizes of different images are not necessarily the same +- **Original Image Space**: the space where the original images and annotations are stored. 
The sizes of different images are not necessarily the same - **Input Image Space**: the image space used for model input. All **images** and **annotations** will be transformed into this space, such as `256x256`, `256x192`, etc. @@ -326,9 +332,9 @@ In MMPose, the modules used for data transformation are under [$MMPOSE/mmpose/da #### i. Augmentation -Commonly used transforms are defined in [$MMPOSE/mmpose/datasets/transforms/common_transforms.py](https://github.com/open-mmlab/mmpose/blob/main/mmpose/datasets/transforms/common_transforms.py), such as [RandomFlip](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/datasets/transforms/common_transforms.py#L94), [RandomHalfBody](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/datasets/transforms/common_transforms.py#L263), etc. +Commonly used transforms are defined in [$MMPOSE/mmpose/datasets/transforms/common_transforms.py](https://github.com/open-mmlab/mmpose/blob/main/mmpose/datasets/transforms/common_transforms.py), such as [RandomFlip](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/datasets/transforms/common_transforms.py#L94), [RandomHalfBody](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/datasets/transforms/common_transforms.py#L263), etc. For top-down methods, `Shift`, `Rotate`and `Resize` are implemented by [RandomBBoxTransform](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/datasets/transforms/common_transforms.py#L433). For bottom-up methods, [BottomupRandomAffine](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/datasets/transforms/bottomup_transforms.py#L134) is used. -For top-down methods, `Shift`, `Rotate`and `Resize` are implemented by [RandomBBoxTransform](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/datasets/transforms/common_transforms.py#L433). For bottom-up methods, [BottomupRandomAffine](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/datasets/transforms/bottomup_transforms.py#L134) is used. 
+Transforms for 3D pose data are defined in [$MMPOSE/mmpose/datasets/transforms/pose3d_transforms.py](https://github.com/open-mmlab/mmpose/blob/main/mmpose/datasets/transforms/pose3d_transforms.py). ```{note} Most data transforms depend on `bbox_center` and `bbox_scale`, which can be obtained by [GetBBoxCenterScale](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/datasets/transforms/common_transforms.py#L31). @@ -336,7 +342,9 @@ Most data transforms depend on `bbox_center` and `bbox_scale`, which can be obta #### ii. Transformation -Affine transformation is used to convert images and annotations from the original image space to the input space. This is done by [TopdownAffine](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/datasets/transforms/topdown_transforms.py#L14) for top-down methods and [BottomupRandomAffine](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/datasets/transforms/bottomup_transforms.py#L134) for bottom-up methods. +For 2D image inputs, affine transformation is used to convert images and annotations from the original image space to the input space. This is done by [TopdownAffine](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/datasets/transforms/topdown_transforms.py#L14) for top-down methods and [BottomupRandomAffine](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/datasets/transforms/bottomup_transforms.py#L134) for bottom-up methods. + +For pose lifting tasks, transformation is merged into [Encoding](./guide_to_framework.md#iii-encoding). #### iii. Encoding @@ -351,6 +359,7 @@ Currently we support the following types of Targets. - `keypoint_xy_label`: axis-wise keypoint representation - `heatmap+keypoint_label`: Gaussian heatmaps and keypoint representation - `multiscale_heatmap`: multi-scale Gaussian heatmaps +- `lifting_target_label`: 3D lifting target keypoint representation and the generated targets will be packed as follows. 
@@ -359,16 +368,18 @@ and the generated targets will be packed as follows. - `keypoint_x_labels`: keypoint x-axis representation - `keypoint_y_labels`: keypoint y-axis representation - `keypoint_weights`: keypoint visibility and weights +- `lifting_target_label`: 3D lifting target representation +- `lifting_target_weight`: 3D lifting target visibility and weights -Note that we unify the data format of top-down and bottom-up methods, which means that a new dimension is added to represent different instances from the same image, in shape: +Note that we unify the data format of top-down, pose-lifting and bottom-up methods, which means that a new dimension is added to represent different instances from the same image, in shape: ```Python [batch_size, num_instances, num_keypoints, dim_coordinates] ``` -- top-down: `[B, 1, K, D]` +- top-down and pose-lifting: `[B, 1, K, D]` -- Bottom-up: `[B, N, K, D]` +- bottom-up: `[B, N, K, D]` The provided codecs are stored under [$MMPOSE/mmpose/codecs](https://github.com/open-mmlab/mmpose/tree/main/mmpose/codecs). diff --git a/docs/en/migration.md b/docs/en/migration.md index 70ed0b5a52..a3e0099bc7 100644 --- a/docs/en/migration.md +++ b/docs/en/migration.md @@ -111,6 +111,16 @@ class GenerateTarget(BaseTransform): The data normalization operations `NormalizeTensor` and `ToTensor` will be replaced by **DataPreprocessor** module, which will no longer be used as a preprocessing operation, but will be merged as a part of the model forward propagation. +The 3D normalization methods like + +- `GetRootCenteredPose` +- `ImageCoordinateNormalization` +- `NormalizeJointCoordinate` + +will be merged into codecs, for example [`ImagePoseLifting`](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/codecs/image_pose_lifting.py#L11) and [`VideoPoseLifting`](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/codecs/video_pose_lifting.py#L13). 
+ +The data conversion and reshaping operation `PoseSequenceToTensor` will be implemented in corresponding codecs and [`PackPoseInputs`](https://github.com/open-mmlab/mmpose/blob/main/mmpose/datasets/transforms/formatting.py). + ## Compatibility of Models We have performed compatibility with the model weights provided by model zoo to ensure that the same model weights can get a comparable accuracy in both version. But note that due to the large number of differences in processing details, the inference outputs can be slightly different(less than 0.05% difference in accuracy). diff --git a/docs/zh_cn/guide_to_framework.md b/docs/zh_cn/guide_to_framework.md index 3b85a9fc6b..4b9babc60d 100644 --- a/docs/zh_cn/guide_to_framework.md +++ b/docs/zh_cn/guide_to_framework.md @@ -201,11 +201,18 @@ test_dataloader = val_dataloader 在 MMPose 中使用自定义数据集时,我们推荐将数据转化为已支持的格式(如 COCO 或 MPII),并直接使用我们提供的对应数据集实现。如果这种方式不可行,则用户需要实现自己的数据集类。 +更多自定义数据集的使用方式,请前往 [【进阶教程 - 自定义数据集】](./advanced_guides/customize_datasets.md)。 + +````{note} +如果你需要直接继承 [MMEngine](https://github.com/open-mmlab/mmengine) 中提供的 `BaseDataset` 基类。具体方法请参考相关[文档](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/basedataset.html) + + +#### 2D 数据集 MMPose 中的大部分 2D 关键点数据集**以 COCO 形式组织**,为此我们提供了基类 [BaseCocoStyleDataset](https://github.com/open-mmlab/mmpose/blob/main/mmpose/datasets/datasets/base/base_coco_style_dataset.py)。我们推荐用户继承该基类,并按需重写它的方法(通常是 `__init__()` 和 `_load_annotations()` 方法),以扩展到新的 2D 关键点数据集。 ```{note} 关于COCO数据格式的详细说明请参考 [COCO](./dataset_zoo/2d_body_keypoint.md) 。 -``` +```` 在 MMPose 中 bbox 的数据格式采用 `xyxy`,而不是 `xywh`,这与 [MMDetection](https://github.com/open-mmlab/mmdetection) 等其他 OpenMMLab 成员保持一致。为了实现不同 bbox 格式之间的转换,我们提供了丰富的函数:`bbox_xyxy2xywh`、`bbox_xywh2xyxy`、`bbox_xyxy2cs`等。这些函数定义在 [$MMPOSE/mmpose/structures/bbox/transforms.py](https://github.com/open-mmlab/mmpose/blob/main/mmpose/structures/bbox/transforms.py)。 @@ -281,11 +288,11 @@ class CrowdPoseDataset(BaseCocoStyleDataset): 对于使用 COCO 格式标注的数据集,只需要继承 
[BaseCocoStyleDataset](https://github.com/open-mmlab/mmpose/blob/main/mmpose/datasets/datasets/base/base_coco_style_dataset.py) 并指定 `METAINFO`,就可以十分轻松地集成到 MMPose 中参与训练。 -更多自定义数据集的使用方式,请前往 [【进阶教程 - 自定义数据集】](./advanced_guides/customize_datasets.md)。 +```` -```{note} -如果你需要直接继承 [MMEngine](https://github.com/open-mmlab/mmengine) 中提供的 `BaseDataset` 基类。具体方法请参考相关[文档](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/basedataset.html) -``` + + +#### 3D 数据集 +我们提供了基类 [BaseMocapDataset](https://github.com/open-mmlab/mmpose/blob/main/mmpose/datasets/datasets/base/base_mocap_dataset.py)。我们推荐用户继承该基类,并按需重写它的方法(通常是 `__init__()` 和 `_load_annotations()` 方法),以扩展到新的 3D 关键点数据集。 ### 数据流水线 @@ -309,7 +316,7 @@ test_pipeline = [ dict(type='TopdownAffine', input_size=codec['input_size']), dict(type='PackPoseInputs') ] -``` +```` 在关键点检测任务中,数据一般会在三个尺度空间中变换: @@ -329,9 +336,9 @@ test_pipeline = [ #### i. 数据增强 -数据增强中常用的变换存放在 [$MMPOSE/mmpose/datasets/transforms/common_transforms.py](https://github.com/open-mmlab/mmpose/blob/main/mmpose/datasets/transforms/common_transforms.py) 中,如 [RandomFlip](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/datasets/transforms/common_transforms.py#L94)、[RandomHalfBody](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/datasets/transforms/common_transforms.py#L263) 等。 +数据增强中常用的变换存放在 [$MMPOSE/mmpose/datasets/transforms/common_transforms.py](https://github.com/open-mmlab/mmpose/blob/main/mmpose/datasets/transforms/common_transforms.py) 中,如 [RandomFlip](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/datasets/transforms/common_transforms.py#L94)、[RandomHalfBody](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/datasets/transforms/common_transforms.py#L263) 等。对于 top-down 方法,`Shift`、`Rotate`、`Resize` 操作由 [RandomBBoxTransform](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/datasets/transforms/common_transforms.py#L433) 来实现;对于 bottom-up 方法,这些则是由 
[BottomupRandomAffine](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/datasets/transforms/bottomup_transforms.py#L134) 实现。 -对于 top-down 方法,`Shift`、`Rotate`、`Resize` 操作由 [RandomBBoxTransform](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/datasets/transforms/common_transforms.py#L433) 来实现;对于 bottom-up 方法,这些则是由 [BottomupRandomAffine](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/datasets/transforms/bottomup_transforms.py#L134) 实现。 +3D 姿态数据的变换存放在 [$MMPOSE/mmpose/datasets/transforms/pose3d_transforms.py](https://github.com/open-mmlab/mmpose/blob/main/mmpose/datasets/transforms/pose3d_transforms.py) 中。 ```{note} 值得注意的是,大部分数据变换都依赖于 `bbox_center` 和 `bbox_scale`,它们可以通过 [GetBBoxCenterScale](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/datasets/transforms/common_transforms.py#L31) 来得到。 @@ -339,7 +346,9 @@ test_pipeline = [ #### ii. 数据变换 -我们使用仿射变换,将图像和坐标标注从原始图片空间变换到输入图片空间。这一操作在 top-down 方法中由 [TopdownAffine](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/datasets/transforms/topdown_transforms.py#L14) 完成,在 bottom-up 方法中则由 [BottomupRandomAffine](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/datasets/transforms/bottomup_transforms.py#L134) 完成。 +对于二维图片输入,我们使用仿射变换,将图像和坐标标注从原始图片空间变换到输入图片空间。这一操作在 top-down 方法中由 [TopdownAffine](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/datasets/transforms/topdown_transforms.py#L14) 完成,在 bottom-up 方法中则由 [BottomupRandomAffine](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/datasets/transforms/bottomup_transforms.py#L134) 完成。 + +对于 3D 姿态提升任务,变换被合并进[数据编码](./guide_to_framework.md#iii-数据编码)。 #### iii. 
数据编码 @@ -354,6 +363,7 @@ test_pipeline = [ - `keypoint_xy_label`: 单个坐标轴关键点标签 - `heatmap+keypoint_label`: 同时生成高斯热图和关键点标签 - `multiscale_heatmap`: 多尺度高斯热图 +- `lifting_target_label`: 3D 提升目标的关键点标签 生成的监督目标会按以下关键字进行封装: @@ -362,6 +372,8 @@ test_pipeline = [ - `keypoint_x_labels`:x 轴关键点标签 - `keypoint_y_labels`:y 轴关键点标签 - `keypoint_weights`:关键点权重 +- `lifting_target_label`: 3D 提升目标的关键点标签 +- `lifting_target_weight`: 3D 提升目标的关键点权重 ```Python @TRANSFORMS.register_module() @@ -377,15 +389,15 @@ class GenerateTarget(BaseTransform): """ ``` -值得注意的是,我们对 top-down 和 bottom-up 的数据格式进行了统一,这意味着标注信息中会新增一个维度来代表同一张图里的不同目标(如人),格式为: +值得注意的是,我们对 top-down,pose-lifting 和 bottom-up 的数据格式进行了统一,这意味着标注信息中会新增一个维度来代表同一张图里的不同目标(如人),格式为: ```Python [batch_size, num_instances, num_keypoints, dim_coordinates] ``` -- top-down:`[B, 1, K, D]` +- top-down 和 pose-lifting:`[B, 1, K, D]` -- Bottom-up: `[B, N, K, D]` +- bottom-up: `[B, N, K, D]` 当前已经支持的编解码器定义在 [$MMPOSE/mmpose/codecs](https://github.com/open-mmlab/mmpose/tree/main/mmpose/codecs) 目录下,如果你需要自定新的编解码器,可以前往[编解码器](./user_guides/codecs.md)了解更多详情。 diff --git a/docs/zh_cn/migration.md b/docs/zh_cn/migration.md index 9a591dfcc9..b30ed4d680 100644 --- a/docs/zh_cn/migration.md +++ b/docs/zh_cn/migration.md @@ -102,6 +102,10 @@ class GenerateTarget(BaseTransform): 旧版的数据归一化操作 `NormalizeTensor` 和 `ToTensor` 方法将由 **DataPreprocessor** 模块替代,不再作为流水线的一部分,而是作为模块加入到模型前向传播中。 +旧版用于 3D 人类姿态数据变换的方法 `GetRootCenteredPose`, `ImageCoordinateNormalization` 和 `NormalizeJointCoordinate` 等,将被合并入编码器,比如 [`ImagePoseLifting`](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/codecs/image_pose_lifting.py#L11) 和 [`VideoPoseLifting`](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/codecs/video_pose_lifting.py#L13) 等。 + +数据转换和重构操作 `PoseSequenceToTensor` 将在相应的编解码器和 [`PackPoseInputs`](https://github.com/open-mmlab/mmpose/blob/main/mmpose/datasets/transforms/formatting.py) 中实现。 + ## 模型兼容 我们对 model zoo 提供的模型权重进行了兼容性处理,确保相同的模型权重测试精度能够与 0.x 
版本保持同等水平,但由于在这两个版本中存在大量处理细节的差异,推理结果可能会产生轻微的不同(精度误差小于 0.05%)。 diff --git a/mmpose/codecs/image_pose_lifting.py b/mmpose/codecs/image_pose_lifting.py index a5efee99c1..1a2ed8a498 100644 --- a/mmpose/codecs/image_pose_lifting.py +++ b/mmpose/codecs/image_pose_lifting.py @@ -112,7 +112,7 @@ def encode(self, keypoints' weights in shape (N, K, ) or (N-1, K, ). - lifting_target_label: The processed target coordinate in shape (K, C) or (K-1, C). - - lifting_target_weights (np.ndarray): The target weights in + - lifting_target_weight (np.ndarray): The target weights in shape (K, ) or (K-1, ). - trajectory_weights (np.ndarray): The trajectory weights in shape (K, ). @@ -136,17 +136,17 @@ def encode(self, if lifting_target is None: lifting_target = [keypoints[0]] - # set initial value for `lifting_target_weights` + # set initial value for `lifting_target_weight` # and `trajectory_weights` if lifting_target_visible is None: lifting_target_visible = np.ones( lifting_target.shape[:-1], dtype=np.float32) - lifting_target_weights = lifting_target_visible + lifting_target_weight = lifting_target_visible trajectory_weights = (1 / lifting_target[:, 2]) else: valid = lifting_target_visible > 0.5 - lifting_target_weights = np.where(valid, 1., 0.).astype(np.float32) - trajectory_weights = lifting_target_weights + lifting_target_weight = np.where(valid, 1., 0.).astype(np.float32) + trajectory_weights = lifting_target_weight encoded = dict() @@ -164,14 +164,14 @@ def encode(self, lifting_target_label, self.root_index, axis=-2) lifting_target_visible = np.delete( lifting_target_visible, self.root_index, axis=-2) - assert lifting_target_weights.ndim in { + assert lifting_target_weight.ndim in { 2, 3 - }, (f'lifting_target_weights.ndim {lifting_target_weights.ndim} ' + }, (f'lifting_target_weight.ndim {lifting_target_weight.ndim} ' 'is not in {2, 3}') - axis_to_remove = -2 if lifting_target_weights.ndim == 3 else -1 - lifting_target_weights = np.delete( - lifting_target_weights, 
self.root_index, axis=axis_to_remove) + axis_to_remove = -2 if lifting_target_weight.ndim == 3 else -1 + lifting_target_weight = np.delete( + lifting_target_weight, self.root_index, axis=axis_to_remove) # Add a flag to avoid latter transforms that rely on the root # joint or the original joint index encoded['target_root_removed'] = True @@ -222,7 +222,7 @@ def encode(self, encoded['keypoint_labels'] = keypoint_labels encoded['keypoint_labels_visible'] = keypoints_visible encoded['lifting_target_label'] = lifting_target_label - encoded['lifting_target_weights'] = lifting_target_weights + encoded['lifting_target_weight'] = lifting_target_weight encoded['trajectory_weights'] = trajectory_weights encoded['target_root'] = root diff --git a/mmpose/codecs/motionbert_label.py b/mmpose/codecs/motionbert_label.py index ddbda362ef..63058c2cf1 100644 --- a/mmpose/codecs/motionbert_label.py +++ b/mmpose/codecs/motionbert_label.py @@ -93,7 +93,7 @@ def encode(self, keypoints' weights in shape (N, K, ) or (N, K-1, ). - lifting_target_label: The processed target coordinate in shape (K, C) or (K-1, C). - - lifting_target_weights (np.ndarray): The target weights in + - lifting_target_weight (np.ndarray): The target weights in shape (K, ) or (K-1, ). - factor (np.ndarray): The factor mapping camera and image coordinate in shape (T, 1). 
@@ -104,14 +104,14 @@ def encode(self, if lifting_target is None: lifting_target = [keypoints[..., 0, :, :]] - # set initial value for `lifting_target_weights` + # set initial value for `lifting_target_weight` if lifting_target_visible is None: lifting_target_visible = np.ones( lifting_target.shape[:-1], dtype=np.float32) - lifting_target_weights = lifting_target_visible + lifting_target_weight = lifting_target_visible else: valid = lifting_target_visible > 0.5 - lifting_target_weights = np.where(valid, 1., 0.).astype(np.float32) + lifting_target_weight = np.where(valid, 1., 0.).astype(np.float32) if camera_param is None: camera_param = dict() @@ -170,7 +170,7 @@ def encode(self, encoded['keypoint_labels'] = keypoint_labels encoded['keypoint_labels_visible'] = keypoints_visible encoded['lifting_target_label'] = lifting_target_label - encoded['lifting_target_weights'] = lifting_target_weights + encoded['lifting_target_weight'] = lifting_target_weight encoded['lifting_target'] = lifting_target_label encoded['lifting_target_visible'] = lifting_target_visible encoded['factor'] = factor diff --git a/mmpose/codecs/video_pose_lifting.py b/mmpose/codecs/video_pose_lifting.py index a692e85806..ba8e4db69e 100644 --- a/mmpose/codecs/video_pose_lifting.py +++ b/mmpose/codecs/video_pose_lifting.py @@ -89,7 +89,7 @@ def encode(self, keypoints' weights in shape (N, K, ) or (N-1, K, ). - lifting_target_label: The processed target coordinate in shape (K, C) or (K-1, C). - - lifting_target_weights (np.ndarray): The target weights in + - lifting_target_weight (np.ndarray): The target weights in shape (K, ) or (K-1, ). - trajectory_weights (np.ndarray): The trajectory weights in shape (K, ). 
@@ -113,17 +113,17 @@ def encode(self, if lifting_target is None: lifting_target = [keypoints[0]] - # set initial value for `lifting_target_weights` + # set initial value for `lifting_target_weight` # and `trajectory_weights` if lifting_target_visible is None: lifting_target_visible = np.ones( lifting_target.shape[:-1], dtype=np.float32) - lifting_target_weights = lifting_target_visible + lifting_target_weight = lifting_target_visible trajectory_weights = (1 / lifting_target[:, 2]) else: valid = lifting_target_visible > 0.5 - lifting_target_weights = np.where(valid, 1., 0.).astype(np.float32) - trajectory_weights = lifting_target_weights + lifting_target_weight = np.where(valid, 1., 0.).astype(np.float32) + trajectory_weights = lifting_target_weight if camera_param is None: camera_param = dict() @@ -147,14 +147,14 @@ def encode(self, lifting_target_label, self.root_index, axis=-2) lifting_target_visible = np.delete( lifting_target_visible, self.root_index, axis=-2) - assert lifting_target_weights.ndim in { + assert lifting_target_weight.ndim in { 2, 3 }, (f'Got invalid lifting target weights shape ' - f'{lifting_target_weights.shape}') + f'{lifting_target_weight.shape}') - axis_to_remove = -2 if lifting_target_weights.ndim == 3 else -1 - lifting_target_weights = np.delete( - lifting_target_weights, + axis_to_remove = -2 if lifting_target_weight.ndim == 3 else -1 + lifting_target_weight = np.delete( + lifting_target_weight, self.root_index, axis=axis_to_remove) # Add a flag to avoid latter transforms that rely on the root @@ -203,7 +203,7 @@ def encode(self, encoded['keypoint_labels'] = keypoint_labels encoded['keypoints_visible'] = keypoints_visible encoded['lifting_target_label'] = lifting_target_label - encoded['lifting_target_weights'] = lifting_target_weights + encoded['lifting_target_weight'] = lifting_target_weight encoded['trajectory_weights'] = trajectory_weights return encoded diff --git a/mmpose/datasets/transforms/formatting.py 
b/mmpose/datasets/transforms/formatting.py index c2431c70bf..0bd88b87e0 100644 --- a/mmpose/datasets/transforms/formatting.py +++ b/mmpose/datasets/transforms/formatting.py @@ -122,7 +122,7 @@ class PackPoseInputs(BaseTransform): label_mapping_table = { 'keypoint_labels': 'keypoint_labels', 'lifting_target_label': 'lifting_target_label', - 'lifting_target_weights': 'lifting_target_weights', + 'lifting_target_weight': 'lifting_target_weight', 'trajectory_weights': 'trajectory_weights', 'keypoint_x_labels': 'keypoint_x_labels', 'keypoint_y_labels': 'keypoint_y_labels', diff --git a/mmpose/models/heads/regression_heads/motion_regression_head.py b/mmpose/models/heads/regression_heads/motion_regression_head.py index 3870e3c59e..2ad9497345 100644 --- a/mmpose/models/heads/regression_heads/motion_regression_head.py +++ b/mmpose/models/heads/regression_heads/motion_regression_head.py @@ -164,15 +164,15 @@ def loss(self, d.gt_instance_labels.lifting_target_label for d in batch_data_samples ]) - lifting_target_weights = torch.stack([ - d.gt_instance_labels.lifting_target_weights + lifting_target_weight = torch.stack([ + d.gt_instance_labels.lifting_target_weight for d in batch_data_samples ]) # calculate losses losses = dict() loss = self.loss_module(pred_outputs, lifting_target_label, - lifting_target_weights.unsqueeze(-1)) + lifting_target_weight.unsqueeze(-1)) losses.update(loss_pose3d=loss) @@ -180,7 +180,7 @@ def loss(self, mpjpe_err = keypoint_mpjpe( pred=to_numpy(pred_outputs), gt=to_numpy(lifting_target_label), - mask=to_numpy(lifting_target_weights) > 0) + mask=to_numpy(lifting_target_weight) > 0) mpjpe_pose = torch.tensor( mpjpe_err, device=lifting_target_label.device) diff --git a/mmpose/models/heads/regression_heads/temporal_regression_head.py b/mmpose/models/heads/regression_heads/temporal_regression_head.py index 9ed2e9f4fa..61e585103f 100644 --- a/mmpose/models/heads/regression_heads/temporal_regression_head.py +++ 
b/mmpose/models/heads/regression_heads/temporal_regression_head.py @@ -120,15 +120,15 @@ def loss(self, d.gt_instance_labels.lifting_target_label for d in batch_data_samples ]) - lifting_target_weights = torch.cat([ - d.gt_instance_labels.lifting_target_weights + lifting_target_weight = torch.cat([ + d.gt_instance_labels.lifting_target_weight for d in batch_data_samples ]) # calculate losses losses = dict() loss = self.loss_module(pred_outputs, lifting_target_label, - lifting_target_weights.unsqueeze(-1)) + lifting_target_weight.unsqueeze(-1)) losses.update(loss_pose3d=loss) @@ -136,7 +136,7 @@ def loss(self, _, avg_acc, _ = keypoint_pck_accuracy( pred=to_numpy(pred_outputs), gt=to_numpy(lifting_target_label), - mask=to_numpy(lifting_target_weights) > 0, + mask=to_numpy(lifting_target_weight) > 0, thr=0.05, norm_factor=np.ones((pred_outputs.size(0), 3), dtype=np.float32)) diff --git a/tests/test_codecs/test_image_pose_lifting.py b/tests/test_codecs/test_image_pose_lifting.py index 78b19ec59b..7033a3954c 100644 --- a/tests/test_codecs/test_image_pose_lifting.py +++ b/tests/test_codecs/test_image_pose_lifting.py @@ -59,7 +59,7 @@ def test_encode(self): self.assertEqual(encoded['keypoint_labels'].shape, (1, 17, 2)) self.assertEqual(encoded['lifting_target_label'].shape, (1, 17, 3)) - self.assertEqual(encoded['lifting_target_weights'].shape, ( + self.assertEqual(encoded['lifting_target_weight'].shape, ( 1, 17, )) @@ -80,7 +80,7 @@ def test_encode(self): self.assertTrue('target_root_removed' in encoded and 'target_root_index' in encoded) - self.assertEqual(encoded['lifting_target_weights'].shape, ( + self.assertEqual(encoded['lifting_target_weight'].shape, ( 1, 16, )) diff --git a/tests/test_codecs/test_motionbert_label.py b/tests/test_codecs/test_motionbert_label.py index a42b3d0793..596df463f7 100644 --- a/tests/test_codecs/test_motionbert_label.py +++ b/tests/test_codecs/test_motionbert_label.py @@ -69,7 +69,7 @@ def test_encode(self): 
self.assertEqual(encoded['keypoint_labels'].shape, (1, 17, 2)) self.assertEqual(encoded['lifting_target_label'].shape, (1, 17, 3)) - self.assertEqual(encoded['lifting_target_weights'].shape, ( + self.assertEqual(encoded['lifting_target_weight'].shape, ( 1, 17, )) diff --git a/tests/test_codecs/test_video_pose_lifting.py b/tests/test_codecs/test_video_pose_lifting.py index 31a095e927..6ffd70cffe 100644 --- a/tests/test_codecs/test_video_pose_lifting.py +++ b/tests/test_codecs/test_video_pose_lifting.py @@ -67,7 +67,7 @@ def test_encode(self): self.assertEqual(encoded['keypoint_labels'].shape, (1, 17, 2)) self.assertEqual(encoded['lifting_target_label'].shape, (1, 17, 3)) - self.assertEqual(encoded['lifting_target_weights'].shape, ( + self.assertEqual(encoded['lifting_target_weight'].shape, ( 1, 17, )) @@ -87,7 +87,7 @@ def test_encode(self): self.assertEqual(encoded['keypoint_labels'].shape, (1, 17, 2)) self.assertEqual(encoded['lifting_target_label'].shape, (1, 17, 3)) - self.assertEqual(encoded['lifting_target_weights'].shape, ( + self.assertEqual(encoded['lifting_target_weight'].shape, ( 1, 17, )) @@ -103,7 +103,7 @@ def test_encode(self): self.assertEqual(encoded['keypoint_labels'].shape, (34, 1)) self.assertEqual(encoded['lifting_target_label'].shape, (1, 17, 3)) - self.assertEqual(encoded['lifting_target_weights'].shape, ( + self.assertEqual(encoded['lifting_target_weight'].shape, ( 1, 17, )) @@ -120,7 +120,7 @@ def test_encode(self): self.assertTrue('target_root_removed' in encoded and 'target_root_index' in encoded) - self.assertEqual(encoded['lifting_target_weights'].shape, ( + self.assertEqual(encoded['lifting_target_weight'].shape, ( 1, 16, )) @@ -160,7 +160,7 @@ def test_encode(self): self.assertEqual(encoded['keypoint_labels'].shape, (2, 17, 2)) self.assertEqual(encoded['lifting_target_label'].shape, (2, 17, 3)) - self.assertEqual(encoded['lifting_target_weights'].shape, ( + self.assertEqual(encoded['lifting_target_weight'].shape, ( 2, 17, ))