diff --git a/configs/body_3d_keypoint/pose_lift/README.md b/configs/body_3d_keypoint/pose_lift/README.md index 7e5f9f7e2a..e3e6ff7176 100644 --- a/configs/body_3d_keypoint/pose_lift/README.md +++ b/configs/body_3d_keypoint/pose_lift/README.md @@ -16,23 +16,19 @@ For single-person 3D pose estimation from a monocular camera, existing works can #### Human3.6m Dataset -| Arch | Receptive Field | MPJPE | P-MPJPE | N-MPJPE | ckpt | log | - -| :------------------------------------------------------ | :-------------: | :---: | :-----: | :-----: | :------------------------------------------------------: | :-----------------------------------------------------: | - -| [VideoPose3D-supervised](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-27frm-supv_8xb128-80e_h36m.py) | 27 | 40.1 | 30.1 | / | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_supervised-fe8fbba9_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_supervised_20210527.log.json) | - -| [VideoPose3D-supervised](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-81frm-supv_8xb128-80e_h36m.py) | 81 | 39.1 | 29.3 | / | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_81frames_fullconv_supervised-1f2d1104_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_81frames_fullconv_supervised_20210527.log.json) | - -| [VideoPose3D-supervised](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-243frm-supv_8xb128-80e_h36m.py) | 243 | | | / | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised-880bea25_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised_20210527.log.json) | - -| 
[VideoPose3D-supervised-CPN](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-1frm-supv-cpn-ft_8xb128-80e_h36m.py) | 1 | 53.0 | 41.3 | / | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_1frame_fullconv_supervised_cpn_ft-5c3afaed_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_1frame_fullconv_supervised_cpn_ft_20210527.log.json) | - -| [VideoPose3D-supervised-CPN](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-243frm-supv-cpn-ft_8xb128-200e_h36m.py) | 243 | | | / | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised_cpn_ft-88f5abbb_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised_cpn_ft_20210527.log.json) | - -| [VideoPose3D-semi-supervised](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-27frm-semi-supv_8xb64-200e_h36m.py) | 27 | 57.2 | 42.4 | 54.2 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised-54aef83b_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised_20210527.log.json) | - -| [VideoPose3D-semi-supervised-CPN](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-27frm-semi-supv-cpn-ft_8xb64-200e_h36m.py) | 27 | 67.3 | 50.4 | 63.6 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised_cpn_ft-71be9cde_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised_cpn_ft_20210527.log.json) | +| Arch | MPJPE | P-MPJPE | N-MPJPE | ckpt | log | Details and Download | +| :-------------------------------------------- | :---: | :-----: | :-----: | :-------------------------------------------: | :------------------------------------------: | 
:---------------------------------------------: | +| [VideoPose3D-supervised-27frm](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-27frm-supv_8xb128-80e_h36m.py) | 40.1 | 30.1 | / | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_supervised-fe8fbba9_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_supervised_20210527.log.json) | [videopose3d_h36m.md](./h36m/videopose3d_h36m.md) | +| [VideoPose3D-supervised-81frm](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-81frm-supv_8xb128-80e_h36m.py) | 39.1 | 29.3 | / | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_81frames_fullconv_supervised-1f2d1104_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_81frames_fullconv_supervised_20210527.log.json) | [videopose3d_h36m.md](./h36m/videopose3d_h36m.md) | +| [VideoPose3D-supervised-243frm](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-243frm-supv_8xb128-80e_h36m.py) | 37.6 | 28.3 | / | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised-880bea25_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised_20210527.log.json) | [videopose3d_h36m.md](./h36m/videopose3d_h36m.md) | +| [VideoPose3D-supervised-CPN-1frm](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-1frm-supv-cpn-ft_8xb128-80e_h36m.py) | 53.0 | 41.3 | / | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_1frame_fullconv_supervised_cpn_ft-5c3afaed_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_1frame_fullconv_supervised_cpn_ft_20210527.log.json) | [videopose3d_h36m.md](./h36m/videopose3d_h36m.md) | +| 
[VideoPose3D-supervised-CPN-243frm](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-243frm-supv-cpn-ft_8xb128-200e_h36m.py) | 47.9 | 38.0 | / | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised_cpn_ft-88f5abbb_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised_cpn_ft_20210527.log.json) | [videopose3d_h36m.md](./h36m/videopose3d_h36m.md) | +| [VideoPose3D-semi-supervised-27frm](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-27frm-semi-supv_8xb64-200e_h36m.py) | 57.2 | 42.4 | 54.2 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised-54aef83b_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised_20210527.log.json) | [videopose3d_h36m.md](./h36m/videopose3d_h36m.md) | +| [VideoPose3D-semi-supervised-CPN-27frm](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-27frm-semi-supv-cpn-ft_8xb64-200e_h36m.py) | 67.3 | 50.4 | 63.6 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised_cpn_ft-71be9cde_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised_cpn_ft_20210527.log.json) | [videopose3d_h36m.md](./h36m/videopose3d_h36m.md) | +| [MotionBERT\*](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m.py) | 35.3 | 27.7 | / | [ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_h36m-f554954f_20230531.pth) | / | [motionbert_h36m.md](./h36m/motionbert_h36m.md) | +| [MotionBERT-finetuned\*](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m.py) | 27.5 | 21.6 | / | 
[ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_ft_h36m-d80af323_20230531.pth) | / | [motionbert_h36m.md](./h36m/motionbert_h36m.md) | + +*Models with \* are converted from the official repo. The config files of these models are only for validation. We do not guarantee the training accuracy of these config files and welcome you to contribute your reproduction results.* ## Image-based Single-view 3D Human Body Pose Estimation @@ -46,6 +42,6 @@ For single-person 3D pose estimation from a monocular camera, existing works can #### Human3.6m Dataset -| Arch | MPJPE | P-MPJPE | N-MPJPE | ckpt | log | -| :------------------------------------------------------ | :-------------: | :---: | :-----: | :-----: | :------------------------------------------------------: | :-----------------------------------------------------: | -| [SimpleBaseline3D-tcn](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_simplebaseline3d_8xb64-200e_h36m.py) | 43.4 | 34.3 | /|[ckpt](https://download.openmmlab.com/mmpose/body3d/simple_baseline/simple3Dbaseline_h36m-f0ad73a4_20210419.pth) | [log](https://download.openmmlab.com/mmpose/body3d/simple_baseline/20210415_065056.log.json) | +| Arch | MPJPE | P-MPJPE | N-MPJPE | ckpt | log | Details and Download | +| :---------------------------------------- | :---: | :-----: | :-----: | :---------------------------------------: | :---------------------------------------: | :--------------------------------------------------------: | +| [SimpleBaseline3D-tcn](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_simplebaseline3d_8xb64-200e_h36m.py) | 43.4 | 34.3 | / | [ckpt](https://download.openmmlab.com/mmpose/body3d/simple_baseline/simple3Dbaseline_h36m-f0ad73a4_20210419.pth) | [log](https://download.openmmlab.com/mmpose/body3d/simple_baseline/20210415_065056.log.json) | [simplebaseline3d_h36m.md](./h36m/simplebaseline3d_h36m.md) | diff --git a/configs/body_3d_keypoint/pose_lift/h36m/motionbert_h36m.md 
b/configs/body_3d_keypoint/pose_lift/h36m/motionbert_h36m.md new file mode 100644 index 0000000000..d830d65c18 --- /dev/null +++ b/configs/body_3d_keypoint/pose_lift/h36m/motionbert_h36m.md @@ -0,0 +1,53 @@ + + +
+MotionBERT (2022) + +```bibtex + @misc{Zhu_Ma_Liu_Liu_Wu_Wang_2022, + title={Learning Human Motion Representations: A Unified Perspective}, + author={Zhu, Wentao and Ma, Xiaoxuan and Liu, Zhaoyang and Liu, Libin and Wu, Wayne and Wang, Yizhou}, + year={2022}, + month={Oct}, + language={en-US} + } +``` + +
+ + + +
+Human3.6M (TPAMI'2014) + +```bibtex +@article{h36m_pami, +author = {Ionescu, Catalin and Papava, Dragos and Olaru, Vlad and Sminchisescu, Cristian}, +title = {Human3.6M: Large Scale Datasets and Predictive Methods for 3D Human Sensing in Natural Environments}, +journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence}, +publisher = {IEEE Computer Society}, +volume = {36}, +number = {7}, +pages = {1325-1339}, +month = {jul}, +year = {2014} +} +``` + +
+ +Testing results on Human3.6M dataset with ground truth 2D detections + +| Arch | MPJPE | average MPJPE | P-MPJPE | ckpt | +| :-------------------------------------------------------------------------------------- | :---: | :-----------: | :-----: | :--------------------------------------------------------------------------------------: | +| [MotionBERT\*](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m.py) | 35.3 | 35.3 | 27.7 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_h36m-f554954f_20230531.pth) | +| [MotionBERT-finetuned\*](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m.py) | 27.5 | 27.4 | 21.6 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_ft_h36m-d80af323_20230531.pth) | + +Testing results on Human3.6M dataset from the [official repo](https://github.com/Walter0807/MotionBERT) with ground truth 2D detections + +| Arch | MPJPE | average MPJPE | P-MPJPE | ckpt | +| :-------------------------------------------------------------------------------------- | :---: | :-----------: | :-----: | :--------------------------------------------------------------------------------------: | +| [MotionBERT\*](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m.py) | 40.5 | 39.9 | 34.1 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_h36m-f554954f_20230531.pth) | +| [MotionBERT-finetuned\*](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m.py) | 38.2 | 37.7 | 32.6 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_ft_h36m-d80af323_20230531.pth) | + +*Models with \* are converted from the [official repo](https://github.com/Walter0807/MotionBERT). The config files of these models are only for validation. 
We don't ensure these config files' training accuracy and welcome you to contribute your reproduction results.* diff --git a/configs/body_3d_keypoint/pose_lift/h36m/motionbert_h36m.yml b/configs/body_3d_keypoint/pose_lift/h36m/motionbert_h36m.yml new file mode 100644 index 0000000000..7257fea5a6 --- /dev/null +++ b/configs/body_3d_keypoint/pose_lift/h36m/motionbert_h36m.yml @@ -0,0 +1,34 @@ +Collections: +- Name: MotionBERT + Paper: + Title: "Learning Human Motion Representations: A Unified Perspective" + URL: https://arxiv.org/abs/2210.06551 + README: https://github.com/open-mmlab/mmpose/blob/main/docs/en/papers/algorithms/motionbert.md +Models: +- Config: configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m.py + In Collection: MotionBERT + Metadata: + Architecture: &id001 + - MotionBERT + Training Data: Human3.6M + Name: vid_pl_motionbert_8xb32-120e_h36m + Results: + - Dataset: Human3.6M + Metrics: + MPJPE: 35.3 + P-MPJPE: 27.7 + Task: Body 3D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_h36m-f554954f_20230531.pth +- Config: configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m.py + In Collection: MotionBERT + Metadata: + Architecture: *id001 + Training Data: Human3.6M + Name: vid_pl_motionbert-finetuned_8xb32-120e_h36m + Results: + - Dataset: Human3.6M + Metrics: + MPJPE: 27.5 + P-MPJPE: 21.6 + Task: Body 3D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_ft_h36m-d80af323_20230531.pth diff --git a/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m.py b/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m.py new file mode 100644 index 0000000000..88f6c3897d --- /dev/null +++ b/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m.py @@ -0,0 +1,140 @@ +_base_ = ['../../../_base_/default_runtime.py'] + 
+vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + type='Pose3dLocalVisualizer', vis_backends=vis_backends, name='visualizer') + +# runtime +train_cfg = dict(max_epochs=120, val_interval=10) + +# optimizer +optim_wrapper = dict( + optimizer=dict(type='AdamW', lr=0.0002, weight_decay=0.01)) + +# learning policy +param_scheduler = [ + dict(type='ExponentialLR', gamma=0.99, end=120, by_epoch=True) +] + +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict( + checkpoint=dict( + type='CheckpointHook', + save_best='MPJPE', + rule='less', + max_keep_ckpts=1), + logger=dict(type='LoggerHook', interval=20), +) + +# codec settings +train_codec = dict( + type='MotionBERTLabel', + num_keypoints=17, + concat_vis=True, + rootrel=True, + factor_label=False) +val_codec = dict( + type='MotionBERTLabel', num_keypoints=17, concat_vis=True, rootrel=True) + +# model settings +model = dict( + type='PoseLifter', + backbone=dict( + type='DSTFormer', + in_channels=3, + feat_size=512, + depth=5, + num_heads=8, + mlp_ratio=2, + seq_len=243, + att_fuse=True, + ), + head=dict( + type='MotionRegressionHead', + in_channels=512, + out_channels=3, + embedding_size=512, + loss=dict(type='MPJPEVelocityJointLoss'), + decoder=val_codec, + ), +) + +# base dataset settings +dataset_type = 'Human36mDataset' +data_root = 'data/h36m/' + +# pipelines +train_pipeline = [ + dict( + type='RandomFlipAroundRoot', + keypoints_flip_cfg={}, + target_flip_cfg={}, + flip_image=True), + dict(type='GenerateTarget', encoder=train_codec), + dict( + type='PackPoseInputs', + meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices', + 'factor', 'camera_param')) +] +val_pipeline = [ + dict(type='GenerateTarget', encoder=val_codec), + dict( + type='PackPoseInputs', + meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices', + 'factor', 'camera_param')) +] + +# data loaders +train_dataloader = dict( + batch_size=32, + prefetch_factor=4, + pin_memory=True, + 
num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file='annotation_body3d/fps50/h36m_train.npz', + seq_len=1, + multiple_target=243, + multiple_target_step=81, + camera_param_file='annotation_body3d/cameras.pkl', + data_root=data_root, + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) + +val_dataloader = dict( + batch_size=32, + prefetch_factor=4, + pin_memory=True, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + ann_file='annotation_body3d/fps50/h36m_test.npz', + seq_len=1, + seq_step=1, + multiple_target=243, + camera_param_file='annotation_body3d/cameras.pkl', + data_root=data_root, + data_prefix=dict(img='images/'), + pipeline=val_pipeline, + test_mode=True, + )) +test_dataloader = val_dataloader + +# evaluators +skip_list = [ + 'S9_Greet', 'S9_SittingDown', 'S9_Wait_1', 'S9_Greeting', 'S9_Waiting_1' +] +val_evaluator = [ + dict(type='MPJPE', mode='mpjpe', skip_list=skip_list), + dict(type='MPJPE', mode='p-mpjpe', skip_list=skip_list) +] +test_evaluator = val_evaluator diff --git a/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-1frm-supv-cpn-ft_8xb128-80e_h36m.py b/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-1frm-supv-cpn-ft_8xb128-160e_h36m.py similarity index 98% rename from configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-1frm-supv-cpn-ft_8xb128-80e_h36m.py rename to configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-1frm-supv-cpn-ft_8xb128-160e_h36m.py index 0cbf89142d..c1190fe83e 100644 --- a/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-1frm-supv-cpn-ft_8xb128-80e_h36m.py +++ b/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-1frm-supv-cpn-ft_8xb128-160e_h36m.py @@ -7,7 +7,7 @@ type='Pose3dLocalVisualizer', vis_backends=vis_backends, name='visualizer') # runtime 
-train_cfg = dict(max_epochs=80, val_interval=10) +train_cfg = dict(max_epochs=160, val_interval=10) # optimizer optim_wrapper = dict(optimizer=dict(type='Adam', lr=1e-4)) diff --git a/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-243frm-supv_8xb128-80e_h36m.py b/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-243frm-supv_8xb128-160e_h36m.py similarity index 98% rename from configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-243frm-supv_8xb128-80e_h36m.py rename to configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-243frm-supv_8xb128-160e_h36m.py index 0f311ac5cf..0d241c498f 100644 --- a/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-243frm-supv_8xb128-80e_h36m.py +++ b/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-243frm-supv_8xb128-160e_h36m.py @@ -7,7 +7,7 @@ type='Pose3dLocalVisualizer', vis_backends=vis_backends, name='visualizer') # runtime -train_cfg = dict(max_epochs=80, val_interval=10) +train_cfg = dict(max_epochs=160, val_interval=10) # optimizer optim_wrapper = dict(optimizer=dict(type='Adam', lr=1e-3)) diff --git a/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-27frm-supv_8xb128-80e_h36m.py b/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-27frm-supv_8xb128-120e_h36m.py similarity index 98% rename from configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-27frm-supv_8xb128-80e_h36m.py rename to configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-27frm-supv_8xb128-120e_h36m.py index 2589b493a6..803f907b7b 100644 --- a/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-27frm-supv_8xb128-80e_h36m.py +++ b/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-27frm-supv_8xb128-120e_h36m.py @@ -7,7 +7,7 @@ type='Pose3dLocalVisualizer', vis_backends=vis_backends, name='visualizer') # runtime -train_cfg = dict(max_epochs=80, val_interval=10) +train_cfg = dict(max_epochs=160, val_interval=10) # optimizer 
optim_wrapper = dict(optimizer=dict(type='Adam', lr=1e-3)) diff --git a/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-81frm-supv_8xb128-80e_h36m.py b/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-81frm-supv_8xb128-160e_h36m.py similarity index 98% rename from configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-81frm-supv_8xb128-80e_h36m.py rename to configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-81frm-supv_8xb128-160e_h36m.py index f2c27e423d..4b370fe76e 100644 --- a/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-81frm-supv_8xb128-80e_h36m.py +++ b/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-81frm-supv_8xb128-160e_h36m.py @@ -7,7 +7,7 @@ type='Pose3dLocalVisualizer', vis_backends=vis_backends, name='visualizer') # runtime -train_cfg = dict(max_epochs=80, val_interval=10) +train_cfg = dict(max_epochs=160, val_interval=10) # optimizer optim_wrapper = dict(optimizer=dict(type='Adam', lr=1e-3)) diff --git a/configs/body_3d_keypoint/pose_lift/h36m/videopose3d_h36m.md b/configs/body_3d_keypoint/pose_lift/h36m/videopose3d_h36m.md index f1c75d786a..48502c7b09 100644 --- a/configs/body_3d_keypoint/pose_lift/h36m/videopose3d_h36m.md +++ b/configs/body_3d_keypoint/pose_lift/h36m/videopose3d_h36m.md @@ -41,27 +41,27 @@ Testing results on Human3.6M dataset with ground truth 2D detections, supervised | Arch | Receptive Field | MPJPE | P-MPJPE | ckpt | log | | :--------------------------------------------------------- | :-------------: | :---: | :-----: | :--------------------------------------------------------: | :-------------------------------------------------------: | -| [VideoPose3D](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-27frm-supv_8xb128-80e_h36m.py) | 27 | 40.1 | 30.1 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_supervised-fe8fbba9_20210527.pth) | 
[log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_supervised_20210527.log.json) | -| [VideoPose3D](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-81frm-supv_8xb128-80e_h36m.py) | 81 | 39.1 | 29.3 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_81frames_fullconv_supervised-1f2d1104_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_81frames_fullconv_supervised_20210527.log.json) | -| [VideoPose3D](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-243frm-supv_8xb128-80e_h36m.py) | 243 | | | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised-880bea25_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised_20210527.log.json) | +| [VideoPose3D-supervised-27frm](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-27frm-supv_8xb128-80e_h36m.py) | 27 | 40.1 | 30.1 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_supervised-fe8fbba9_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_supervised_20210527.log.json) | +| [VideoPose3D-supervised-81frm](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-81frm-supv_8xb128-80e_h36m.py) | 81 | 39.1 | 29.3 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_81frames_fullconv_supervised-1f2d1104_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_81frames_fullconv_supervised_20210527.log.json) | +| [VideoPose3D-supervised-243frm](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-243frm-supv_8xb128-80e_h36m.py) | 243 | 37.6 | 28.3 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised-880bea25_20210527.pth) | 
[log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised_20210527.log.json) | Testing results on Human3.6M dataset with CPN 2D detections1, supervised training | Arch | Receptive Field | MPJPE | P-MPJPE | ckpt | log | | :--------------------------------------------------------- | :-------------: | :---: | :-----: | :--------------------------------------------------------: | :-------------------------------------------------------: | -| [VideoPose3D](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-1frm-supv-cpn-ft_8xb128-80e_h36m.py) | 1 | 53.0 | 41.3 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_1frame_fullconv_supervised_cpn_ft-5c3afaed_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_1frame_fullconv_supervised_cpn_ft_20210527.log.json) | -| [VideoPose3D](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-243frm-supv-cpn-ft_8xb128-200e_h36m.py) | 243 | | | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised_cpn_ft-88f5abbb_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised_cpn_ft_20210527.log.json) | +| [VideoPose3D-supervised-CPN-1frm](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-1frm-supv-cpn-ft_8xb128-80e_h36m.py) | 1 | 53.0 | 41.3 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_1frame_fullconv_supervised_cpn_ft-5c3afaed_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_1frame_fullconv_supervised_cpn_ft_20210527.log.json) | +| [VideoPose3D-supervised-CPN-243frm](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-243frm-supv-cpn-ft_8xb128-200e_h36m.py) | 243 | 47.9 | 38.0 | 
[ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised_cpn_ft-88f5abbb_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised_cpn_ft_20210527.log.json) | Testing results on Human3.6M dataset with ground truth 2D detections, semi-supervised training | Training Data | Arch | Receptive Field | MPJPE | P-MPJPE | N-MPJPE | ckpt | log | | :------------ | :-------------------------------------------------: | :-------------: | :---: | :-----: | :-----: | :-------------------------------------------------: | :-------------------------------------------------: | -| 10% S1 | [VideoPose3D](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-27frm-semi-supv_8xb64-200e_h36m.py) | 27 | 57.2 | 42.4 | 54.2 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised-54aef83b_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised_20210527.log.json) | +| 10% S1 | [VideoPose3D-semi-supervised-27frm](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-27frm-semi-supv_8xb64-200e_h36m.py) | 27 | 57.2 | 42.4 | 54.2 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised-54aef83b_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised_20210527.log.json) | Testing results on Human3.6M dataset with CPN 2D detections1, semi-supervised training -| Training Data | Arch | Receptive Field | MPJPE | P-MPJPE | N-MPJPE | ckpt | log | -| :------------ | :----------------------------: | :-------------: | :---: | :-----: | :-----: | :------------------------------------------------------------: | :-----------------------------------------------------------: | -| 10% S1 | [VideoPose3D](/configs/xxx.py) | 27 | 67.3 | 
50.4 | 63.6 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised_cpn_ft-71be9cde_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised_cpn_ft_20210527.log.json) | +| Training Data | Arch | Receptive Field | MPJPE | P-MPJPE | N-MPJPE | ckpt | log | +| :------------ | :-------------------------------------------------: | :-------------: | :---: | :-----: | :-----: | :-------------------------------------------------: | :-------------------------------------------------: | +| 10% S1 | [VideoPose3D-semi-supervised-CPN-27frm](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-27frm-semi-supv-cpn-ft_8xb64-200e_h36m.py) | 27 | 67.3 | 50.4 | 63.6 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised_cpn_ft-71be9cde_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised_cpn_ft_20210527.log.json) | 1 CPN 2D detections are provided by [official repo](https://github.com/facebookresearch/VideoPose3D/blob/master/DATASETS.md). The reformatted version used in this repository can be downloaded from [train_detection](https://download.openmmlab.com/mmpose/body3d/videopose/cpn_ft_h36m_dbb_train.npy) and [test_detection](https://download.openmmlab.com/mmpose/body3d/videopose/cpn_ft_h36m_dbb_test.npy). 
diff --git a/mmpose/apis/inference_3d.py b/mmpose/apis/inference_3d.py index d5bb753945..d4b9623b86 100644 --- a/mmpose/apis/inference_3d.py +++ b/mmpose/apis/inference_3d.py @@ -316,8 +316,10 @@ def inference_pose_lifter_model(model, T, K, ), dtype=np.float32) - data_info['lifting_target'] = np.zeros((K, 3), dtype=np.float32) - data_info['lifting_target_visible'] = np.ones((K, 1), dtype=np.float32) + data_info['lifting_target'] = np.zeros((1, K, 3), dtype=np.float32) + data_info['factor'] = np.zeros((T, ), dtype=np.float32) + data_info['lifting_target_visible'] = np.ones((1, K, 1), + dtype=np.float32) if image_size is not None: assert len(image_size) == 2 diff --git a/mmpose/apis/inferencers/pose3d_inferencer.py b/mmpose/apis/inferencers/pose3d_inferencer.py index 0fe66ac72b..819273af66 100644 --- a/mmpose/apis/inferencers/pose3d_inferencer.py +++ b/mmpose/apis/inferencers/pose3d_inferencer.py @@ -271,8 +271,8 @@ def preprocess_single(self, K, ), dtype=np.float32) - data_info['lifting_target'] = np.zeros((K, 3), dtype=np.float32) - data_info['lifting_target_visible'] = np.ones((K, 1), + data_info['lifting_target'] = np.zeros((1, K, 3), dtype=np.float32) + data_info['lifting_target_visible'] = np.ones((1, K, 1), dtype=np.float32) data_info['camera_param'] = dict(w=width, h=height) diff --git a/mmpose/codecs/__init__.py b/mmpose/codecs/__init__.py index cdbd8feb0c..1a48b7f851 100644 --- a/mmpose/codecs/__init__.py +++ b/mmpose/codecs/__init__.py @@ -4,6 +4,7 @@ from .image_pose_lifting import ImagePoseLifting from .integral_regression_label import IntegralRegressionLabel from .megvii_heatmap import MegviiHeatmap +from .motionbert_label import MotionBERTLabel from .msra_heatmap import MSRAHeatmap from .regression_label import RegressionLabel from .simcc_label import SimCCLabel @@ -14,5 +15,6 @@ __all__ = [ 'MSRAHeatmap', 'MegviiHeatmap', 'UDPHeatmap', 'RegressionLabel', 'SimCCLabel', 'IntegralRegressionLabel', 'AssociativeEmbedding', 'SPR', - 'DecoupledHeatmap', 
'VideoPoseLifting', 'ImagePoseLifting' + 'DecoupledHeatmap', 'VideoPoseLifting', 'ImagePoseLifting', + 'MotionBERTLabel' ] diff --git a/mmpose/codecs/image_pose_lifting.py b/mmpose/codecs/image_pose_lifting.py index 64bf925997..aae6c3b5be 100644 --- a/mmpose/codecs/image_pose_lifting.py +++ b/mmpose/codecs/image_pose_lifting.py @@ -25,6 +25,10 @@ class ImagePoseLifting(BaseKeypointCodec): Default: ``False``. save_index (bool): If true, store the root position separated from the original pose. Default: ``False``. + reshape_keypoints (bool): If true, reshape the keypoints into shape + (-1, N). Default: ``True``. + concat_vis (bool): If true, concat the visibility item of keypoints. + Default: ``False``. keypoints_mean (np.ndarray, optional): Mean values of keypoints coordinates in shape (K, D). keypoints_std (np.ndarray, optional): Std values of keypoints @@ -42,6 +46,8 @@ def __init__(self, root_index: int, remove_root: bool = False, save_index: bool = False, + reshape_keypoints: bool = True, + concat_vis: bool = False, keypoints_mean: Optional[np.ndarray] = None, keypoints_std: Optional[np.ndarray] = None, target_mean: Optional[np.ndarray] = None, @@ -52,9 +58,23 @@ def __init__(self, self.root_index = root_index self.remove_root = remove_root self.save_index = save_index - if keypoints_mean is not None and keypoints_std is not None: + self.reshape_keypoints = reshape_keypoints + self.concat_vis = concat_vis + if keypoints_mean is not None: + keypoints_mean = np.array( + keypoints_mean, + dtype=np.float32).reshape(1, num_keypoints, -1) + keypoints_std = np.array( + keypoints_std, dtype=np.float32).reshape(1, num_keypoints, -1) + assert keypoints_std is not None assert keypoints_mean.shape == keypoints_std.shape - if target_mean is not None and target_std is not None: + if target_mean is not None: + target_dim = num_keypoints - 1 if remove_root else num_keypoints + target_mean = np.array( + target_mean, dtype=np.float32).reshape(1, target_dim, -1) + target_std = 
np.array( + target_std, dtype=np.float32).reshape(1, target_dim, -1) + assert target_std is not None assert target_mean.shape == target_std.shape self.keypoints_mean = keypoints_mean self.keypoints_std = keypoints_std @@ -73,15 +93,17 @@ def encode(self, keypoints_visible (np.ndarray, optional): Keypoint visibilities in shape (N, K). lifting_target (np.ndarray, optional): 3d target coordinate in - shape (K, C). + shape (T, K, C). lifting_target_visible (np.ndarray, optional): Target coordinate in - shape (K, ). + shape (T, K, ). Returns: encoded (dict): Contains the following items: - keypoint_labels (np.ndarray): The processed keypoints in - shape (K * D, N) where D is 2 for 2d coordinates. + shape like (N, K, D) or (K * D, N). + - keypoint_labels_visible (np.ndarray): The processed + keypoints' weights in shape (N, K, ) or (N-1, K, ). - lifting_target_label: The processed target coordinate in shape (K, C) or (K-1, C). - lifting_target_weights (np.ndarray): The target weights in @@ -93,18 +115,20 @@ def encode(self, In addition, there are some optional items it may contain: + - target_root (np.ndarray): The root coordinate of target in + shape (C, ). Exists if ``zero_center`` is ``True``. - target_root_removed (bool): Indicate whether the root of - pose lifting target is removed. Added if ``self.remove_root`` - is ``True``. + pose-lifting target is removed. Exists if + ``remove_root`` is ``True``. - target_root_index (int): An integer indicating the index of - root. Added if ``self.remove_root`` and ``self.save_index`` + root. Exists if ``remove_root`` and ``save_index`` are ``True``.
""" if keypoints_visible is None: keypoints_visible = np.ones(keypoints.shape[:2], dtype=np.float32) if lifting_target is None: - lifting_target = keypoints[0] + lifting_target = [keypoints[0]] # set initial value for `lifting_target_weights` # and `trajectory_weights` @@ -126,13 +150,16 @@ def encode(self, f'Got invalid joint shape {lifting_target.shape}' root = lifting_target[..., self.root_index, :] - lifting_target_label = lifting_target - root + lifting_target_label = lifting_target - lifting_target[ + ..., self.root_index:self.root_index + 1, :] if self.remove_root: lifting_target_label = np.delete( lifting_target_label, self.root_index, axis=-2) - assert lifting_target_weights.ndim in {1, 2} - axis_to_remove = -2 if lifting_target_weights.ndim == 2 else -1 + lifting_target_visible = np.delete( + lifting_target_visible, self.root_index, axis=-2) + assert lifting_target_weights.ndim in {2, 3} + axis_to_remove = -2 if lifting_target_weights.ndim == 3 else -1 lifting_target_weights = np.delete( lifting_target_weights, self.root_index, axis=axis_to_remove) # Add a flag to avoid latter transforms that rely on the root @@ -145,15 +172,17 @@ def encode(self, # Normalize the 2D keypoint coordinate with mean and std keypoint_labels = keypoints.copy() - if self.keypoints_mean is not None and self.keypoints_std is not None: - keypoints_shape = keypoints.shape - assert self.keypoints_mean.shape == keypoints_shape[1:] + if self.keypoints_mean is not None: + assert self.keypoints_mean.shape[1:] == keypoints.shape[1:] + encoded['keypoints_mean'] = self.keypoints_mean.copy() + encoded['keypoints_std'] = self.keypoints_std.copy() keypoint_labels = (keypoint_labels - self.keypoints_mean) / self.keypoints_std - if self.target_mean is not None and self.target_std is not None: - target_shape = lifting_target_label.shape - assert self.target_mean.shape == target_shape + if self.target_mean is not None: + assert self.target_mean.shape == lifting_target_label.shape + 
encoded['target_mean'] = self.target_mean.copy() + encoded['target_std'] = self.target_std.copy() lifting_target_label = (lifting_target_label - self.target_mean) / self.target_std @@ -163,7 +192,19 @@ def encode(self, if keypoint_labels.ndim == 2: keypoint_labels = keypoint_labels[None, ...] + if self.concat_vis: + keypoints_visible_ = keypoints_visible + if keypoints_visible.ndim == 2: + keypoints_visible_ = keypoints_visible[..., None] + keypoint_labels = np.concatenate( + (keypoint_labels, keypoints_visible_), axis=2) + + if self.reshape_keypoints: + N = keypoint_labels.shape[0] + keypoint_labels = keypoint_labels.transpose(1, 2, 0).reshape(-1, N) + encoded['keypoint_labels'] = keypoint_labels + encoded['keypoint_labels_visible'] = keypoints_visible encoded['lifting_target_label'] = lifting_target_label encoded['lifting_target_weights'] = lifting_target_weights encoded['trajectory_weights'] = trajectory_weights @@ -190,11 +231,11 @@ def decode(self, keypoints = encoded.copy() if self.target_mean is not None and self.target_std is not None: - assert self.target_mean.shape == keypoints.shape[1:] + assert self.target_mean.shape == keypoints.shape keypoints = keypoints * self.target_std + self.target_mean - if target_root.size > 0: - keypoints = keypoints + np.expand_dims(target_root, axis=0) + if target_root is not None and target_root.size > 0: + keypoints = keypoints + target_root if self.remove_root: keypoints = np.insert( keypoints, self.root_index, target_root, axis=1) diff --git a/mmpose/codecs/motionbert_label.py b/mmpose/codecs/motionbert_label.py new file mode 100644 index 0000000000..d0c8cd0d40 --- /dev/null +++ b/mmpose/codecs/motionbert_label.py @@ -0,0 +1,218 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+ +from copy import deepcopy +from typing import Optional, Tuple + +import numpy as np + +from mmpose.registry import KEYPOINT_CODECS +from .base import BaseKeypointCodec +from .utils import camera_to_image_coord + + +@KEYPOINT_CODECS.register_module() +class MotionBERTLabel(BaseKeypointCodec): + r"""Generate keypoint and label coordinates for `MotionBERT`_ by Zhu et al + (2022). + + Note: + + - instance number: N + - keypoint number: K + - keypoint dimension: D + - pose-lifting target dimension: C + + Args: + num_keypoints (int): The number of keypoints in the dataset. + root_index (int): Root keypoint index in the pose. Default: 0. + remove_root (bool): If true, remove the root keypoint from the pose. + Default: ``False``. + save_index (bool): If true, store the root position separated from the + original pose, only takes effect if ``remove_root`` is ``True``. + Default: ``False``. + concat_vis (bool): If true, concat the visibility item of keypoints. + Default: ``False``. + rootrel (bool): If true, the root keypoint will be set to the + coordinate origin. Default: ``False``. + factor_label (bool): If true, the label will be multiplied by a factor. + Default: ``True``.
+ """ + + auxiliary_encode_keys = { + 'lifting_target', 'lifting_target_visible', 'camera_param', 'factor' + } + + def __init__(self, + num_keypoints: int, + root_index: int = 0, + remove_root: bool = False, + save_index: bool = False, + concat_vis: bool = False, + rootrel: bool = False, + factor_label: bool = True): + super().__init__() + + self.num_keypoints = num_keypoints + self.root_index = root_index + self.remove_root = remove_root + self.save_index = save_index + self.concat_vis = concat_vis + self.rootrel = rootrel + self.factor_label = factor_label + + def encode(self, + keypoints: np.ndarray, + keypoints_visible: Optional[np.ndarray] = None, + lifting_target: Optional[np.ndarray] = None, + lifting_target_visible: Optional[np.ndarray] = None, + camera_param: Optional[dict] = None, + factor: Optional[np.ndarray] = None) -> dict: + """Encoding keypoints from input image space to normalized space. + + Args: + keypoints (np.ndarray): Keypoint coordinates in shape (B, T, K, D). + keypoints_visible (np.ndarray, optional): Keypoint visibilities in + shape (B, T, K). + lifting_target (np.ndarray, optional): 3d target coordinate in + shape (T, K, C). + lifting_target_visible (np.ndarray, optional): Target coordinate in + shape (T, K, ). + camera_param (dict, optional): The camera parameter dictionary. + factor (np.ndarray, optional): The factor mapping camera and image + coordinate in shape (T, ). + + Returns: + encoded (dict): Contains the following items: + + - keypoint_labels (np.ndarray): The processed keypoints in + shape like (N, K, D). + - keypoint_labels_visible (np.ndarray): The processed + keypoints' weights in shape (N, K, ) or (N, K-1, ). + - lifting_target_label: The processed target coordinate in + shape (K, C) or (K-1, C). + - lifting_target_weights (np.ndarray): The target weights in + shape (K, ) or (K-1, ). + - trajectory_weights (np.ndarray): The trajectory weights in + shape (K, ). 
+ - factor (np.ndarray): The factor mapping camera and image + coordinate in shape (T, 1). + """ + if keypoints_visible is None: + keypoints_visible = np.ones(keypoints.shape[:2], dtype=np.float32) + + if lifting_target is None: + lifting_target = [keypoints[..., 0, :, :]] + + # set initial value for `lifting_target_weights` + # and `trajectory_weights` + if lifting_target_visible is None: + lifting_target_visible = np.ones( + lifting_target.shape[:-1], dtype=np.float32) + lifting_target_weights = lifting_target_visible + trajectory_weights = (1 / lifting_target[:, 2]) + else: + valid = lifting_target_visible > 0.5 + lifting_target_weights = np.where(valid, 1., 0.).astype(np.float32) + trajectory_weights = lifting_target_weights + + if camera_param is None: + camera_param = dict() + + encoded = dict() + + lifting_target_label = lifting_target.copy() + keypoint_labels = keypoints.copy() + + assert keypoint_labels.ndim in {2, 3} + if keypoint_labels.ndim == 2: + keypoint_labels = keypoint_labels[None, ...] 
+ + # Normalize the 2D keypoint coordinate with image width and height + _camera_param = deepcopy(camera_param) + assert 'w' in _camera_param and 'h' in _camera_param + w, h = _camera_param['w'], _camera_param['h'] + keypoint_labels[ + ..., :2] = keypoint_labels[..., :2] / w * 2 - [1, h / w] + + # convert target to image coordinate + T = keypoint_labels.shape[0] + factor_ = np.array([4] * T, dtype=np.float32).reshape(T, ) + if 'f' in _camera_param and 'c' in _camera_param: + lifting_target_label, factor_ = camera_to_image_coord( + self.root_index, lifting_target_label, _camera_param) + lifting_target_label[..., :, :] = lifting_target_label[ + ..., :, :] - lifting_target_label[..., + self.root_index:self.root_index + + 1, :] + if factor is None or factor[0] == 0: + factor = factor_ + if factor.ndim == 1: + factor = factor[:, None] + if self.factor_label: + lifting_target_label *= factor[..., None] + + if self.concat_vis: + keypoints_visible_ = keypoints_visible + if keypoints_visible.ndim == 2: + keypoints_visible_ = keypoints_visible[..., None] + keypoint_labels = np.concatenate( + (keypoint_labels, keypoints_visible_), axis=2) + + encoded['keypoint_labels'] = keypoint_labels + encoded['keypoint_labels_visible'] = keypoints_visible + encoded['lifting_target_label'] = lifting_target_label + encoded['lifting_target_weights'] = lifting_target_weights + encoded['lifting_target'] = lifting_target_label + encoded['lifting_target_visible'] = lifting_target_visible + encoded['trajectory_weights'] = trajectory_weights + encoded['factor'] = factor + + return encoded + + def decode( + self, + encoded: np.ndarray, + w: Optional[np.ndarray] = None, + h: Optional[np.ndarray] = None, + factor: Optional[np.ndarray] = None, + ) -> Tuple[np.ndarray, np.ndarray]: + """Decode keypoint coordinates from normalized space to input image + space. + + Args: + encoded (np.ndarray): Coordinates in shape (N, K, C). + w (np.ndarray, optional): The image widths in shape (N, ). 
+ Default: ``None``. + h (np.ndarray, optional): The image heights in shape (N, ). + Default: ``None``. + factor (np.ndarray, optional): The factor for projection in shape + (N, ). Default: ``None``. + + Returns: + keypoints (np.ndarray): Decoded coordinates in shape (N, K, C). + scores (np.ndarray): The keypoint scores in shape (N, K). + """ + keypoints = encoded.copy() + scores = np.ones(keypoints.shape[:-1], dtype=np.float32) + + if self.rootrel: + keypoints[..., 0, :] = 0 + + if w is not None and w.size > 0: + assert w.shape == h.shape + assert w.shape[0] == keypoints.shape[0] + assert w.ndim in {1, 2} + if w.ndim == 1: + w = w[:, None] + h = h[:, None] + trans = np.append( + np.ones((w.shape[0], 1)), h / w, axis=1)[:, None, :] + keypoints[..., :2] = (keypoints[..., :2] + trans) * w[:, None] / 2 + keypoints[..., 2:] = keypoints[..., 2:] * w[:, None] / 2 + if factor is not None and factor.size > 0: + assert factor.shape[0] == keypoints.shape[0] + keypoints *= factor[..., None] + keypoints[..., :, :] = keypoints[..., :, :] - keypoints[ + ..., self.root_index:self.root_index + 1, :] + keypoints /= 1000. + return keypoints, scores diff --git a/mmpose/codecs/utils/__init__.py b/mmpose/codecs/utils/__init__.py index eaa093f12b..38bbae5c39 100644 --- a/mmpose/codecs/utils/__init__.py +++ b/mmpose/codecs/utils/__init__.py @@ -1,4 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. 
+from .camera_image_projection import camera_to_image_coord, camera_to_pixel from .gaussian_heatmap import (generate_gaussian_heatmaps, generate_udp_gaussian_heatmaps, generate_unbiased_gaussian_heatmaps) @@ -19,5 +20,6 @@ 'batch_heatmap_nms', 'refine_keypoints', 'refine_keypoints_dark', 'refine_keypoints_dark_udp', 'generate_displacement_heatmap', 'refine_simcc_dark', 'gaussian_blur1d', 'get_diagonal_lengths', - 'get_instance_root', 'get_instance_bbox', 'get_simcc_normalized' + 'get_instance_root', 'get_instance_bbox', 'get_simcc_normalized', + 'camera_to_image_coord', 'camera_to_pixel' ] diff --git a/mmpose/codecs/utils/camera_image_projection.py b/mmpose/codecs/utils/camera_image_projection.py new file mode 100644 index 0000000000..5ed4d14109 --- /dev/null +++ b/mmpose/codecs/utils/camera_image_projection.py @@ -0,0 +1,69 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, Tuple + +import numpy as np + + +def camera_to_image_coord(root_index: int, kpts_3d_cam: np.ndarray, + camera_param: Dict) -> Tuple[np.ndarray, np.ndarray]: + """Project keypoints from camera space to image space and calculate factor. + + Args: + root_index (int): Index for root keypoint. + kpts_3d_cam (np.ndarray): Keypoint coordinates in camera space in + shape (N, K, D). + camera_param (dict): Parameters for the camera. + + Returns: + tuple: + - kpts_3d_image (np.ndarray): Keypoint coordinates in image space in + shape (N, K, D). + - factor (np.ndarray): The scaling factor that maps keypoints from + image space to camera space in shape (N, ). + """ + + root = kpts_3d_cam[..., root_index, :] + tl_kpt = root.copy() + tl_kpt[..., :2] -= 1.0 + br_kpt = root.copy() + br_kpt[..., :2] += 1.0 + tl_kpt = np.reshape(tl_kpt, (-1, 3)) + br_kpt = np.reshape(br_kpt, (-1, 3)) + fx, fy = camera_param['f'] / 1000. + cx, cy = camera_param['c'] / 1000. 
+ + tl2d = camera_to_pixel(tl_kpt, fx, fy, cx, cy) + br2d = camera_to_pixel(br_kpt, fx, fy, cx, cy) + + rectangle_3d_size = 2.0 + kpts_3d_image = np.zeros_like(kpts_3d_cam) + kpts_3d_image[..., :2] = camera_to_pixel(kpts_3d_cam.copy(), fx, fy, cx, + cy) + ratio = (br2d[..., 0] - tl2d[..., 0] + 0.001) / rectangle_3d_size + factor = rectangle_3d_size / (br2d[..., 0] - tl2d[..., 0] + 0.001) + kpts_3d_depth = ratio[:, None] * ( + kpts_3d_cam[..., 2] - kpts_3d_cam[..., root_index:root_index + 1, 2]) + kpts_3d_image[..., 2] = kpts_3d_depth + return kpts_3d_image, factor + + +def camera_to_pixel(kpts_3d: np.ndarray, fx: float, fy: float, cx: float, + cy: float) -> np.ndarray: + """Project keypoints from camera space to image space. + + Args: + kpts_3d (np.ndarray): Keypoint coordinates in camera space. + fx (float): x-coordinate of camera's focal length. + fy (float): y-coordinate of camera's focal length. + cx (float): x-coordinate of image center. + cy (float): y-coordinate of image center. + + Returns: + pose_2d (np.ndarray): Projected keypoint coordinates in image space. + """ + pose_2d = kpts_3d[..., :2] / kpts_3d[..., 2:3] + pose_2d[..., 0] *= fx + pose_2d[..., 1] *= fy + pose_2d[..., 0] += cx + pose_2d[..., 1] += cy + return pose_2d diff --git a/mmpose/codecs/video_pose_lifting.py b/mmpose/codecs/video_pose_lifting.py index 56cf35fa2d..9e409a663c 100644 --- a/mmpose/codecs/video_pose_lifting.py +++ b/mmpose/codecs/video_pose_lifting.py @@ -30,6 +30,10 @@ class VideoPoseLifting(BaseKeypointCodec): save_index (bool): If true, store the root position separated from the original pose, only takes effect if ``remove_root`` is ``True``. Default: ``False``. + reshape_keypoints (bool): If true, reshape the keypoints into shape + (-1, N). Default: ``True``. + concat_vis (bool): If true, concat the visibility item of keypoints. + Default: ``False``. normalize_camera (bool): Whether to normalize camera intrinsics. Default: ``False``. 
""" @@ -44,6 +48,8 @@ def __init__(self, root_index: int = 0, remove_root: bool = False, save_index: bool = False, + reshape_keypoints: bool = True, + concat_vis: bool = False, normalize_camera: bool = False): super().__init__() @@ -52,6 +58,8 @@ def __init__(self, self.root_index = root_index self.remove_root = remove_root self.save_index = save_index + self.reshape_keypoints = reshape_keypoints + self.concat_vis = concat_vis self.normalize_camera = normalize_camera def encode(self, @@ -67,16 +75,18 @@ def encode(self, keypoints_visible (np.ndarray, optional): Keypoint visibilities in shape (N, K). lifting_target (np.ndarray, optional): 3d target coordinate in - shape (K, C). + shape (T, K, C). lifting_target_visible (np.ndarray, optional): Target coordinate in - shape (K, ). + shape (T, K, ). camera_param (dict, optional): The camera parameter dictionary. Returns: encoded (dict): Contains the following items: - keypoint_labels (np.ndarray): The processed keypoints in - shape (K * D, N) where D is 2 for 2d coordinates. + shape like (N, K, D) or (K * D, N). + - keypoint_labels_visible (np.ndarray): The processed + keypoints' weights in shape (N, K, ) or (N-1, K, ). - lifting_target_label: The processed target coordinate in shape (K, C) or (K-1, C). - lifting_target_weights (np.ndarray): The target weights in @@ -87,21 +97,21 @@ def encode(self, In addition, there are some optional items it may contain: - target_root (np.ndarray): The root coordinate of target in - shape (C, ). Exists if ``self.zero_center`` is ``True``. + shape (C, ). Exists if ``zero_center`` is ``True``. - target_root_removed (bool): Indicate whether the root of pose-lifitng target is removed. Exists if - ``self.remove_root`` is ``True``. + ``remove_root`` is ``True``. - target_root_index (int): An integer indicating the index of - root. Exists if ``self.remove_root`` and ``self.save_index`` + root. Exists if ``remove_root`` and ``save_index`` are ``True``. 
- camera_param (dict): The updated camera parameter dictionary. - Exists if ``self.normalize_camera`` is ``True``. + Exists if ``normalize_camera`` is ``True``. """ if keypoints_visible is None: keypoints_visible = np.ones(keypoints.shape[:2], dtype=np.float32) if lifting_target is None: - lifting_target = keypoints[0] + lifting_target = [keypoints[0]] # set initial value for `lifting_target_weights` # and `trajectory_weights` @@ -128,14 +138,17 @@ def encode(self, f'Got invalid joint shape {lifting_target.shape}' root = lifting_target[..., self.root_index, :] - lifting_target_label = lifting_target_label - root + lifting_target_label -= lifting_target_label[ + ..., self.root_index:self.root_index + 1, :] encoded['target_root'] = root if self.remove_root: lifting_target_label = np.delete( lifting_target_label, self.root_index, axis=-2) - assert lifting_target_weights.ndim in {1, 2} - axis_to_remove = -2 if lifting_target_weights.ndim == 2 else -1 + lifting_target_visible = np.delete( + lifting_target_visible, self.root_index, axis=-2) + assert lifting_target_weights.ndim in {2, 3} + axis_to_remove = -2 if lifting_target_weights.ndim == 3 else -1 lifting_target_weights = np.delete( lifting_target_weights, self.root_index, @@ -167,7 +180,19 @@ def encode(self, _camera_param['c'] = (_camera_param['c'] - center[:, None]) / scale encoded['camera_param'] = _camera_param + if self.concat_vis: + keypoints_visible_ = keypoints_visible + if keypoints_visible.ndim == 2: + keypoints_visible_ = keypoints_visible[..., None] + keypoint_labels = np.concatenate( + (keypoint_labels, keypoints_visible_), axis=2) + + if self.reshape_keypoints: + N = keypoint_labels.shape[0] + keypoint_labels = keypoint_labels.transpose(1, 2, 0).reshape(-1, N) + encoded['keypoint_labels'] = keypoint_labels + encoded['keypoints_visible'] = keypoints_visible encoded['lifting_target_label'] = lifting_target_label encoded['lifting_target_weights'] = lifting_target_weights encoded['trajectory_weights'] = 
trajectory_weights @@ -192,8 +217,8 @@ def decode(self, """ keypoints = encoded.copy() - if target_root.size > 0: - keypoints = keypoints + np.expand_dims(target_root, axis=0) + if target_root is not None and target_root.size > 0: + keypoints = keypoints + target_root if self.remove_root: keypoints = np.insert( keypoints, self.root_index, target_root, axis=1) diff --git a/mmpose/datasets/datasets/base/base_mocap_dataset.py b/mmpose/datasets/datasets/base/base_mocap_dataset.py index d671a6ae94..e08ba6ea45 100644 --- a/mmpose/datasets/datasets/base/base_mocap_dataset.py +++ b/mmpose/datasets/datasets/base/base_mocap_dataset.py @@ -1,4 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. +import itertools import os.path as osp from copy import deepcopy from itertools import filterfalse, groupby @@ -21,6 +22,8 @@ class BaseMocapDataset(BaseDataset): Args: ann_file (str): Annotation file path. Default: ''. seq_len (int): Number of frames in a sequence. Default: 1. + multiple_target (int): If larger than 0, merge every + ``multiple_target`` sequence together. Default: 0. causal (bool): If set to ``True``, the rightmost input frame will be the target frame. Otherwise, the middle input frame will be the target frame. Default: ``True``. @@ -63,6 +66,7 @@ class BaseMocapDataset(BaseDataset): def __init__(self, ann_file: str = '', seq_len: int = 1, + multiple_target: int = 0, causal: bool = True, subset_frac: float = 1.0, camera_param_file: Optional[str] = None, @@ -102,6 +106,10 @@ def __init__(self, self.seq_len = seq_len self.causal = causal + self.multiple_target = multiple_target + if self.multiple_target: + assert (self.seq_len == 1) + assert 0 < subset_frac <= 1, ( f'Unsupported `subset_frac` {subset_frac}. 
Supported range ' 'is (0, 1].') @@ -241,6 +249,17 @@ def get_sequence_indices(self) -> List[List[int]]: sequence_indices = [[idx] for idx in range(num_imgs)] else: raise NotImplementedError('Multi-frame data sample unsupported!') + + if self.multiple_target > 0: + sequence_indices_merged = [] + for i in range(0, len(sequence_indices), self.multiple_target): + if i + self.multiple_target > len(sequence_indices): + break + sequence_indices_merged.append( + list( + itertools.chain.from_iterable( + sequence_indices[i:i + self.multiple_target]))) + sequence_indices = sequence_indices_merged return sequence_indices def _load_annotations(self) -> Tuple[List[dict], List[dict]]: @@ -274,7 +293,9 @@ def _load_annotations(self) -> Tuple[List[dict], List[dict]]: image_list = [] for idx, frame_ids in enumerate(self.sequence_indices): - assert len(frame_ids) == self.seq_len + assert len(frame_ids) == (self.multiple_target + if self.multiple_target else + self.seq_len), f'{len(frame_ids)}' _img_names = img_names[frame_ids] @@ -286,7 +307,9 @@ def _load_annotations(self) -> Tuple[List[dict], List[dict]]: keypoints_3d = _keypoints_3d[..., :3] keypoints_3d_visible = _keypoints_3d[..., 3] - target_idx = -1 if self.causal else int(self.seq_len) // 2 + target_idx = [-1] if self.causal else [int(self.seq_len) // 2] + if self.multiple_target: + target_idx = list(range(self.multiple_target)) instance_info = { 'num_keypoints': num_keypoints, diff --git a/mmpose/datasets/datasets/body3d/h36m_dataset.py b/mmpose/datasets/datasets/body3d/h36m_dataset.py index 60094aa254..b7a4f71d65 100644 --- a/mmpose/datasets/datasets/body3d/h36m_dataset.py +++ b/mmpose/datasets/datasets/body3d/h36m_dataset.py @@ -45,6 +45,10 @@ class Human36mDataset(BaseMocapDataset): seq_len (int): Number of frames in a sequence. Default: 1. seq_step (int): The interval for extracting frames from the video. Default: 1. + multiple_target (int): If larger than 0, merge every + ``multiple_target`` sequence together. 
Default: 0. + multiple_target_step (int): The interval for merging sequence. Only + valid when ``multiple_target`` is larger than 0. Default: 0. pad_video_seq (bool): Whether to pad the video so that poses will be predicted for every frame in the video. Default: ``False``. causal (bool): If set to ``True``, the rightmost input frame will be @@ -65,6 +69,9 @@ class Human36mDataset(BaseMocapDataset): If set, 2d keypoint loaded from this file will be used instead of ground-truth keypoints. This setting is only when ``keypoint_2d_src`` is ``'detection'``. Default: ``None``. + factor_file (str, optional): The projection factors' file. If set, + factor loaded from this file will be used instead of calculated + factors. Default: ``None``. camera_param_file (str): Cameras' parameters file. Default: ``None``. data_mode (str): Specifies the mode of data samples: ``'topdown'`` or ``'bottomup'``. In ``'topdown'`` mode, each data sample contains @@ -104,11 +111,14 @@ def __init__(self, ann_file: str = '', seq_len: int = 1, seq_step: int = 1, + multiple_target: int = 0, + multiple_target_step: int = 0, pad_video_seq: bool = False, causal: bool = True, subset_frac: float = 1.0, keypoint_2d_src: str = 'gt', keypoint_2d_det_file: Optional[str] = None, + factor_file: Optional[str] = None, camera_param_file: Optional[str] = None, data_mode: str = 'topdown', metainfo: Optional[dict] = None, @@ -138,9 +148,20 @@ def __init__(self, self.seq_step = seq_step self.pad_video_seq = pad_video_seq + if factor_file: + if not is_abs(factor_file): + factor_file = osp.join(data_root, factor_file) + assert exists(factor_file), 'Annotation file does not exist.' 
+ self.factor_file = factor_file + + if multiple_target > 0 and multiple_target_step == 0: + multiple_target_step = multiple_target + self.multiple_target_step = multiple_target_step + super().__init__( ann_file=ann_file, seq_len=seq_len, + multiple_target=multiple_target, causal=causal, subset_frac=subset_frac, camera_param_file=camera_param_file, @@ -171,41 +192,55 @@ def get_sequence_indices(self) -> List[List[int]]: sequence_indices = [] _len = (self.seq_len - 1) * self.seq_step + 1 _step = self.seq_step - for _, _indices in sorted(video_frames.items()): - n_frame = len(_indices) - - if self.pad_video_seq: - # Pad the sequence so that every frame in the sequence will be - # predicted. - if self.causal: - frames_left = self.seq_len - 1 - frames_right = 0 - else: - frames_left = (self.seq_len - 1) // 2 - frames_right = frames_left - for i in range(n_frame): - pad_left = max(0, frames_left - i // _step) - pad_right = max(0, - frames_right - (n_frame - 1 - i) // _step) - start = max(i % _step, i - frames_left * _step) - end = min(n_frame - (n_frame - 1 - i) % _step, - i + frames_right * _step + 1) - sequence_indices.append([_indices[0]] * pad_left + - _indices[start:end:_step] + - [_indices[-1]] * pad_right) - else: + + if self.multiple_target: + for _, _indices in sorted(video_frames.items()): + n_frame = len(_indices) seqs_from_video = [ - _indices[i:(i + _len):_step] - for i in range(0, n_frame - _len + 1) - ] + _indices[i:(i + self.multiple_target):_step] + for i in range(0, n_frame, self.multiple_target_step) + ][:(n_frame + self.multiple_target_step - + self.multiple_target) // self.multiple_target_step] sequence_indices.extend(seqs_from_video) + else: + for _, _indices in sorted(video_frames.items()): + n_frame = len(_indices) + + if self.pad_video_seq: + # Pad the sequence so that every frame in the sequence will + # be predicted. 
+ if self.causal: + frames_left = self.seq_len - 1 + frames_right = 0 + else: + frames_left = (self.seq_len - 1) // 2 + frames_right = frames_left + for i in range(n_frame): + pad_left = max(0, frames_left - i // _step) + pad_right = max( + 0, frames_right - (n_frame - 1 - i) // _step) + start = max(i % _step, i - frames_left * _step) + end = min(n_frame - (n_frame - 1 - i) % _step, + i + frames_right * _step + 1) + sequence_indices.append([_indices[0]] * pad_left + + _indices[start:end:_step] + + [_indices[-1]] * pad_right) + else: + seqs_from_video = [ + _indices[i:(i + _len):_step] + for i in range(0, n_frame - _len + 1) + ] + sequence_indices.extend(seqs_from_video) + # reduce dataset size if needed subset_size = int(len(sequence_indices) * self.subset_frac) start = np.random.randint(0, len(sequence_indices) - subset_size + 1) end = start + subset_size - return sequence_indices[start:end] + sequence_indices = sequence_indices[start:end] + + return sequence_indices def _load_annotations(self) -> Tuple[List[dict], List[dict]]: instance_list, image_list = super()._load_annotations() @@ -230,6 +265,15 @@ def _load_annotations(self) -> Tuple[List[dict], List[dict]]: 'keypoints_visible': keypoints_visible }) + if self.factor_file: + with get_local_path(self.factor_file) as local_path: + factors = np.load(local_path).astype(np.float32) + else: + factors = np.zeros((kpts_3d.shape[0], ), dtype=np.float32) + assert factors.shape[0] == kpts_3d.shape[0] + for idx, frame_ids in enumerate(self.sequence_indices): + factor = factors[frame_ids].astype(np.float32) + instance_list[idx].update({'factor': factor}) return instance_list, image_list diff --git a/mmpose/datasets/transforms/formatting.py b/mmpose/datasets/transforms/formatting.py index 05aeef179f..d047cff3c3 100644 --- a/mmpose/datasets/transforms/formatting.py +++ b/mmpose/datasets/transforms/formatting.py @@ -51,8 +51,6 @@ def keypoints_to_tensor(keypoints: Union[np.ndarray, Sequence[np.ndarray]] """ if 
isinstance(keypoints, np.ndarray): keypoints = np.ascontiguousarray(keypoints) - N = keypoints.shape[0] - keypoints = keypoints.transpose(1, 2, 0).reshape(-1, N) tensor = torch.from_numpy(keypoints).contiguous() else: assert is_seq_of(keypoints, np.ndarray) @@ -209,9 +207,9 @@ def transform(self, results: dict) -> dict: for key, packed_key in self.label_mapping_table.items(): if key in results: # For pose-lifting, store only target-related fields - if 'lifting_target_label' in results and key in { + if 'lifting_target' in results and packed_key in { 'keypoint_labels', 'keypoint_weights', - 'transformed_keypoints_visible' + 'keypoints_visible' }: continue if isinstance(results[key], list): diff --git a/mmpose/datasets/transforms/pose3d_transforms.py b/mmpose/datasets/transforms/pose3d_transforms.py index e6559fa398..2149d7cb30 100644 --- a/mmpose/datasets/transforms/pose3d_transforms.py +++ b/mmpose/datasets/transforms/pose3d_transforms.py @@ -25,6 +25,8 @@ class RandomFlipAroundRoot(BaseTransform): flip_prob (float): Probability of flip. Default: 0.5. flip_camera (bool): Whether to flip horizontal distortion coefficients. Default: ``False``. + flip_image (bool): Whether to flip keypoints horizontally according + to image size. Default: ``False``. Required keys: keypoints @@ -39,14 +41,16 @@ def __init__(self, keypoints_flip_cfg, target_flip_cfg, flip_prob=0.5, - flip_camera=False): + flip_camera=False, + flip_image=False): self.keypoints_flip_cfg = keypoints_flip_cfg self.target_flip_cfg = target_flip_cfg self.flip_prob = flip_prob self.flip_camera = flip_camera + self.flip_image = flip_image def transform(self, results: Dict) -> dict: - """The transform function of :class:`ZeroCenterPose`. + """The transform function of :class:`RandomFlipAroundRoot`. See ``transform()`` method of :class:`BaseTransform` for details. 
@@ -76,6 +80,15 @@ def transform(self, results: Dict) -> dict: flip_indices = results['flip_indices'] # flip joint coordinates + _camera_param = deepcopy(results['camera_param']) + if self.flip_image: + assert 'camera_param' in results, \ + 'Camera parameters are missing.' + assert 'w' in _camera_param + w = _camera_param['w'] / 2 + self.keypoints_flip_cfg['center_x'] = w + self.target_flip_cfg['center_x'] = w + keypoints, keypoints_visible = flip_keypoints_custom_center( keypoints, keypoints_visible, flip_indices, **self.keypoints_flip_cfg) @@ -92,7 +105,6 @@ def transform(self, results: Dict) -> dict: if self.flip_camera: assert 'camera_param' in results, \ 'Camera parameters are missing.' - _camera_param = deepcopy(results['camera_param']) assert 'c' in _camera_param _camera_param['c'][0] *= -1 diff --git a/mmpose/evaluation/metrics/keypoint_3d_metrics.py b/mmpose/evaluation/metrics/keypoint_3d_metrics.py index e945650c30..fb3447bb3f 100644 --- a/mmpose/evaluation/metrics/keypoint_3d_metrics.py +++ b/mmpose/evaluation/metrics/keypoint_3d_metrics.py @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. from collections import defaultdict from os import path as osp -from typing import Dict, Optional, Sequence +from typing import Dict, List, Optional, Sequence import numpy as np from mmengine.evaluator import BaseMetric @@ -38,6 +38,8 @@ class MPJPE(BaseMetric): names to disambiguate homonymous metrics of different evaluators. If prefix is not provided in the argument, ``self.default_prefix`` will be used instead. Default: ``None``. + skip_list (list, optional): The list of subject and action combinations + to be skipped. Default: []. 
""" ALIGNMENT = {'mpjpe': 'none', 'p-mpjpe': 'procrustes', 'n-mpjpe': 'scale'} @@ -45,7 +47,8 @@ class MPJPE(BaseMetric): def __init__(self, mode: str = 'mpjpe', collect_device: str = 'cpu', - prefix: Optional[str] = None) -> None: + prefix: Optional[str] = None, + skip_list: List[str] = []) -> None: super().__init__(collect_device=collect_device, prefix=prefix) allowed_modes = self.ALIGNMENT.keys() if mode not in allowed_modes: @@ -53,6 +56,7 @@ def __init__(self, f"'n-mpjpe', but got '{mode}'.") self.mode = mode + self.skip_list = skip_list def process(self, data_batch: Sequence[dict], data_samples: Sequence[dict]) -> None: @@ -67,24 +71,32 @@ def process(self, data_batch: Sequence[dict], the model. """ for data_sample in data_samples: - # predicted keypoints coordinates, [1, K, D] + # predicted keypoints coordinates, [T, K, D] pred_coords = data_sample['pred_instances']['keypoints'] + if pred_coords.ndim == 4: + pred_coords = np.squeeze(pred_coords, axis=0) # ground truth data_info gt = data_sample['gt_instances'] - # ground truth keypoints coordinates, [1, K, D] + # ground truth keypoints coordinates, [T, K, D] gt_coords = gt['lifting_target'] - # ground truth keypoints_visible, [1, K, 1] - mask = gt['lifting_target_visible'].astype(bool).reshape(1, -1) + # ground truth keypoints_visible, [T, K, 1] + mask = gt['lifting_target_visible'].astype(bool).reshape( + gt_coords.shape[0], -1) # instance action - img_path = data_sample['target_img_path'] + img_path = data_sample['target_img_path'][0] _, rest = osp.basename(img_path).split('_', 1) action, _ = rest.split('.', 1) + actions = np.array([action] * gt_coords.shape[0]) + + subj_act = osp.basename(img_path).split('.')[0] + if subj_act in self.skip_list: + continue result = { 'pred_coords': pred_coords, 'gt_coords': gt_coords, 'mask': mask, - 'action': action + 'actions': actions } self.results.append(result) @@ -104,16 +116,15 @@ def compute_metrics(self, results: list) -> Dict[str, float]: # pred_coords: [N, K, 
D] pred_coords = np.concatenate( [result['pred_coords'] for result in results]) - if pred_coords.ndim == 4 and pred_coords.shape[1] == 1: - pred_coords = np.squeeze(pred_coords, axis=1) # gt_coords: [N, K, D] - gt_coords = np.stack([result['gt_coords'] for result in results]) + gt_coords = np.concatenate([result['gt_coords'] for result in results]) # mask: [N, K] mask = np.concatenate([result['mask'] for result in results]) # action_category_indices: Dict[List[int]] action_category_indices = defaultdict(list) - for idx, result in enumerate(results): - action_category = result['action'].split('_')[0] + actions = np.concatenate([result['actions'] for result in results]) + for idx, action in enumerate(actions): + action_category = action.split('_')[0] action_category_indices[action_category].append(idx) error_name = self.mode.upper() @@ -126,6 +137,7 @@ def compute_metrics(self, results: list) -> Dict[str, float]: for action_category, indices in action_category_indices.items(): metrics[f'{error_name}_{action_category}'] = keypoint_mpjpe( - pred_coords[indices], gt_coords[indices], mask[indices]) + pred_coords[indices], gt_coords[indices], mask[indices], + self.ALIGNMENT[self.mode]) return metrics diff --git a/mmpose/models/backbones/__init__.py b/mmpose/models/backbones/__init__.py index cb2498560a..563264eecf 100644 --- a/mmpose/models/backbones/__init__.py +++ b/mmpose/models/backbones/__init__.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. 
from .alexnet import AlexNet from .cpm import CPM +from .dstformer import DSTFormer from .hourglass import HourglassNet from .hourglass_ae import HourglassAENet from .hrformer import HRFormer @@ -33,5 +34,5 @@ 'SEResNet', 'SEResNeXt', 'ShuffleNetV1', 'ShuffleNetV2', 'CPM', 'RSN', 'MSPN', 'ResNeSt', 'VGG', 'TCN', 'ViPNAS_ResNet', 'ViPNAS_MobileNetV3', 'LiteHRNet', 'V2VNet', 'HRFormer', 'PyramidVisionTransformer', - 'PyramidVisionTransformerV2', 'SwinTransformer' + 'PyramidVisionTransformerV2', 'SwinTransformer', 'DSTFormer' ] diff --git a/mmpose/models/backbones/dstformer.py b/mmpose/models/backbones/dstformer.py new file mode 100644 index 0000000000..2ef13bdb02 --- /dev/null +++ b/mmpose/models/backbones/dstformer.py @@ -0,0 +1,304 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +from mmcv.cnn.bricks import DropPath +from mmengine.model import BaseModule, constant_init +from mmengine.model.weight_init import trunc_normal_ + +from mmpose.registry import MODELS +from .base_backbone import BaseBackbone + + +class Attention(BaseModule): + + def __init__(self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0., + proj_drop=0., + mode='spatial'): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.mode = mode + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.proj_drop = nn.Dropout(proj_drop) + + self.attn_count_s = None + self.attn_count_t = None + + def forward(self, x, seq_len=1): + B, N, C = x.shape + + if self.mode == 'temporal': + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // + self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[ + 2] # make torchscript happy (cannot use tensor as tuple) + x = self.forward_temporal(q, k, v, seq_len=seq_len) + elif self.mode == 'spatial': + qkv = self.qkv(x).reshape(B, N, 3, 
self.num_heads, C // + self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[ + 2] # make torchscript happy (cannot use tensor as tuple) + x = self.forward_spatial(q, k, v) + else: + raise NotImplementedError(self.mode) + x = self.proj(x) + x = self.proj_drop(x) + return x + + def forward_spatial(self, q, k, v): + B, _, N, C = q.shape + attn = (q @ k.transpose(-2, -1)) * self.scale + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = attn @ v + x = x.transpose(1, 2).reshape(B, N, C * self.num_heads) + return x + + def forward_temporal(self, q, k, v, seq_len=8): + B, _, N, C = q.shape + qt = q.reshape(-1, seq_len, self.num_heads, N, + C).permute(0, 2, 3, 1, 4) # (B, H, N, T, C) + kt = k.reshape(-1, seq_len, self.num_heads, N, + C).permute(0, 2, 3, 1, 4) # (B, H, N, T, C) + vt = v.reshape(-1, seq_len, self.num_heads, N, + C).permute(0, 2, 3, 1, 4) # (B, H, N, T, C) + + attn = (qt @ kt.transpose(-2, -1)) * self.scale + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = attn @ vt # (B, H, N, T, C) + x = x.permute(0, 3, 2, 1, 4).reshape(B, N, C * self.num_heads) + return x + + +class AttentionBlock(BaseModule): + + def __init__(self, + dim, + num_heads, + mlp_ratio=4., + mlp_out_ratio=1., + qkv_bias=True, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + st_mode='st'): + super().__init__() + + self.st_mode = st_mode + self.norm1_s = nn.LayerNorm(dim, eps=1e-06) + self.norm1_t = nn.LayerNorm(dim, eps=1e-06) + + self.attn_s = Attention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop, + mode='spatial') + self.attn_t = Attention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop, + mode='temporal') + + self.drop_path = DropPath( + drop_path) if drop_path > 0. 
else nn.Identity() + self.norm2_s = nn.LayerNorm(dim, eps=1e-06) + self.norm2_t = nn.LayerNorm(dim, eps=1e-06) + + mlp_hidden_dim = int(dim * mlp_ratio) + mlp_out_dim = int(dim * mlp_out_ratio) + self.mlp_s = nn.Sequential( + nn.Linear(dim, mlp_hidden_dim), nn.GELU(), + nn.Linear(mlp_hidden_dim, mlp_out_dim), nn.Dropout(drop)) + self.mlp_t = nn.Sequential( + nn.Linear(dim, mlp_hidden_dim), nn.GELU(), + nn.Linear(mlp_hidden_dim, mlp_out_dim), nn.Dropout(drop)) + + def forward(self, x, seq_len=1): + if self.st_mode == 'st': + x = x + self.drop_path(self.attn_s(self.norm1_s(x), seq_len)) + x = x + self.drop_path(self.mlp_s(self.norm2_s(x))) + x = x + self.drop_path(self.attn_t(self.norm1_t(x), seq_len)) + x = x + self.drop_path(self.mlp_t(self.norm2_t(x))) + elif self.st_mode == 'ts': + x = x + self.drop_path(self.attn_t(self.norm1_t(x), seq_len)) + x = x + self.drop_path(self.mlp_t(self.norm2_t(x))) + x = x + self.drop_path(self.attn_s(self.norm1_s(x), seq_len)) + x = x + self.drop_path(self.mlp_s(self.norm2_s(x))) + else: + raise NotImplementedError(self.st_mode) + return x + + +@MODELS.register_module() +class DSTFormer(BaseBackbone): + """Dual-stream Spatio-temporal Transformer Module. + + Args: + in_channels (int): Number of input channels. + feat_size: Number of feature channels. Default: 256. + depth: The network depth. Default: 5. + num_heads: Number of heads in multi-Head self-attention blocks. + Default: 8. + mlp_ratio (int, optional): The expansion ratio of FFN. Default: 4. + num_keypoints (int): Number of keypoints. Default: 17. + seq_len: The sequence length. Default: 243. + qkv_bias (bool, optional): If True, add a learnable bias to q, k, v. + Default: True. + qk_scale (float | None, optional): Override default qk scale of + head_dim ** -0.5 if set. Default: None. + drop_rate (float, optional): Dropout ratio of input. Default: 0. + attn_drop_rate (float, optional): Dropout ratio of attention weight. + Default: 0.
+ drop_path_rate (float, optional): Stochastic depth rate. Default: 0. + att_fuse: Whether to fuse the results of attention blocks. + Default: True. + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None + + Example: + >>> from mmpose.models import DSTFormer + >>> import torch + >>> self = DSTFormer(in_channels=3) + >>> self.eval() + >>> inputs = torch.rand(1, 2, 17, 3) + >>> level_outputs = self.forward(inputs) + >>> print(tuple(level_outputs.shape)) + (1, 2, 17, 256) + """ + + def __init__(self, + in_channels, + feat_size=256, + depth=5, + num_heads=8, + mlp_ratio=4, + num_keypoints=17, + seq_len=243, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + att_fuse=True, + init_cfg=None): + super().__init__(init_cfg=init_cfg) + + self.in_channels = in_channels + self.feat_size = feat_size + + self.joints_embed = nn.Linear(in_channels, feat_size) + self.pos_drop = nn.Dropout(p=drop_rate) + + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth) + ] # stochastic depth decay rule + + self.blocks_st = nn.ModuleList([ + AttentionBlock( + dim=feat_size, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + st_mode='st') for i in range(depth) + ]) + self.blocks_ts = nn.ModuleList([ + AttentionBlock( + dim=feat_size, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + st_mode='ts') for i in range(depth) + ]) + + self.norm = nn.LayerNorm(feat_size, eps=1e-06) + + self.temp_embed = nn.Parameter(torch.zeros(1, seq_len, 1, feat_size)) + self.spat_embed = nn.Parameter( + torch.zeros(1, num_keypoints, feat_size)) + + trunc_normal_(self.temp_embed, std=.02) + trunc_normal_(self.spat_embed, std=.02) + + self.att_fuse = att_fuse + if self.att_fuse: + self.attn_regress = nn.ModuleList( +
[nn.Linear(feat_size * 2, 2) for i in range(depth)]) + for i in range(depth): + self.attn_regress[i].weight.data.fill_(0) + self.attn_regress[i].bias.data.fill_(0.5) + + def forward(self, x): + if len(x.shape) == 3: + x = x[None, :] + assert len(x.shape) == 4 + + B, F, K, C = x.shape + x = x.reshape(-1, K, C) + BF = x.shape[0] + x = self.joints_embed(x) # (BF, K, feat_size) + x = x + self.spat_embed + _, K, C = x.shape + x = x.reshape(-1, F, K, C) + self.temp_embed[:, :F, :, :] + x = x.reshape(BF, K, C) # (BF, K, feat_size) + x = self.pos_drop(x) + + for idx, (blk_st, + blk_ts) in enumerate(zip(self.blocks_st, self.blocks_ts)): + x_st = blk_st(x, F) + x_ts = blk_ts(x, F) + if self.att_fuse: + att = self.attn_regress[idx] + alpha = torch.cat([x_st, x_ts], dim=-1) + BF, K = alpha.shape[:2] + alpha = att(alpha) + alpha = alpha.softmax(dim=-1) + x = x_st * alpha[:, :, 0:1] + x_ts * alpha[:, :, 1:2] + else: + x = (x_st + x_ts) * 0.5 + x = self.norm(x) # (BF, K, feat_size) + x = x.reshape(B, F, K, -1) + return x + + def init_weights(self): + """Initialize the weights in backbone.""" + super(DSTFormer, self).init_weights() + + if (isinstance(self.init_cfg, dict) + and self.init_cfg['type'] == 'Pretrained'): + return + + for m in self.modules(): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + constant_init(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + constant_init(m.bias, 0) + constant_init(m.weight, 1.0) diff --git a/mmpose/models/heads/__init__.py b/mmpose/models/heads/__init__.py index e01f2269e3..ef0e17d98e 100644 --- a/mmpose/models/heads/__init__.py +++ b/mmpose/models/heads/__init__.py @@ -5,7 +5,8 @@ HeatmapHead, MSPNHead, ViPNASHead) from .hybrid_heads import DEKRHead, VisPredictHead from .regression_heads import (DSNTHead, IntegralRegressionHead, - RegressionHead, RLEHead, TemporalRegressionHead, + MotionRegressionHead, RegressionHead, RLEHead, + TemporalRegressionHead, 
TrajectoryRegressionHead) __all__ = [ @@ -13,5 +14,5 @@ 'RegressionHead', 'IntegralRegressionHead', 'SimCCHead', 'RLEHead', 'DSNTHead', 'AssociativeEmbeddingHead', 'DEKRHead', 'VisPredictHead', 'CIDHead', 'RTMCCHead', 'TemporalRegressionHead', - 'TrajectoryRegressionHead' + 'TrajectoryRegressionHead', 'MotionRegressionHead' ] diff --git a/mmpose/models/heads/regression_heads/__init__.py b/mmpose/models/heads/regression_heads/__init__.py index ce9cd5e1b0..729d193b51 100644 --- a/mmpose/models/heads/regression_heads/__init__.py +++ b/mmpose/models/heads/regression_heads/__init__.py @@ -1,16 +1,14 @@ # Copyright (c) OpenMMLab. All rights reserved. from .dsnt_head import DSNTHead from .integral_regression_head import IntegralRegressionHead +from .motion_regression_head import MotionRegressionHead from .regression_head import RegressionHead from .rle_head import RLEHead from .temporal_regression_head import TemporalRegressionHead from .trajectory_regression_head import TrajectoryRegressionHead __all__ = [ - 'RegressionHead', - 'IntegralRegressionHead', - 'DSNTHead', - 'RLEHead', - 'TemporalRegressionHead', - 'TrajectoryRegressionHead', + 'RegressionHead', 'IntegralRegressionHead', 'DSNTHead', 'RLEHead', + 'TemporalRegressionHead', 'TrajectoryRegressionHead', + 'MotionRegressionHead' ] diff --git a/mmpose/models/heads/regression_heads/motion_regression_head.py b/mmpose/models/heads/regression_heads/motion_regression_head.py new file mode 100644 index 0000000000..a0037180c7 --- /dev/null +++ b/mmpose/models/heads/regression_heads/motion_regression_head.py @@ -0,0 +1,176 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from collections import OrderedDict +from typing import Tuple + +import numpy as np +import torch +from torch import Tensor, nn + +from mmpose.evaluation.functional import keypoint_mpjpe +from mmpose.registry import KEYPOINT_CODECS, MODELS +from mmpose.utils.tensor_utils import to_numpy +from mmpose.utils.typing import (ConfigType, OptConfigType, OptSampleList, + Predictions) +from ..base_head import BaseHead + + +@MODELS.register_module() +class MotionRegressionHead(BaseHead): + """Regression head of `MotionBERT`_ by Zhu et al (2022). + + Args: + in_channels (int): Number of input channels. Default: 256. + out_channels (int): Number of output channels. Default: 3. + embedding_size (int): Number of embedding channels. Default: 512. + loss (Config): Config for keypoint loss. Defaults to use + :class:`MSELoss` + decoder (Config, optional): The decoder config that controls decoding + keypoint coordinates from the network output. Defaults to ``None`` + init_cfg (Config, optional): Config to control the initialization. See + :attr:`default_init_cfg` for default settings + + .. 
_`MotionBERT`: https://arxiv.org/abs/2210.06551 + """ + + _version = 2 + + def __init__(self, + in_channels: int = 256, + out_channels: int = 3, + embedding_size: int = 512, + loss: ConfigType = dict( + type='MSELoss', use_target_weight=True), + decoder: OptConfigType = None, + init_cfg: OptConfigType = None): + + if init_cfg is None: + init_cfg = self.default_init_cfg + + super().__init__(init_cfg) + + self.in_channels = in_channels + self.out_channels = out_channels + self.loss_module = MODELS.build(loss) + if decoder is not None: + self.decoder = KEYPOINT_CODECS.build(decoder) + else: + self.decoder = None + + # Define fully-connected layers + self.pre_logits = nn.Sequential( + OrderedDict([('fc', nn.Linear(in_channels, embedding_size)), + ('act', nn.Tanh())])) + self.fc = nn.Linear( + embedding_size, + out_channels) if embedding_size > 0 else nn.Identity() + + def forward(self, feats: Tuple[Tensor]) -> Tensor: + """Forward the network. The input is multi scale feature maps and the + output is the coordinates. + + Args: + feats (Tuple[Tensor]): Multi scale feature maps. + + Returns: + Tensor: Output coordinates (and sigmas[optional]). + """ + x = feats # (B, F, K, in_channels) + x = self.pre_logits(x) # (B, F, K, embedding_size) + x = self.fc(x) # (B, F, K, out_channels) + + return x + + def predict(self, + feats: Tuple[Tensor], + batch_data_samples: OptSampleList, + test_cfg: ConfigType = {}) -> Predictions: + """Predict results from outputs. + + Returns: + preds (sequence[InstanceData]): Prediction results. + Each contains the following fields: + + - keypoints: Predicted keypoints of shape (B, N, K, D). + - keypoint_scores: Scores of predicted keypoints of shape + (B, N, K). 
+ """ + + batch_coords = self.forward(feats) # (B, K, D) + + # Restore global position with camera_param and factor + camera_param = batch_data_samples[0].metainfo.get('camera_param', None) + if camera_param is not None: + w = torch.stack([ + torch.from_numpy(np.array([b.metainfo['camera_param']['w']])) + for b in batch_data_samples + ]) + h = torch.stack([ + torch.from_numpy(np.array([b.metainfo['camera_param']['h']])) + for b in batch_data_samples + ]) + else: + w = torch.stack([ + torch.empty((0), dtype=torch.float32) + for _ in batch_data_samples + ]) + h = torch.stack([ + torch.empty((0), dtype=torch.float32) + for _ in batch_data_samples + ]) + + factor = batch_data_samples[0].metainfo.get('factor', None) + if factor is not None: + factor = torch.stack([ + torch.from_numpy(b.metainfo['factor']) + for b in batch_data_samples + ]) + else: + factor = torch.stack([ + torch.empty((0), dtype=torch.float32) + for _ in batch_data_samples + ]) + + preds = self.decode((batch_coords, w, h, factor)) + + return preds + + def loss(self, + inputs: Tuple[Tensor], + batch_data_samples: OptSampleList, + train_cfg: ConfigType = {}) -> dict: + """Calculate losses from a batch of inputs and data samples.""" + + pred_outputs = self.forward(inputs) + + lifting_target_label = torch.stack([ + d.gt_instance_labels.lifting_target_label + for d in batch_data_samples + ]) + lifting_target_weights = torch.stack([ + d.gt_instance_labels.lifting_target_weights + for d in batch_data_samples + ]) + + # calculate losses + losses = dict() + loss = self.loss_module(pred_outputs, lifting_target_label, + lifting_target_weights.unsqueeze(-1)) + + losses.update(loss_pose3d=loss) + + # calculate accuracy + mpjpe_err = keypoint_mpjpe( + pred=to_numpy(pred_outputs), + gt=to_numpy(lifting_target_label), + mask=to_numpy(lifting_target_weights) > 0) + + mpjpe_pose = torch.tensor( + mpjpe_err, device=lifting_target_label.device) + losses.update(mpjpe=mpjpe_pose) + + return losses + + @property + def 
default_init_cfg(self): + init_cfg = [dict(type='TruncNormal', layer=['Linear'], std=0.02)] + return init_cfg diff --git a/mmpose/models/heads/regression_heads/temporal_regression_head.py b/mmpose/models/heads/regression_heads/temporal_regression_head.py index ac76316842..9ed2e9f4fa 100644 --- a/mmpose/models/heads/regression_heads/temporal_regression_head.py +++ b/mmpose/models/heads/regression_heads/temporal_regression_head.py @@ -101,7 +101,7 @@ def predict(self, else: target_root = torch.stack([ torch.empty((0), dtype=torch.float32) - for _ in batch_data_samples[0].metainfo + for _ in batch_data_samples ]) preds = self.decode((batch_coords, target_root)) diff --git a/mmpose/models/heads/regression_heads/trajectory_regression_head.py b/mmpose/models/heads/regression_heads/trajectory_regression_head.py index adfd7353d3..a1608aaae7 100644 --- a/mmpose/models/heads/regression_heads/trajectory_regression_head.py +++ b/mmpose/models/heads/regression_heads/trajectory_regression_head.py @@ -101,7 +101,7 @@ def predict(self, else: target_root = torch.stack([ torch.empty((0), dtype=torch.float32) - for _ in batch_data_samples[0].metainfo + for _ in batch_data_samples ]) preds = self.decode((batch_coords, target_root)) diff --git a/mmpose/models/losses/regression_loss.py b/mmpose/models/losses/regression_loss.py index 9a64a4adfe..b50ad99f04 100644 --- a/mmpose/models/losses/regression_loss.py +++ b/mmpose/models/losses/regression_loss.py @@ -365,6 +365,84 @@ def forward(self, output, target, target_weight=None): return loss * self.loss_weight +@MODELS.register_module() +class MPJPEVelocityJointLoss(nn.Module): + """Joint loss combining MPJPE, N-MPJPE and 3D velocity terms. + + Args: + loss_weight (float): Weight of the loss. Default: 1.0. + lambda_scale (float): Factor of the N-MPJPE loss. Default: 0.5. + lambda_3d_velocity (float): Factor of the velocity loss. Default: 20.0.
+ """ + + def __init__(self, + use_target_weight=False, + loss_weight=1., + lambda_scale=0.5, + lambda_3d_velocity=20.0): + super().__init__() + self.use_target_weight = use_target_weight + self.loss_weight = loss_weight + self.lambda_scale = lambda_scale + self.lambda_3d_velocity = lambda_3d_velocity + + def forward(self, output, target, target_weight=None): + """Forward function. + + Note: + - batch_size: N + - num_keypoints: K + - dimension of keypoints: D (D=2 or D=3) + + Args: + output (torch.Tensor[N, K, D]): Output regression. + target (torch.Tensor[N, K, D]): Target regression. + target_weight (torch.Tensor[N,K,D]): + Weights across different joint types. + """ + norm_output = torch.mean( + torch.sum(torch.square(output), dim=-1, keepdim=True), + dim=-2, + keepdim=True) + norm_target = torch.mean( + torch.sum(target * output, dim=-1, keepdim=True), + dim=-2, + keepdim=True) + + velocity_output = output[..., 1:, :, :] - output[..., :-1, :, :] + velocity_target = target[..., 1:, :, :] - target[..., :-1, :, :] + + if self.use_target_weight: + assert target_weight is not None + mpjpe = torch.mean( + torch.norm((output - target) * target_weight, dim=-1)) + + nmpjpe = torch.mean( + torch.norm( + (norm_target / norm_output * output - target) * + target_weight, + dim=-1)) + + loss_3d_velocity = torch.mean( + torch.norm( + (velocity_output - velocity_target) * target_weight, + dim=-1)) + else: + mpjpe = torch.mean(torch.norm(output - target, dim=-1)) + + nmpjpe = torch.mean( + torch.norm( + norm_target / norm_output * output - target, dim=-1)) + + loss_3d_velocity = torch.mean( + torch.norm(velocity_output - velocity_target, dim=-1)) + + loss = mpjpe + nmpjpe * self.lambda_scale + \ + loss_3d_velocity * self.lambda_3d_velocity + + return loss * self.loss_weight + + @MODELS.register_module() class MPJPELoss(nn.Module): """MPJPE (Mean Per Joint Position Error) loss. 
diff --git a/tests/test_codecs/test_image_pose_lifting.py b/tests/test_codecs/test_image_pose_lifting.py index bb94786c32..78b19ec59b 100644 --- a/tests/test_codecs/test_image_pose_lifting.py +++ b/tests/test_codecs/test_image_pose_lifting.py @@ -13,14 +13,18 @@ def setUp(self) -> None: keypoints = (0.1 + 0.8 * np.random.rand(1, 17, 2)) * [192, 256] keypoints = np.round(keypoints).astype(np.float32) keypoints_visible = np.random.randint(2, size=(1, 17)) - lifting_target = (0.1 + 0.8 * np.random.rand(17, 3)) - lifting_target_visible = np.random.randint(2, size=(17, )) + lifting_target = (0.1 + 0.8 * np.random.rand(1, 17, 3)) + lifting_target_visible = np.random.randint( + 2, size=( + 1, + 17, + )) encoded_wo_sigma = np.random.rand(1, 17, 3) self.keypoints_mean = np.random.rand(17, 2).astype(np.float32) self.keypoints_std = np.random.rand(17, 2).astype(np.float32) + 1e-6 - self.target_mean = np.random.rand(17, 3).astype(np.float32) - self.target_std = np.random.rand(17, 3).astype(np.float32) + 1e-6 + self.target_mean = np.random.rand(1, 17, 3).astype(np.float32) + self.target_std = np.random.rand(1, 17, 3).astype(np.float32) + 1e-6 self.data = dict( keypoints=keypoints, @@ -30,7 +34,11 @@ def setUp(self) -> None: encoded_wo_sigma=encoded_wo_sigma) def build_pose_lifting_label(self, **kwargs): - cfg = dict(type='ImagePoseLifting', num_keypoints=17, root_index=0) + cfg = dict( + type='ImagePoseLifting', + num_keypoints=17, + root_index=0, + reshape_keypoints=False) cfg.update(kwargs) return KEYPOINT_CODECS.build(cfg) @@ -50,10 +58,19 @@ def test_encode(self): lifting_target_visible) self.assertEqual(encoded['keypoint_labels'].shape, (1, 17, 2)) - self.assertEqual(encoded['lifting_target_label'].shape, (17, 3)) - self.assertEqual(encoded['lifting_target_weights'].shape, (17, )) - self.assertEqual(encoded['trajectory_weights'].shape, (17, )) - self.assertEqual(encoded['target_root'].shape, (3, )) + self.assertEqual(encoded['lifting_target_label'].shape, (1, 17, 3)) + 
self.assertEqual(encoded['lifting_target_weights'].shape, ( + 1, + 17, + )) + self.assertEqual(encoded['trajectory_weights'].shape, ( + 1, + 17, + )) + self.assertEqual(encoded['target_root'].shape, ( + 1, + 3, + )) # test removing root codec = self.build_pose_lifting_label( @@ -63,10 +80,16 @@ def test_encode(self): self.assertTrue('target_root_removed' in encoded and 'target_root_index' in encoded) - self.assertEqual(encoded['lifting_target_weights'].shape, (16, )) + self.assertEqual(encoded['lifting_target_weights'].shape, ( + 1, + 16, + )) self.assertEqual(encoded['keypoint_labels'].shape, (1, 17, 2)) - self.assertEqual(encoded['lifting_target_label'].shape, (16, 3)) - self.assertEqual(encoded['target_root'].shape, (3, )) + self.assertEqual(encoded['lifting_target_label'].shape, (1, 16, 3)) + self.assertEqual(encoded['target_root'].shape, ( + 1, + 3, + )) # test normalization codec = self.build_pose_lifting_label( @@ -78,7 +101,7 @@ def test_encode(self): lifting_target_visible) self.assertEqual(encoded['keypoint_labels'].shape, (1, 17, 2)) - self.assertEqual(encoded['lifting_target_label'].shape, (17, 3)) + self.assertEqual(encoded['lifting_target_label'].shape, (1, 17, 3)) def test_decode(self): lifting_target = self.data['lifting_target'] @@ -112,12 +135,10 @@ def test_cicular_verification(self): lifting_target_visible) _keypoints, _ = codec.decode( - np.expand_dims(encoded['lifting_target_label'], axis=0), + encoded['lifting_target_label'], target_root=lifting_target[..., 0, :]) - self.assertTrue( - np.allclose( - np.expand_dims(lifting_target, axis=0), _keypoints, atol=5.)) + self.assertTrue(np.allclose(lifting_target, _keypoints, atol=5.)) # test removing root codec = self.build_pose_lifting_label(remove_root=True) @@ -125,12 +146,10 @@ def test_cicular_verification(self): lifting_target_visible) _keypoints, _ = codec.decode( - np.expand_dims(encoded['lifting_target_label'], axis=0), + encoded['lifting_target_label'], target_root=lifting_target[..., 0, 
:]) - self.assertTrue( - np.allclose( - np.expand_dims(lifting_target, axis=0), _keypoints, atol=5.)) + self.assertTrue(np.allclose(lifting_target, _keypoints, atol=5.)) # test normalization codec = self.build_pose_lifting_label( @@ -142,9 +161,7 @@ def test_cicular_verification(self): lifting_target_visible) _keypoints, _ = codec.decode( - np.expand_dims(encoded['lifting_target_label'], axis=0), + encoded['lifting_target_label'], target_root=lifting_target[..., 0, :]) - self.assertTrue( - np.allclose( - np.expand_dims(lifting_target, axis=0), _keypoints, atol=5.)) + self.assertTrue(np.allclose(lifting_target, _keypoints, atol=5.)) diff --git a/tests/test_codecs/test_motionbert_label.py b/tests/test_codecs/test_motionbert_label.py new file mode 100644 index 0000000000..01c9c654a2 --- /dev/null +++ b/tests/test_codecs/test_motionbert_label.py @@ -0,0 +1,159 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +from unittest import TestCase + +import numpy as np +from mmengine.fileio import load + +from mmpose.codecs import MotionBERTLabel +from mmpose.registry import KEYPOINT_CODECS + + +class TestMotionBERTLabel(TestCase): + + def get_camera_param(self, imgname, camera_param) -> dict: + """Get camera parameters of a frame by its image name.""" + subj, rest = osp.basename(imgname).split('_', 1) + action, rest = rest.split('.', 1) + camera, rest = rest.split('_', 1) + return camera_param[(subj, camera)] + + def build_pose_lifting_label(self, **kwargs): + cfg = dict(type='MotionBERTLabel', num_keypoints=17) + cfg.update(kwargs) + return KEYPOINT_CODECS.build(cfg) + + def setUp(self) -> None: + keypoints = (0.1 + 0.8 * np.random.rand(1, 17, 2)) * [1000, 1002] + keypoints = np.round(keypoints).astype(np.float32) + keypoints_visible = np.random.randint(2, size=(1, 17)) + lifting_target = (0.1 + 0.8 * np.random.rand(1, 17, 3)) + lifting_target_visible = np.random.randint( + 2, size=( + 1, + 17, + )) + encoded_wo_sigma = np.random.rand(1, 17, 3) + + 
camera_param = load('tests/data/h36m/cameras.pkl') + camera_param = self.get_camera_param( + 'S1/S1_Directions_1.54138969/S1_Directions_1.54138969_000001.jpg', + camera_param) + factor = 0.1 + 5 * np.random.rand(1, ) + + self.data = dict( + keypoints=keypoints, + keypoints_visible=keypoints_visible, + lifting_target=lifting_target, + lifting_target_visible=lifting_target_visible, + camera_param=camera_param, + factor=factor, + encoded_wo_sigma=encoded_wo_sigma) + + def test_build(self): + codec = self.build_pose_lifting_label() + self.assertIsInstance(codec, MotionBERTLabel) + + def test_encode(self): + keypoints = self.data['keypoints'] + keypoints_visible = self.data['keypoints_visible'] + lifting_target = self.data['lifting_target'] + lifting_target_visible = self.data['lifting_target_visible'] + camera_param = self.data['camera_param'] + factor = self.data['factor'] + + # test default settings + codec = self.build_pose_lifting_label() + encoded = codec.encode(keypoints, keypoints_visible, lifting_target, + lifting_target_visible, camera_param, factor) + + self.assertEqual(encoded['keypoint_labels'].shape, (1, 17, 2)) + self.assertEqual(encoded['lifting_target_label'].shape, (1, 17, 3)) + self.assertEqual(encoded['lifting_target_weights'].shape, ( + 1, + 17, + )) + self.assertEqual(encoded['trajectory_weights'].shape, ( + 1, + 17, + )) + + # test concatenating visibility + codec = self.build_pose_lifting_label(concat_vis=True) + encoded = codec.encode(keypoints, keypoints_visible, lifting_target, + lifting_target_visible, camera_param, factor) + + self.assertEqual(encoded['keypoint_labels'].shape, (1, 17, 3)) + self.assertEqual(encoded['lifting_target_label'].shape, (1, 17, 3)) + + def test_decode(self): + encoded_wo_sigma = self.data['encoded_wo_sigma'] + camera_param = self.data['camera_param'] + + # test default settings + codec = self.build_pose_lifting_label() + + decoded, scores = codec.decode(encoded_wo_sigma) + + self.assertEqual(decoded.shape, (1, 17, 
3)) + self.assertEqual(scores.shape, (1, 17)) + + # test denormalize according to image shape + codec = self.build_pose_lifting_label() + + decoded, scores = codec.decode( + encoded_wo_sigma, + w=np.array([camera_param['w']]), + h=np.array([camera_param['h']])) + + self.assertEqual(decoded.shape, (1, 17, 3)) + self.assertEqual(scores.shape, (1, 17)) + + # test with factor + codec = self.build_pose_lifting_label() + + decoded, scores = codec.decode( + encoded_wo_sigma, factor=np.array([0.23])) + + self.assertEqual(decoded.shape, (1, 17, 3)) + self.assertEqual(scores.shape, (1, 17)) + + def test_cicular_verification(self): + keypoints_visible = self.data['keypoints_visible'] + lifting_target = self.data['lifting_target'] + lifting_target_visible = self.data['lifting_target_visible'] + camera_param = self.data['camera_param'] + + # test denormalize according to image shape + keypoints = (0.1 + 0.8 * np.random.rand(1, 17, 3)) + codec = self.build_pose_lifting_label() + encoded = codec.encode(keypoints, keypoints_visible, lifting_target, + lifting_target_visible, camera_param) + + _keypoints, _ = codec.decode( + encoded['keypoint_labels'], + w=np.array([camera_param['w']]), + h=np.array([camera_param['h']])) + + keypoints[..., :, :] = keypoints[..., :, :] - keypoints[..., 0, :] + + self.assertTrue( + np.allclose(keypoints[..., :2] / 1000, _keypoints[..., :2])) + + # test with factor + keypoints = (0.1 + 0.8 * np.random.rand(1, 17, 3)) + codec = self.build_pose_lifting_label() + encoded = codec.encode(keypoints, keypoints_visible, lifting_target, + lifting_target_visible, camera_param) + + _keypoints, _ = codec.decode( + encoded['keypoint_labels'], + w=np.array([camera_param['w']]), + h=np.array([camera_param['h']]), + factor=encoded['factor']) + + keypoints *= encoded['factor'] + keypoints[..., :, :] = keypoints[..., :, :] - keypoints[..., 0, :] + + self.assertTrue( + np.allclose(keypoints[..., :2] / 1000, _keypoints[..., :2])) diff --git 
a/tests/test_codecs/test_video_pose_lifting.py b/tests/test_codecs/test_video_pose_lifting.py index cc58292d0c..31a095e927 100644 --- a/tests/test_codecs/test_video_pose_lifting.py +++ b/tests/test_codecs/test_video_pose_lifting.py @@ -19,7 +19,8 @@ def get_camera_param(self, imgname, camera_param) -> dict: return camera_param[(subj, camera)] def build_pose_lifting_label(self, **kwargs): - cfg = dict(type='VideoPoseLifting', num_keypoints=17) + cfg = dict( + type='VideoPoseLifting', num_keypoints=17, reshape_keypoints=False) cfg.update(kwargs) return KEYPOINT_CODECS.build(cfg) @@ -27,8 +28,12 @@ def setUp(self) -> None: keypoints = (0.1 + 0.8 * np.random.rand(1, 17, 2)) * [192, 256] keypoints = np.round(keypoints).astype(np.float32) keypoints_visible = np.random.randint(2, size=(1, 17)) - lifting_target = (0.1 + 0.8 * np.random.rand(17, 3)) - lifting_target_visible = np.random.randint(2, size=(17, )) + lifting_target = (0.1 + 0.8 * np.random.rand(1, 17, 3)) + lifting_target_visible = np.random.randint( + 2, size=( + 1, + 17, + )) encoded_wo_sigma = np.random.rand(1, 17, 3) camera_param = load('tests/data/h36m/cameras.pkl') @@ -61,10 +66,19 @@ def test_encode(self): lifting_target_visible, camera_param) self.assertEqual(encoded['keypoint_labels'].shape, (1, 17, 2)) - self.assertEqual(encoded['lifting_target_label'].shape, (17, 3)) - self.assertEqual(encoded['lifting_target_weights'].shape, (17, )) - self.assertEqual(encoded['trajectory_weights'].shape, (17, )) - self.assertEqual(encoded['target_root'].shape, (3, )) + self.assertEqual(encoded['lifting_target_label'].shape, (1, 17, 3)) + self.assertEqual(encoded['lifting_target_weights'].shape, ( + 1, + 17, + )) + self.assertEqual(encoded['trajectory_weights'].shape, ( + 1, + 17, + )) + self.assertEqual(encoded['target_root'].shape, ( + 1, + 3, + )) # test not zero-centering codec = self.build_pose_lifting_label(zero_center=False) @@ -72,9 +86,31 @@ def test_encode(self): lifting_target_visible, camera_param) 
self.assertEqual(encoded['keypoint_labels'].shape, (1, 17, 2)) - self.assertEqual(encoded['lifting_target_label'].shape, (17, 3)) - self.assertEqual(encoded['lifting_target_weights'].shape, (17, )) - self.assertEqual(encoded['trajectory_weights'].shape, (17, )) + self.assertEqual(encoded['lifting_target_label'].shape, (1, 17, 3)) + self.assertEqual(encoded['lifting_target_weights'].shape, ( + 1, + 17, + )) + self.assertEqual(encoded['trajectory_weights'].shape, ( + 1, + 17, + )) + + # test reshape_keypoints + codec = self.build_pose_lifting_label(reshape_keypoints=True) + encoded = codec.encode(keypoints, keypoints_visible, lifting_target, + lifting_target_visible, camera_param) + + self.assertEqual(encoded['keypoint_labels'].shape, (34, 1)) + self.assertEqual(encoded['lifting_target_label'].shape, (1, 17, 3)) + self.assertEqual(encoded['lifting_target_weights'].shape, ( + 1, + 17, + )) + self.assertEqual(encoded['trajectory_weights'].shape, ( + 1, + 17, + )) # test removing root codec = self.build_pose_lifting_label( @@ -84,10 +120,16 @@ def test_encode(self): self.assertTrue('target_root_removed' in encoded and 'target_root_index' in encoded) - self.assertEqual(encoded['lifting_target_weights'].shape, (16, )) + self.assertEqual(encoded['lifting_target_weights'].shape, ( + 1, + 16, + )) self.assertEqual(encoded['keypoint_labels'].shape, (1, 17, 2)) - self.assertEqual(encoded['lifting_target_label'].shape, (16, 3)) - self.assertEqual(encoded['target_root'].shape, (3, )) + self.assertEqual(encoded['lifting_target_label'].shape, (1, 16, 3)) + self.assertEqual(encoded['target_root'].shape, ( + 1, + 3, + )) # test normalizing camera codec = self.build_pose_lifting_label(normalize_camera=True) @@ -102,6 +144,35 @@ def test_encode(self): encoded['camera_param']['f'], atol=4.)) + # test with multiple targets + keypoints = (0.1 + 0.8 * np.random.rand(2, 17, 2)) * [192, 256] + keypoints = np.round(keypoints).astype(np.float32) + keypoints_visible = np.random.randint(2, 
size=(2, 17)) + lifting_target = (0.1 + 0.8 * np.random.rand(2, 17, 3)) + lifting_target_visible = np.random.randint( + 2, size=( + 2, + 17, + )) + codec = self.build_pose_lifting_label() + encoded = codec.encode(keypoints, keypoints_visible, lifting_target, + lifting_target_visible, camera_param) + + self.assertEqual(encoded['keypoint_labels'].shape, (2, 17, 2)) + self.assertEqual(encoded['lifting_target_label'].shape, (2, 17, 3)) + self.assertEqual(encoded['lifting_target_weights'].shape, ( + 2, + 17, + )) + self.assertEqual(encoded['trajectory_weights'].shape, ( + 2, + 17, + )) + self.assertEqual(encoded['target_root'].shape, ( + 2, + 3, + )) + def test_decode(self): lifting_target = self.data['lifting_target'] encoded_wo_sigma = self.data['encoded_wo_sigma'] @@ -135,12 +206,10 @@ def test_cicular_verification(self): lifting_target_visible, camera_param) _keypoints, _ = codec.decode( - np.expand_dims(encoded['lifting_target_label'], axis=0), + encoded['lifting_target_label'], target_root=lifting_target[..., 0, :]) - self.assertTrue( - np.allclose( - np.expand_dims(lifting_target, axis=0), _keypoints, atol=5.)) + self.assertTrue(np.allclose(lifting_target, _keypoints, atol=5.)) # test removing root codec = self.build_pose_lifting_label(remove_root=True) @@ -148,9 +217,7 @@ def test_cicular_verification(self): lifting_target_visible, camera_param) _keypoints, _ = codec.decode( - np.expand_dims(encoded['lifting_target_label'], axis=0), + encoded['lifting_target_label'], target_root=lifting_target[..., 0, :]) - self.assertTrue( - np.allclose( - np.expand_dims(lifting_target, axis=0), _keypoints, atol=5.)) + self.assertTrue(np.allclose(lifting_target, _keypoints, atol=5.)) diff --git a/tests/test_datasets/test_datasets/test_body_datasets/test_h36m_dataset.py b/tests/test_datasets/test_datasets/test_body_datasets/test_h36m_dataset.py index 88944dc11f..fd6cdf5f17 100644 --- a/tests/test_datasets/test_datasets/test_body_datasets/test_h36m_dataset.py +++ 
b/tests/test_datasets/test_datasets/test_body_datasets/test_h36m_dataset.py @@ -116,6 +116,17 @@ def test_topdown(self): self.assertEqual(len(dataset), 4) self.check_data_info_keys(dataset[0]) + dataset = self.build_h36m_dataset( + data_mode='topdown', + seq_len=1, + seq_step=1, + multiple_target=1, + causal=False, + pad_video_seq=True, + camera_param_file='cameras.pkl') + self.assertEqual(len(dataset), 4) + self.check_data_info_keys(dataset[0]) + # test topdown testing with 2d keypoint detection file and # sequence config dataset = self.build_h36m_dataset( diff --git a/tests/test_datasets/test_transforms/test_pose3d_transforms.py b/tests/test_datasets/test_transforms/test_pose3d_transforms.py index 5f5d5aa096..b87931bb74 100644 --- a/tests/test_datasets/test_transforms/test_pose3d_transforms.py +++ b/tests/test_datasets/test_transforms/test_pose3d_transforms.py @@ -35,7 +35,7 @@ def _parse_h36m_imgname(imgname): scales = data['scale'].astype(np.float32) idx = 0 - target_idx = 0 + target_idx = [0] data_info = { 'keypoints': keypoints[idx, :, :2].reshape(1, -1, 2), @@ -52,7 +52,6 @@ def _parse_h36m_imgname(imgname): 'sample_idx': idx, 'lifting_target': keypoints_3d[target_idx, :, :3], 'lifting_target_visible': keypoints_3d[target_idx, :, 3], - 'target_img_path': osp.join('tests/data/h36m', imgnames[target_idx]), } # add camera parameters @@ -108,9 +107,12 @@ def test_transform(self): tar_vis2 = results['lifting_target_visible'] self.assertEqual(kpts_vis2.shape, (1, 17)) - self.assertEqual(tar_vis2.shape, (17, )) + self.assertEqual(tar_vis2.shape, ( + 1, + 17, + )) self.assertEqual(kpts2.shape, (1, 17, 2)) - self.assertEqual(tar2.shape, (17, 3)) + self.assertEqual(tar2.shape, (1, 17, 3)) flip_indices = [ 0, 4, 5, 6, 1, 2, 3, 7, 8, 9, 10, 14, 15, 16, 11, 12, 13 @@ -121,12 +123,15 @@ def test_transform(self): self.assertTrue( np.allclose(kpts1[0][left][1:], kpts2[0][right][1:], atol=4.)) self.assertTrue( - np.allclose(tar1[left][1:], tar2[right][1:], atol=4.)) + 
np.allclose( + tar1[..., left, 1:], tar2[..., right, 1:], atol=4.)) self.assertTrue( - np.allclose(kpts_vis1[0][left], kpts_vis2[0][right], atol=4.)) + np.allclose( + kpts_vis1[..., left], kpts_vis2[..., right], atol=4.)) self.assertTrue( - np.allclose(tar_vis1[left], tar_vis2[right], atol=4.)) + np.allclose( + tar_vis1[..., left], tar_vis2[..., right], atol=4.)) # test camera flipping transform = RandomFlipAroundRoot( @@ -148,3 +153,23 @@ def test_transform(self): -self.data_info['camera_param']['p'][0], camera2['p'][0], atol=4.)) + + # test flipping w.r.t. image + transform = RandomFlipAroundRoot({}, {}, flip_prob=1, flip_image=True) + results = deepcopy(self.data_info) + results = transform(results) + kpts2 = results['keypoints'] + tar2 = results['lifting_target'] + + camera_param = results['camera_param'] + for left, right in enumerate(flip_indices): + self.assertTrue( + np.allclose( + camera_param['w'] - kpts1[0][left][:1], + kpts2[0][right][:1], + atol=4.)) + self.assertTrue( + np.allclose(kpts1[0][left][1:], kpts2[0][right][1:], atol=4.)) + self.assertTrue( + np.allclose( + tar1[..., left, 1:], tar2[..., right, 1:], atol=4.)) diff --git a/tests/test_evaluation/test_metrics/test_keypoint_3d_metrics.py b/tests/test_evaluation/test_metrics/test_keypoint_3d_metrics.py index 8289b09d0f..391b7b194a 100644 --- a/tests/test_evaluation/test_metrics/test_keypoint_3d_metrics.py +++ b/tests/test_evaluation/test_metrics/test_keypoint_3d_metrics.py @@ -20,9 +20,10 @@ def setUp(self): for i in range(self.batch_size): gt_instances = InstanceData() keypoints = np.random.random((1, num_keypoints, 3)) - gt_instances.lifting_target = np.random.random((num_keypoints, 3)) + gt_instances.lifting_target = np.random.random( + (1, num_keypoints, 3)) gt_instances.lifting_target_visible = np.ones( - (num_keypoints, 1)).astype(bool) + (1, num_keypoints, 1)).astype(bool) pred_instances = InstanceData() pred_instances.keypoints = keypoints + np.random.normal( @@ -32,8 +33,10 @@ def 
setUp(self): data_sample = PoseDataSample( gt_instances=gt_instances, pred_instances=pred_instances) data_sample.set_metainfo( - dict(target_img_path='tests/data/h36m/S7/' - 'S7_Greeting.55011271/S7_Greeting.55011271_000396.jpg')) + dict(target_img_path=[ + 'tests/data/h36m/S7/' + 'S7_Greeting.55011271/S7_Greeting.55011271_000396.jpg' + ])) self.data_batch.append(data) self.data_samples.append(data_sample.to_dict()) diff --git a/tests/test_models/test_backbones/test_dstformer.py b/tests/test_models/test_backbones/test_dstformer.py new file mode 100644 index 0000000000..966ed6f49b --- /dev/null +++ b/tests/test_models/test_backbones/test_dstformer.py @@ -0,0 +1,36 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from unittest import TestCase + +import torch + +from mmpose.models.backbones import DSTFormer +from mmpose.models.backbones.dstformer import AttentionBlock + + +class TestDSTFormer(TestCase): + + def test_attention_block(self): + # Test that AttentionBlock keeps the input shape + block = AttentionBlock(dim=256, num_heads=2) + x = torch.rand(2, 17, 256) + x_out = block(x) + self.assertEqual(x_out.shape, torch.Size([2, 17, 256])) + + def test_DSTFormer(self): + # Test DSTFormer with depth=2 + model = DSTFormer(in_channels=3, depth=2, seq_len=2) + pose3d = torch.rand((1, 2, 17, 3)) + feat = model(pose3d) + self.assertEqual(feat[0].shape, (2, 17, 256)) + + # Test DSTFormer with depth=4 and qkv_bias=False + model = DSTFormer(in_channels=3, depth=4, seq_len=2, qkv_bias=False) + pose3d = torch.rand((1, 2, 17, 3)) + feat = model(pose3d) + self.assertEqual(feat[0].shape, (2, 17, 256)) + + # Test DSTFormer with depth=4 and att_fuse=False + model = DSTFormer(in_channels=3, depth=4, seq_len=2, att_fuse=False) + pose3d = torch.rand((1, 2, 17, 3)) + feat = model(pose3d) + self.assertEqual(feat[0].shape, (2, 17, 256))