From 084212ba55e7f77ca9ae4d3924fd0b68812f5d02 Mon Sep 17 00:00:00 2001 From: Mesopotamia <54797851+yzd-v@users.noreply.github.com> Date: Mon, 28 Aug 2023 18:33:06 +0800 Subject: [PATCH] [Feature] Add DWPose (#2643) --- .github/workflows/merge_stage_test.yml | 4 +- .github/workflows/pr_stage_test.yml | 4 +- .../wholebody_2d_keypoint/dwpose/README.md | 63 +++++ .../s1_dis/dwpose_l_dis_m_coco-256x192.py | 48 ++++ .../s1_dis/dwpose_x_dis_l_coco-384x288.py | 48 ++++ .../s2_dis/dwpose_l-ll_coco-384x288.py | 45 +++ .../s2_dis/dwpose_m-mm_coco-256x192.py | 45 +++ .../dwpose_l_dis_m_coco-ubody-256x192.py | 48 ++++ .../dwpose_l_dis_s_coco-ubody-256x192.py | 48 ++++ .../dwpose_l_dis_t_coco-ubody-256x192.py | 48 ++++ .../dwpose_x_dis_l_coco-ubody-256x192.py | 48 ++++ .../rtmpose_x_dis_l_coco-ubody-384x288.py | 48 ++++ .../s2_dis/dwpose_l-ll_coco-ubody-256x192.py | 45 +++ .../s2_dis/dwpose_l-ll_coco-ubody-384x288.py | 45 +++ .../s2_dis/dwpose_m-mm_coco-ubody-256x192.py | 45 +++ .../s2_dis/dwpose_s-ss_coco-ubody-256x192.py | 45 +++ .../s2_dis/dwpose_t-tt_coco-ubody-256x192.py | 45 +++ ...ose-l_8xb32-270e_coco-wholebody-384x288.py | 2 +- ...ose-l_8xb64-270e_coco-wholebody-256x192.py | 2 +- ...ose-m_8xb64-270e_coco-wholebody-256x192.py | 2 +- ...ose-x_8xb32-270e_coco-wholebody-384x288.py | 233 ++++++++++++++++ ...8xb32-270e_coco-ubody-wholebody-384x288.py | 256 +++++++++++++++++ ...8xb64-270e_coco-ubody-wholebody-256x192.py | 256 +++++++++++++++++ ...8xb64-270e_coco-ubody-wholebody-256x192.py | 256 +++++++++++++++++ ...8xb64-270e_coco-ubody-wholebody-256x192.py | 256 +++++++++++++++++ ...8xb64-270e_coco-ubody-wholebody-256x192.py | 256 +++++++++++++++++ ...8xb32-270e_coco-ubody-wholebody-384x288.py | 260 ++++++++++++++++++ ...8xb64-270e_coco-ubody-wholebody-256x192.py | 260 ++++++++++++++++++ docs/src/papers/algorithms/dwpose.md | 30 ++ mmpose/models/losses/__init__.py | 4 +- mmpose/models/losses/fea_dis_loss.py | 63 +++++ mmpose/models/losses/logit_dis_loss.py | 64 +++++ 
mmpose/models/pose_estimators/__init__.py | 6 +- .../pose_estimators/dwpose_distiller.py | 231 ++++++++++++++++ .../test_dwpose_distiller.py | 113 ++++++++ tools/misc/pth_transfer.py | 46 ++++ 36 files changed, 3309 insertions(+), 9 deletions(-) create mode 100644 configs/wholebody_2d_keypoint/dwpose/README.md create mode 100644 configs/wholebody_2d_keypoint/dwpose/coco-wholebody/s1_dis/dwpose_l_dis_m_coco-256x192.py create mode 100644 configs/wholebody_2d_keypoint/dwpose/coco-wholebody/s1_dis/dwpose_x_dis_l_coco-384x288.py create mode 100644 configs/wholebody_2d_keypoint/dwpose/coco-wholebody/s2_dis/dwpose_l-ll_coco-384x288.py create mode 100644 configs/wholebody_2d_keypoint/dwpose/coco-wholebody/s2_dis/dwpose_m-mm_coco-256x192.py create mode 100644 configs/wholebody_2d_keypoint/dwpose/ubody/s1_dis/dwpose_l_dis_m_coco-ubody-256x192.py create mode 100644 configs/wholebody_2d_keypoint/dwpose/ubody/s1_dis/dwpose_l_dis_s_coco-ubody-256x192.py create mode 100644 configs/wholebody_2d_keypoint/dwpose/ubody/s1_dis/dwpose_l_dis_t_coco-ubody-256x192.py create mode 100644 configs/wholebody_2d_keypoint/dwpose/ubody/s1_dis/dwpose_x_dis_l_coco-ubody-256x192.py create mode 100644 configs/wholebody_2d_keypoint/dwpose/ubody/s1_dis/rtmpose_x_dis_l_coco-ubody-384x288.py create mode 100644 configs/wholebody_2d_keypoint/dwpose/ubody/s2_dis/dwpose_l-ll_coco-ubody-256x192.py create mode 100644 configs/wholebody_2d_keypoint/dwpose/ubody/s2_dis/dwpose_l-ll_coco-ubody-384x288.py create mode 100644 configs/wholebody_2d_keypoint/dwpose/ubody/s2_dis/dwpose_m-mm_coco-ubody-256x192.py create mode 100644 configs/wholebody_2d_keypoint/dwpose/ubody/s2_dis/dwpose_s-ss_coco-ubody-256x192.py create mode 100644 configs/wholebody_2d_keypoint/dwpose/ubody/s2_dis/dwpose_t-tt_coco-ubody-256x192.py create mode 100644 configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose-x_8xb32-270e_coco-wholebody-384x288.py create mode 100644 
configs/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-l_8xb32-270e_coco-ubody-wholebody-384x288.py create mode 100644 configs/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-l_8xb64-270e_coco-ubody-wholebody-256x192.py create mode 100644 configs/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-m_8xb64-270e_coco-ubody-wholebody-256x192.py create mode 100644 configs/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-s_8xb64-270e_coco-ubody-wholebody-256x192.py create mode 100644 configs/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-t_8xb64-270e_coco-ubody-wholebody-256x192.py create mode 100644 configs/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-x_8xb32-270e_coco-ubody-wholebody-384x288.py create mode 100644 configs/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-x_8xb64-270e_coco-ubody-wholebody-256x192.py create mode 100644 docs/src/papers/algorithms/dwpose.md create mode 100644 mmpose/models/losses/fea_dis_loss.py create mode 100644 mmpose/models/losses/logit_dis_loss.py create mode 100644 mmpose/models/pose_estimators/dwpose_distiller.py create mode 100644 tests/test_models/test_pose_estimators/test_dwpose_distiller.py create mode 100644 tools/misc/pth_transfer.py diff --git a/.github/workflows/merge_stage_test.yml b/.github/workflows/merge_stage_test.yml index bb60ad40fa..cd6ef82565 100644 --- a/.github/workflows/merge_stage_test.yml +++ b/.github/workflows/merge_stage_test.yml @@ -208,17 +208,17 @@ jobs: - name: Install mmpose dependencies run: | python -m pip install -U numpy + python -m pip install --upgrade pip setuptools wheel python -m pip install git+https://github.com/open-mmlab/mmengine.git@main python -m pip install -U openmim mim install 'mmcv >= 2.0.0' - python -m pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x + mim install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x python -m pip install -r requirements/tests.txt python -m pip install -r requirements/runtime.txt python -m pip install -r requirements/albu.txt python -m pip install -r 
requirements/poseval.txt - name: Build and install run: | - python -m pip install --upgrade pip setuptools wheel python -m pip install -e . -v - name: Run unittests and generate coverage report run: | diff --git a/.github/workflows/pr_stage_test.yml b/.github/workflows/pr_stage_test.yml index 5ed6fc8ae7..d1a3089d84 100644 --- a/.github/workflows/pr_stage_test.yml +++ b/.github/workflows/pr_stage_test.yml @@ -178,16 +178,16 @@ jobs: - name: Install mmpose dependencies run: | python -m pip install -U numpy + python -m pip install --upgrade pip setuptools wheel python -m pip install git+https://github.com/open-mmlab/mmengine.git@main python -m pip install -U openmim mim install 'mmcv >= 2.0.0' - python -m pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x + mim install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x python -m pip install -r requirements/tests.txt python -m pip install -r requirements/albu.txt python -m pip install -r requirements/poseval.txt - name: Build and install run: | - python -m pip install --upgrade pip setuptools wheel python -m pip install -e . -v - name: Run unittests and generate coverage report run: | diff --git a/configs/wholebody_2d_keypoint/dwpose/README.md b/configs/wholebody_2d_keypoint/dwpose/README.md new file mode 100644 index 0000000000..d85cb48c53 --- /dev/null +++ b/configs/wholebody_2d_keypoint/dwpose/README.md @@ -0,0 +1,63 @@ +# DWPose + +Whole-body pose estimation localizes the human body, hand, face, and foot keypoints in an image. This task is challenging due to multi-scale body parts, fine-grained localization for low-resolution regions, and data scarcity. Meanwhile, applying a highly efficient and accurate pose estimator to widely human-centric understanding and generation tasks is urgent. In this work, we present a two-stage pose **D**istillation for **W**hole-body **P**ose estimators, named **DWPose**, to improve their effectiveness and efficiency. 
The first-stage distillation designs a weight-decay strategy while utilizing a teacher's intermediate feature and final logits with both visible and invisible keypoints to supervise the student from scratch. The second stage distills the student model itself to further improve performance. Different from the previous self-knowledge distillation, this stage finetunes the student's head with only 20% training time as a plug-and-play training strategy. For data limitations, we explore the UBody dataset that contains diverse facial expressions and hand gestures for real-life applications. Comprehensive experiments show the superiority of our proposed simple yet effective methods. We achieve new state-of-the-art performance on COCO-WholeBody, significantly boosting the whole-body AP of RTMPose-l from 64.8% to 66.5%, even surpassing RTMPose-x teacher with 65.3% AP. We release a series of models with different sizes, from tiny to large, for satisfying various downstream tasks. + +## Results and Models + +### COCO-WholeBody Dataset + +Results on COCO-WholeBody v1.0 val with detector having human AP of 56.4 on COCO val2017 dataset + +- DWPose Models are supported by [DWPose](https://github.com/IDEA-Research/DWPose) +- Models are trained and distilled on the following datasets: + - [COCO-WholeBody](https://github.com/jin-s13/COCO-WholeBody/) + - [UBody](https://github.com/IDEA-Research/OSX) + +| Config | S1 Dis_config | S2 Dis_config | Input Size | Whole AP | Whole AR | FLOPS<br>(G) | ORT-Latency<br>(ms)<br>(i7-11700) | TRT-FP16-Latency<br>(ms)<br>(GTX 1660Ti) | Download | +| :----------- | :-----------------: | :-----------------: | :--------: | :------: | :------: | :---------------: | :-----------------------------------------: | :------------------------------------------------: | :------------: | +| [DWPose-t](../rtmpose/ubody/rtmpose-t_8xb64-270e_coco-ubody-wholebody-256x192.py) | [DW l-t](../dwpose/ubody/s1_dis/dwpose_l_dis_t_coco-ubody-256x192.py) | [DW t-t](../dwpose/ubody/s2_dis/dwpose_t-tt_coco-ubody-256x192.py) | 256x192 | 48.5 | 58.4 | 0.5 | - | - | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-t_simcc-ucoco_dw-ucoco_270e-256x192-dcf277bf_20230728.pth) | +| [DWPose-s](../rtmpose/ubody/rtmpose-s_8xb64-270e_coco-ubody-wholebody-256x192.py) | [DW l-s](../dwpose/ubody/s1_dis/dwpose_l_dis_s_coco-ubody-256x192.py) | [DW s-s](../dwpose/ubody/s2_dis/dwpose_s-ss_coco-ubody-256x192.py) | 256x192 | 53.8 | 63.2 | 0.9 | - | - | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-s_simcc-ucoco_dw-ucoco_270e-256x192-3fd922c8_20230728.pth) | +| [DWPose-m](../rtmpose/ubody/rtmpose-m_8xb64-270e_coco-ubody-wholebody-256x192.py) | [DW l-m](../dwpose/ubody/s1_dis/dwpose_l_dis_m_coco-ubody-256x192.py) | [DW m-m](../dwpose/ubody/s2_dis/dwpose_m-mm_coco-ubody-256x192.py) | 256x192 | 60.6 | 69.5 | 2.22 | 13.50 | 4.00 | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-ucoco_dw-ucoco_270e-256x192-c8b76419_20230728.pth) | +| [DWPose-l](../rtmpose/ubody/rtmpose-l_8xb64-270e_coco-ubody-wholebody-256x192.py) | [DW x-l](../dwpose/ubody/s1_dis/dwpose_x_dis_l_coco-ubody-256x192.py) | [DW l-l](../dwpose/ubody/s2_dis/dwpose_l-ll_coco-ubody-256x192.py) | 256x192 | 63.1 | 71.7 | 4.52 | 23.41 | 5.67 | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-ucoco_dw-ucoco_270e-256x192-4d6dfc62_20230728.pth) | +| [DWPose-l](../rtmpose/ubody/rtmpose-l_8xb32-270e_coco-ubody-wholebody-384x288.py) | [DW
x-l](../dwpose/ubody/s1_dis/rtmpose_x_dis_l_coco-ubody-384x288.py) | [DW l-l](../dwpose/ubody/s2_dis/dwpose_l-ll_coco-ubody-384x288.py) | 384x288 | 66.5 | 74.3 | 10.07 | 44.58 | 7.68 | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-ucoco_dw-ucoco_270e-384x288-2438fd99_20230728.pth) | + +## Train a model + +### Train DWPose with the first stage distillation + +``` +bash tools/dist_train.sh configs/wholebody_2d_keypoint/dwpose/ubody/s1_dis/rtmpose_x_dis_l_coco-ubody-384x288.py 8 +``` + +### Transfer the S1 distillation models into regular models + +``` +# first stage distillation +python tools/misc/pth_transfer.py $dis_ckpt $new_pose_ckpt +``` + +⭐Before S2 distillation, you should add your model path into 'teacher_pretrained' of your S2 dis_config. + +### Train DWPose with the second stage distillation + +``` +bash tools/dist_train.sh configs/wholebody_2d_keypoint/dwpose/ubody/s2_dis/dwpose_l-ll_coco-ubody-384x288.py 8 +``` + +### Transfer the S2 distillation models into regular models + +``` +# second stage distillation +python tools/misc/pth_transfer.py $dis_ckpt $new_pose_ckpt --two_dis +``` + +## Citation + +``` +@article{yang2023effective, + title={Effective Whole-body Pose Estimation with Two-stages Distillation}, + author={Yang, Zhendong and Zeng, Ailing and Yuan, Chun and Li, Yu}, + journal={arXiv preprint arXiv:2307.15880}, + year={2023} +} +``` diff --git a/configs/wholebody_2d_keypoint/dwpose/coco-wholebody/s1_dis/dwpose_l_dis_m_coco-256x192.py b/configs/wholebody_2d_keypoint/dwpose/coco-wholebody/s1_dis/dwpose_l_dis_m_coco-256x192.py new file mode 100644 index 0000000000..422871acbb --- /dev/null +++ b/configs/wholebody_2d_keypoint/dwpose/coco-wholebody/s1_dis/dwpose_l_dis_m_coco-256x192.py @@ -0,0 +1,48 @@ +_base_ = [ + '../../../rtmpose/coco-wholebody/rtmpose-m_8xb64-270e_coco-wholebody-256x192.py' # noqa: E501 +] + +# model settings +find_unused_parameters = False + +# config settings +fea = True +logit = True + +# method details +model = 
dict( + _delete_=True, + type='DWPoseDistiller', + teacher_pretrained='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmpose/rtmpose-l_simcc-coco-wholebody_pt-aic-coco_270e-256x192-6f206314_20230124.pth', # noqa: E501 + teacher_cfg='configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/' + 'rtmpose-l_8xb64-270e_coco-wholebody-256x192.py', # noqa: E501 + student_cfg='configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/' + 'rtmpose-m_8xb64-270e_coco-wholebody-256x192.py', # noqa: E501 + distill_cfg=[ + dict(methods=[ + dict( + type='FeaLoss', + name='loss_fea', + use_this=fea, + student_channels=768, + teacher_channels=1024, + alpha_fea=0.00007, + ) + ]), + dict(methods=[ + dict( + type='KDLoss', + name='loss_logit', + use_this=logit, + weight=0.1, + ) + ]), + ], + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), +) +optim_wrapper = dict(clip_grad=dict(max_norm=1., norm_type=2)) diff --git a/configs/wholebody_2d_keypoint/dwpose/coco-wholebody/s1_dis/dwpose_x_dis_l_coco-384x288.py b/configs/wholebody_2d_keypoint/dwpose/coco-wholebody/s1_dis/dwpose_x_dis_l_coco-384x288.py new file mode 100644 index 0000000000..150cb2bbe6 --- /dev/null +++ b/configs/wholebody_2d_keypoint/dwpose/coco-wholebody/s1_dis/dwpose_x_dis_l_coco-384x288.py @@ -0,0 +1,48 @@ +_base_ = [ + '../../../rtmpose/coco-wholebody/rtmpose-l_8xb32-270e_coco-wholebody-384x288.py' # noqa: E501 +] + +# model settings +find_unused_parameters = False + +# config settings +fea = True +logit = True + +# method details +model = dict( + _delete_=True, + type='DWPoseDistiller', + teacher_pretrained='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmposev1/rtmpose-x_simcc-coco-wholebody_pt-body7_270e-384x288-401dfc90_20230629.pth', # noqa: E501 + teacher_cfg='configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/' + 'rtmpose-x_8xb32-270e_coco-wholebody-384x288.py', # noqa: E501 + 
student_cfg='configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/' + 'rtmpose-l_8xb32-270e_coco-wholebody-384x288.py', # noqa: E501 + distill_cfg=[ + dict(methods=[ + dict( + type='FeaLoss', + name='loss_fea', + use_this=fea, + student_channels=1024, + teacher_channels=1280, + alpha_fea=0.00007, + ) + ]), + dict(methods=[ + dict( + type='KDLoss', + name='loss_logit', + use_this=logit, + weight=0.1, + ) + ]), + ], + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), +) +optim_wrapper = dict(clip_grad=dict(max_norm=1., norm_type=2)) diff --git a/configs/wholebody_2d_keypoint/dwpose/coco-wholebody/s2_dis/dwpose_l-ll_coco-384x288.py b/configs/wholebody_2d_keypoint/dwpose/coco-wholebody/s2_dis/dwpose_l-ll_coco-384x288.py new file mode 100644 index 0000000000..6c63f99b0c --- /dev/null +++ b/configs/wholebody_2d_keypoint/dwpose/coco-wholebody/s2_dis/dwpose_l-ll_coco-384x288.py @@ -0,0 +1,45 @@ +_base_ = [ + '../../../rtmpose/coco-wholebody/rtmpose-l_8xb32-270e_coco-wholebody-384x288.py' # noqa: E501 +] + +# model settings +find_unused_parameters = True + +# dis settings +second_dis = True + +# config settings +logit = True + +train_cfg = dict(max_epochs=60, val_interval=10) + +# method details +model = dict( + _delete_=True, + type='DWPoseDistiller', + two_dis=second_dis, + teacher_pretrained='work_dirs/' + 'dwpose_x_dis_l_coco-384x288/dw-x-l_coco_384.pth', # noqa: E501 + teacher_cfg='configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/' + 'rtmpose-l_8xb32-270e_coco-wholebody-384x288.py', # noqa: E501 + student_cfg='configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/' + 'rtmpose-l_8xb32-270e_coco-wholebody-384x288.py', # noqa: E501 + distill_cfg=[ + dict(methods=[ + dict( + type='KDLoss', + name='loss_logit', + use_this=logit, + weight=1, + ) + ]), + ], + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + 
bgr_to_rgb=True), + train_cfg=train_cfg, +) + +optim_wrapper = dict(clip_grad=dict(max_norm=1., norm_type=2)) diff --git a/configs/wholebody_2d_keypoint/dwpose/coco-wholebody/s2_dis/dwpose_m-mm_coco-256x192.py b/configs/wholebody_2d_keypoint/dwpose/coco-wholebody/s2_dis/dwpose_m-mm_coco-256x192.py new file mode 100644 index 0000000000..943ec60184 --- /dev/null +++ b/configs/wholebody_2d_keypoint/dwpose/coco-wholebody/s2_dis/dwpose_m-mm_coco-256x192.py @@ -0,0 +1,45 @@ +_base_ = [ + '../../../rtmpose/coco-wholebody/rtmpose-m_8xb64-270e_coco-wholebody-256x192.py' # noqa: E501 +] + +# model settings +find_unused_parameters = True + +# dis settings +second_dis = True + +# config settings +logit = True + +train_cfg = dict(max_epochs=60, val_interval=10) + +# method details +model = dict( + _delete_=True, + type='DWPoseDistiller', + two_dis=second_dis, + teacher_pretrained='work_dirs/' + 'dwpose_l_dis_m_coco-256x192/dw-l-m_coco_256.pth', # noqa: E501 + teacher_cfg='configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/' + 'rtmpose-m_8xb64-270e_coco-wholebody-256x192.py', # noqa: E501 + student_cfg='configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/' + 'rtmpose-m_8xb64-270e_coco-wholebody-256x192.py', # noqa: E501 + distill_cfg=[ + dict(methods=[ + dict( + type='KDLoss', + name='loss_logit', + use_this=logit, + weight=1, + ) + ]), + ], + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + train_cfg=train_cfg, +) + +optim_wrapper = dict(clip_grad=dict(max_norm=1., norm_type=2)) diff --git a/configs/wholebody_2d_keypoint/dwpose/ubody/s1_dis/dwpose_l_dis_m_coco-ubody-256x192.py b/configs/wholebody_2d_keypoint/dwpose/ubody/s1_dis/dwpose_l_dis_m_coco-ubody-256x192.py new file mode 100644 index 0000000000..b3a917b96e --- /dev/null +++ b/configs/wholebody_2d_keypoint/dwpose/ubody/s1_dis/dwpose_l_dis_m_coco-ubody-256x192.py @@ -0,0 +1,48 @@ +_base_ = [ + 
'../../../rtmpose/ubody/rtmpose-m_8xb64-270e_coco-ubody-wholebody-256x192.py' # noqa: E501 +] + +# model settings +find_unused_parameters = False + +# config settings +fea = True +logit = True + +# method details +model = dict( + _delete_=True, + type='DWPoseDistiller', + teacher_pretrained='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmposev1/rtmpose-l_ucoco_256x192-95bb32f5_20230822.pth', # noqa: E501 + teacher_cfg='configs/wholebody_2d_keypoint/rtmpose/ubody/' + 'rtmpose-l_8xb64-270e_coco-ubody-wholebody-256x192.py', # noqa: E501 + student_cfg='configs/wholebody_2d_keypoint/rtmpose/ubody/' + 'rtmpose-m_8xb64-270e_coco-ubody-wholebody-256x192.py', # noqa: E501 + distill_cfg=[ + dict(methods=[ + dict( + type='FeaLoss', + name='loss_fea', + use_this=fea, + student_channels=768, + teacher_channels=1024, + alpha_fea=0.00007, + ) + ]), + dict(methods=[ + dict( + type='KDLoss', + name='loss_logit', + use_this=logit, + weight=0.1, + ) + ]), + ], + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), +) +optim_wrapper = dict(clip_grad=dict(max_norm=1., norm_type=2)) diff --git a/configs/wholebody_2d_keypoint/dwpose/ubody/s1_dis/dwpose_l_dis_s_coco-ubody-256x192.py b/configs/wholebody_2d_keypoint/dwpose/ubody/s1_dis/dwpose_l_dis_s_coco-ubody-256x192.py new file mode 100644 index 0000000000..c90a0ea6a7 --- /dev/null +++ b/configs/wholebody_2d_keypoint/dwpose/ubody/s1_dis/dwpose_l_dis_s_coco-ubody-256x192.py @@ -0,0 +1,48 @@ +_base_ = [ + '../../../rtmpose/ubody/rtmpose-s_8xb64-270e_coco-ubody-wholebody-256x192.py' # noqa: E501 +] + +# model settings +find_unused_parameters = False + +# config settings +fea = True +logit = True + +# method details +model = dict( + _delete_=True, + type='DWPoseDistiller', + teacher_pretrained='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmposev1/rtmpose-l_ucoco_256x192-95bb32f5_20230822.pth', # noqa: E501 + 
teacher_cfg='configs/wholebody_2d_keypoint/rtmpose/ubody/' + 'rtmpose-l_8xb64-270e_coco-ubody-wholebody-256x192.py', # noqa: E501 + student_cfg='configs/wholebody_2d_keypoint/rtmpose/ubody/' + 'rtmpose-s_8xb64-270e_coco-ubody-wholebody-256x192.py', # noqa: E501 + distill_cfg=[ + dict(methods=[ + dict( + type='FeaLoss', + name='loss_fea', + use_this=fea, + student_channels=512, + teacher_channels=1024, + alpha_fea=0.00007, + ) + ]), + dict(methods=[ + dict( + type='KDLoss', + name='loss_logit', + use_this=logit, + weight=0.1, + ) + ]), + ], + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), +) +optim_wrapper = dict(clip_grad=dict(max_norm=1., norm_type=2)) diff --git a/configs/wholebody_2d_keypoint/dwpose/ubody/s1_dis/dwpose_l_dis_t_coco-ubody-256x192.py b/configs/wholebody_2d_keypoint/dwpose/ubody/s1_dis/dwpose_l_dis_t_coco-ubody-256x192.py new file mode 100644 index 0000000000..01618f146a --- /dev/null +++ b/configs/wholebody_2d_keypoint/dwpose/ubody/s1_dis/dwpose_l_dis_t_coco-ubody-256x192.py @@ -0,0 +1,48 @@ +_base_ = [ + '../../../rtmpose/ubody/rtmpose-t_8xb64-270e_coco-ubody-wholebody-256x192.py' # noqa: E501 +] + +# model settings +find_unused_parameters = False + +# config settings +fea = True +logit = True + +# method details +model = dict( + _delete_=True, + type='DWPoseDistiller', + teacher_pretrained='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmposev1/rtmpose-l_ucoco_256x192-95bb32f5_20230822.pth', # noqa: E501 + teacher_cfg='configs/wholebody_2d_keypoint/rtmpose/ubody/' + 'rtmpose-l_8xb64-270e_coco-ubody-wholebody-256x192.py', # noqa: E501 + student_cfg='configs/wholebody_2d_keypoint/rtmpose/ubody/' + 'rtmpose-t_8xb64-270e_coco-ubody-wholebody-256x192.py', # noqa: E501 + distill_cfg=[ + dict(methods=[ + dict( + type='FeaLoss', + name='loss_fea', + use_this=fea, + student_channels=384, + teacher_channels=1024, + alpha_fea=0.00007, + ) + ]), + 
dict(methods=[ + dict( + type='KDLoss', + name='loss_logit', + use_this=logit, + weight=0.1, + ) + ]), + ], + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), +) +optim_wrapper = dict(clip_grad=dict(max_norm=1., norm_type=2)) diff --git a/configs/wholebody_2d_keypoint/dwpose/ubody/s1_dis/dwpose_x_dis_l_coco-ubody-256x192.py b/configs/wholebody_2d_keypoint/dwpose/ubody/s1_dis/dwpose_x_dis_l_coco-ubody-256x192.py new file mode 100644 index 0000000000..85a287324b --- /dev/null +++ b/configs/wholebody_2d_keypoint/dwpose/ubody/s1_dis/dwpose_x_dis_l_coco-ubody-256x192.py @@ -0,0 +1,48 @@ +_base_ = [ + '../../../rtmpose/ubody/rtmpose-l_8xb64-270e_coco-ubody-wholebody-256x192.py' # noqa: E501 +] + +# model settings +find_unused_parameters = False + +# config settings +fea = True +logit = True + +# method details +model = dict( + _delete_=True, + type='DWPoseDistiller', + teacher_pretrained='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmposev1/rtmpose-x_ucoco_256x192-05f5bcb7_20230822.pth', # noqa: E501 + teacher_cfg='configs/wholebody_2d_keypoint/rtmpose/ubody/' + 'rtmpose-x_8xb64-270e_coco-ubody-wholebody-256x192.py', # noqa: E501 + student_cfg='configs/wholebody_2d_keypoint/rtmpose/ubody/' + 'rtmpose-l_8xb64-270e_coco-ubody-wholebody-256x192.py', # noqa: E501 + distill_cfg=[ + dict(methods=[ + dict( + type='FeaLoss', + name='loss_fea', + use_this=fea, + student_channels=1024, + teacher_channels=1280, + alpha_fea=0.00007, + ) + ]), + dict(methods=[ + dict( + type='KDLoss', + name='loss_logit', + use_this=logit, + weight=0.1, + ) + ]), + ], + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), +) +optim_wrapper = dict(clip_grad=dict(max_norm=1., norm_type=2)) diff --git a/configs/wholebody_2d_keypoint/dwpose/ubody/s1_dis/rtmpose_x_dis_l_coco-ubody-384x288.py 
b/configs/wholebody_2d_keypoint/dwpose/ubody/s1_dis/rtmpose_x_dis_l_coco-ubody-384x288.py new file mode 100644 index 0000000000..acde64a03a --- /dev/null +++ b/configs/wholebody_2d_keypoint/dwpose/ubody/s1_dis/rtmpose_x_dis_l_coco-ubody-384x288.py @@ -0,0 +1,48 @@ +_base_ = [ + '../../../rtmpose/ubody/rtmpose-l_8xb32-270e_coco-ubody-wholebody-384x288.py' # noqa: E501 +] + +# model settings +find_unused_parameters = False + +# config settings +fea = True +logit = True + +# method details +model = dict( + _delete_=True, + type='DWPoseDistiller', + teacher_pretrained='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmposev1/rtmpose-x_ucoco_384x288-f5b50679_20230822.pth', # noqa: E501 + teacher_cfg='configs/wholebody_2d_keypoint/rtmpose/ubody/' + 'rtmpose-x_8xb32-270e_coco-ubody-wholebody-384x288.py', # noqa: E501 + student_cfg='configs/wholebody_2d_keypoint/rtmpose/ubody/' + 'rtmpose-l_8xb32-270e_coco-ubody-wholebody-384x288.py', # noqa: E501 + distill_cfg=[ + dict(methods=[ + dict( + type='FeaLoss', + name='loss_fea', + use_this=fea, + student_channels=1024, + teacher_channels=1280, + alpha_fea=0.00007, + ) + ]), + dict(methods=[ + dict( + type='KDLoss', + name='loss_logit', + use_this=logit, + weight=0.1, + ) + ]), + ], + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), +) +optim_wrapper = dict(clip_grad=dict(max_norm=1., norm_type=2)) diff --git a/configs/wholebody_2d_keypoint/dwpose/ubody/s2_dis/dwpose_l-ll_coco-ubody-256x192.py b/configs/wholebody_2d_keypoint/dwpose/ubody/s2_dis/dwpose_l-ll_coco-ubody-256x192.py new file mode 100644 index 0000000000..e3f456a2b9 --- /dev/null +++ b/configs/wholebody_2d_keypoint/dwpose/ubody/s2_dis/dwpose_l-ll_coco-ubody-256x192.py @@ -0,0 +1,45 @@ +_base_ = [ + '../../../rtmpose/ubody/rtmpose-l_8xb64-270e_coco-ubody-wholebody-256x192.py' # noqa: E501 +] + +# model settings +find_unused_parameters = True + +# dis settings +second_dis 
= True + +# config settings +logit = True + +train_cfg = dict(max_epochs=60, val_interval=10) + +# method details +model = dict( + _delete_=True, + type='DWPoseDistiller', + two_dis=second_dis, + teacher_pretrained='work_dirs/' + 'dwpose_x_dis_l_coco-ubody-256x192/dw-x-l_ucoco_256.pth', # noqa: E501 + teacher_cfg='configs/wholebody_2d_keypoint/rtmpose/ubody/' + 'rtmpose-l_8xb64-270e_coco-ubody-wholebody-256x192.py', # noqa: E501 + student_cfg='configs/wholebody_2d_keypoint/rtmpose/ubody/' + 'rtmpose-l_8xb64-270e_coco-ubody-wholebody-256x192.py', # noqa: E501 + distill_cfg=[ + dict(methods=[ + dict( + type='KDLoss', + name='loss_logit', + use_this=logit, + weight=1, + ) + ]), + ], + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + train_cfg=train_cfg, +) + +optim_wrapper = dict(clip_grad=dict(max_norm=1., norm_type=2)) diff --git a/configs/wholebody_2d_keypoint/dwpose/ubody/s2_dis/dwpose_l-ll_coco-ubody-384x288.py b/configs/wholebody_2d_keypoint/dwpose/ubody/s2_dis/dwpose_l-ll_coco-ubody-384x288.py new file mode 100644 index 0000000000..3815fad1e2 --- /dev/null +++ b/configs/wholebody_2d_keypoint/dwpose/ubody/s2_dis/dwpose_l-ll_coco-ubody-384x288.py @@ -0,0 +1,45 @@ +_base_ = [ + '../../../rtmpose/ubody/rtmpose-l_8xb32-270e_coco-ubody-wholebody-384x288.py' # noqa: E501 +] + +# model settings +find_unused_parameters = True + +# dis settings +second_dis = True + +# config settings +logit = True + +train_cfg = dict(max_epochs=60, val_interval=10) + +# method details +model = dict( + _delete_=True, + type='DWPoseDistiller', + two_dis=second_dis, + teacher_pretrained='work_dirs/' + 'dwpose_x_dis_l_coco-ubody-384x288/dw-x-l_ucoco_384.pth', # noqa: E501 + teacher_cfg='configs/wholebody_2d_keypoint/rtmpose/ubody/' + 'rtmpose-l_8xb32-270e_coco-ubody-wholebody-384x288.py', # noqa: E501 + student_cfg='configs/wholebody_2d_keypoint/rtmpose/ubody/' + 
'rtmpose-l_8xb32-270e_coco-ubody-wholebody-384x288.py', # noqa: E501 + distill_cfg=[ + dict(methods=[ + dict( + type='KDLoss', + name='loss_logit', + use_this=logit, + weight=1, + ) + ]), + ], + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + train_cfg=train_cfg, +) + +optim_wrapper = dict(clip_grad=dict(max_norm=1., norm_type=2)) diff --git a/configs/wholebody_2d_keypoint/dwpose/ubody/s2_dis/dwpose_m-mm_coco-ubody-256x192.py b/configs/wholebody_2d_keypoint/dwpose/ubody/s2_dis/dwpose_m-mm_coco-ubody-256x192.py new file mode 100644 index 0000000000..1e6834ffca --- /dev/null +++ b/configs/wholebody_2d_keypoint/dwpose/ubody/s2_dis/dwpose_m-mm_coco-ubody-256x192.py @@ -0,0 +1,45 @@ +_base_ = [ + '../../../rtmpose/ubody/rtmpose-m_8xb64-270e_coco-ubody-wholebody-256x192.py' # noqa: E501 +] + +# model settings +find_unused_parameters = True + +# dis settings +second_dis = True + +# config settings +logit = True + +train_cfg = dict(max_epochs=60, val_interval=10) + +# method details +model = dict( + _delete_=True, + type='DWPoseDistiller', + two_dis=second_dis, + teacher_pretrained='work_dirs/' + 'dwpose_l_dis_m_coco-ubody-256x192/dw-l-m_ucoco_256.pth', # noqa: E501 + teacher_cfg='configs/wholebody_2d_keypoint/rtmpose/ubody/' + 'rtmpose-m_8xb64-270e_coco-ubody-wholebody-256x192.py', # noqa: E501 + student_cfg='configs/wholebody_2d_keypoint/rtmpose/ubody/' + 'rtmpose-m_8xb64-270e_coco-ubody-wholebody-256x192.py', # noqa: E501 + distill_cfg=[ + dict(methods=[ + dict( + type='KDLoss', + name='loss_logit', + use_this=logit, + weight=1, + ) + ]), + ], + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + train_cfg=train_cfg, +) + +optim_wrapper = dict(clip_grad=dict(max_norm=1., norm_type=2)) diff --git a/configs/wholebody_2d_keypoint/dwpose/ubody/s2_dis/dwpose_s-ss_coco-ubody-256x192.py 
b/configs/wholebody_2d_keypoint/dwpose/ubody/s2_dis/dwpose_s-ss_coco-ubody-256x192.py new file mode 100644 index 0000000000..24a4a94642 --- /dev/null +++ b/configs/wholebody_2d_keypoint/dwpose/ubody/s2_dis/dwpose_s-ss_coco-ubody-256x192.py @@ -0,0 +1,45 @@ +_base_ = [ + '../../../rtmpose/ubody/rtmpose-s_8xb64-270e_coco-ubody-wholebody-256x192.py' # noqa: E501 +] + +# model settings +find_unused_parameters = True + +# dis settings +second_dis = True + +# config settings +logit = True + +train_cfg = dict(max_epochs=60, val_interval=10) + +# method details +model = dict( + _delete_=True, + type='DWPoseDistiller', + two_dis=second_dis, + teacher_pretrained='work_dirs/' + 'dwpose_l_dis_s_coco-ubody-256x192/dw-l-s_ucoco_256.pth', # noqa: E501 + teacher_cfg='configs/wholebody_2d_keypoint/rtmpose/ubody/' + 'rtmpose-s_8xb64-270e_coco-ubody-wholebody-256x192.py', # noqa: E501 + student_cfg='configs/wholebody_2d_keypoint/rtmpose/ubody/' + 'rtmpose-s_8xb64-270e_coco-ubody-wholebody-256x192.py', # noqa: E501 + distill_cfg=[ + dict(methods=[ + dict( + type='KDLoss', + name='loss_logit', + use_this=logit, + weight=1, + ) + ]), + ], + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + train_cfg=train_cfg, +) + +optim_wrapper = dict(clip_grad=dict(max_norm=1., norm_type=2)) diff --git a/configs/wholebody_2d_keypoint/dwpose/ubody/s2_dis/dwpose_t-tt_coco-ubody-256x192.py b/configs/wholebody_2d_keypoint/dwpose/ubody/s2_dis/dwpose_t-tt_coco-ubody-256x192.py new file mode 100644 index 0000000000..c7c322ece2 --- /dev/null +++ b/configs/wholebody_2d_keypoint/dwpose/ubody/s2_dis/dwpose_t-tt_coco-ubody-256x192.py @@ -0,0 +1,45 @@ +_base_ = [ + '../../../rtmpose/ubody/rtmpose-t_8xb64-270e_coco-ubody-wholebody-256x192.py' # noqa: E501 +] + +# model settings +find_unused_parameters = True + +# dis settings +second_dis = True + +# config settings +logit = True + +train_cfg = dict(max_epochs=60, 
val_interval=10) + +# method details +model = dict( + _delete_=True, + type='DWPoseDistiller', + two_dis=second_dis, + teacher_pretrained='work_dirs/' + 'dwpose_l_dis_t_coco-ubody-256x192/dw-l-t_ucoco_256.pth', # noqa: E501 + teacher_cfg='configs/wholebody_2d_keypoint/rtmpose/ubody/' + 'rtmpose-t_8xb64-270e_coco-ubody-wholebody-256x192.py', # noqa: E501 + student_cfg='configs/wholebody_2d_keypoint/rtmpose/ubody/' + 'rtmpose-t_8xb64-270e_coco-ubody-wholebody-256x192.py', # noqa: E501 + distill_cfg=[ + dict(methods=[ + dict( + type='KDLoss', + name='loss_logit', + use_this=logit, + weight=1, + ) + ]), + ], + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + train_cfg=train_cfg, +) + +optim_wrapper = dict(clip_grad=dict(max_norm=1., norm_type=2)) diff --git a/configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose-l_8xb32-270e_coco-wholebody-384x288.py b/configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose-l_8xb32-270e_coco-wholebody-384x288.py index af2c133f22..39a6ff79d7 100644 --- a/configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose-l_8xb32-270e_coco-wholebody-384x288.py +++ b/configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose-l_8xb32-270e_coco-wholebody-384x288.py @@ -68,7 +68,7 @@ type='Pretrained', prefix='backbone.', checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' - 'rtmposev1/cspnext-l_udp-aic-coco_210e-256x192-273b7631_20230130.pth' # noqa + 'rtmposev1/cspnext-l_udp-aic-coco_210e-256x192-273b7631_20230130.pth' # noqa: E501 )), head=dict( type='RTMCCHead', diff --git a/configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose-l_8xb64-270e_coco-wholebody-256x192.py b/configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose-l_8xb64-270e_coco-wholebody-256x192.py index 7765c9ec44..9f32f25777 100644 --- a/configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose-l_8xb64-270e_coco-wholebody-256x192.py +++ 
b/configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose-l_8xb64-270e_coco-wholebody-256x192.py @@ -68,7 +68,7 @@ type='Pretrained', prefix='backbone.', checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' - 'rtmposev1/cspnext-l_udp-aic-coco_210e-256x192-273b7631_20230130.pth' # noqa + 'rtmposev1/cspnext-l_udp-aic-coco_210e-256x192-273b7631_20230130.pth' # noqa: E501 )), head=dict( type='RTMCCHead', diff --git a/configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose-m_8xb64-270e_coco-wholebody-256x192.py b/configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose-m_8xb64-270e_coco-wholebody-256x192.py index 1e2afc518d..8c8c92d5f7 100644 --- a/configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose-m_8xb64-270e_coco-wholebody-256x192.py +++ b/configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose-m_8xb64-270e_coco-wholebody-256x192.py @@ -68,7 +68,7 @@ type='Pretrained', prefix='backbone.', checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' - 'rtmposev1/cspnext-m_udp-aic-coco_210e-256x192-f2f7d6f6_20230130.pth' # noqa + 'rtmposev1/cspnext-m_udp-aic-coco_210e-256x192-f2f7d6f6_20230130.pth' # noqa: E501 )), head=dict( type='RTMCCHead', diff --git a/configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose-x_8xb32-270e_coco-wholebody-384x288.py b/configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose-x_8xb32-270e_coco-wholebody-384x288.py new file mode 100644 index 0000000000..55b11c419a --- /dev/null +++ b/configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose-x_8xb32-270e_coco-wholebody-384x288.py @@ -0,0 +1,233 @@ +_base_ = ['mmpose::_base_/default_runtime.py'] + +# common setting +num_keypoints = 133 +input_size = (288, 384) + +# runtime +max_epochs = 270 +stage2_num_epochs = 30 +base_lr = 4e-3 +train_batch_size = 32 +val_batch_size = 32 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + 
optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + clip_grad=dict(max_norm=35, norm_type=2), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=input_size, + sigma=(6., 6.93), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=1.33, + widen_factor=1.25, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmposev1/cspnext-x_udp-body7_210e-384x288-d28b58e6_20230529.pth' # noqa: E501 + )), + head=dict( + type='RTMCCHead', + in_channels=1280, + out_channels=num_keypoints, + input_size=codec['input_size'], + in_featuremap_size=tuple([s // 32 for s in codec['input_size']]), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + 
test_cfg=dict(flip_test=True, )) + +# base dataset settings +dataset_type = 'CocoWholeBodyDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +backend_args = dict(backend='local') + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=90), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.5, 1.5], + rotate_factor=90), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=train_batch_size, + num_workers=10, + persistent_workers=True, + 
sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=val_batch_size, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict( + save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoWholeBodyMetric', + ann_file=data_root + 'annotations/coco_wholebody_val_v1.0.json') +test_evaluator = val_evaluator diff --git a/configs/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-l_8xb32-270e_coco-ubody-wholebody-384x288.py b/configs/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-l_8xb32-270e_coco-ubody-wholebody-384x288.py new file mode 100644 index 0000000000..203766402c --- /dev/null +++ b/configs/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-l_8xb32-270e_coco-ubody-wholebody-384x288.py @@ -0,0 +1,256 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 270 +stage2_num_epochs = 30 +base_lr = 4e-3 +train_batch_size = 32 +val_batch_size = 32 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = 
dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from epoch 135 to 270 + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(288, 384), + sigma=(6., 6.93), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=1., + widen_factor=1., + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmpose/cspnext-l_udp-aic-coco_210e-256x192-273b7631_20230130.pth' # noqa: E501 + )), + head=dict( + type='RTMCCHead', + in_channels=1024, + out_channels=133, + input_size=codec['input_size'], + in_featuremap_size=(9, 12), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), +
test_cfg=dict(flip_test=True, )) + +# base dataset settings +dataset_type = 'UBody2dDataset' +data_mode = 'topdown' +data_root = 'data/UBody/' + +backend_args = dict(backend='local') + +scenes = [ + 'Magic_show', 'Entertainment', 'ConductMusic', 'Online_class', 'TalkShow', + 'Speech', 'Fitness', 'Interview', 'Olympic', 'TVShow', 'Singing', + 'SignLanguage', 'Movie', 'LiveVlog', 'VideoConference' +] + +train_datasets = [ + dict( + type='CocoWholeBodyDataset', + data_root='data/coco/', + data_mode=data_mode, + ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=[]) +] + +for scene in scenes: + train_dataset = dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file=f'annotations/{scene}/train_annotations.json', + data_prefix=dict(img='images/'), + pipeline=[], + sample_interval=10) + train_datasets.append(train_dataset) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=90), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', 
direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.5, 1.5], + rotate_factor=90), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=train_batch_size, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + datasets=train_datasets, + pipeline=train_pipeline, + test_mode=False, + )) + +val_dataloader = dict( + batch_size=val_batch_size, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type='CocoWholeBodyDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='coco/val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict( + save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoWholeBodyMetric', + 
ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json') +test_evaluator = val_evaluator diff --git a/configs/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-l_8xb64-270e_coco-ubody-wholebody-256x192.py b/configs/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-l_8xb64-270e_coco-ubody-wholebody-256x192.py new file mode 100644 index 0000000000..66c42ad8a8 --- /dev/null +++ b/configs/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-l_8xb64-270e_coco-ubody-wholebody-256x192.py @@ -0,0 +1,256 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 270 +stage2_num_epochs = 30 +base_lr = 4e-3 +train_batch_size = 64 +val_batch_size = 32 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from epoch 135 to 270 + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(192, 256), + sigma=(4.9, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=1., + widen_factor=1., + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), +
act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmpose/cspnext-l_udp-aic-coco_210e-256x192-273b7631_20230130.pth' # noqa: E501 + )), + head=dict( + type='RTMCCHead', + in_channels=1024, + out_channels=133, + input_size=codec['input_size'], + in_featuremap_size=(6, 8), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True, )) + +# base dataset settings +dataset_type = 'UBody2dDataset' +data_mode = 'topdown' +data_root = 'data/UBody/' + +backend_args = dict(backend='local') + +scenes = [ + 'Magic_show', 'Entertainment', 'ConductMusic', 'Online_class', 'TalkShow', + 'Speech', 'Fitness', 'Interview', 'Olympic', 'TVShow', 'Singing', + 'SignLanguage', 'Movie', 'LiveVlog', 'VideoConference' +] + +train_datasets = [ + dict( + type='CocoWholeBodyDataset', + data_root='data/coco/', + data_mode=data_mode, + ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=[]) +] + +for scene in scenes: + train_dataset = dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file=f'annotations/{scene}/train_annotations.json', + data_prefix=dict(img='images/'), + pipeline=[], + sample_interval=10) + train_datasets.append(train_dataset) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=90), + dict(type='TopdownAffine', input_size=codec['input_size']), + 
dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.5, 1.5], + rotate_factor=90), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=train_batch_size, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + datasets=train_datasets, + pipeline=train_pipeline, + test_mode=False, + )) + +val_dataloader = dict( + batch_size=val_batch_size, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type='CocoWholeBodyDataset', + data_root=data_root, + data_mode=data_mode, + 
ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='coco/val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict( + save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoWholeBodyMetric', + ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json') +test_evaluator = val_evaluator diff --git a/configs/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-m_8xb64-270e_coco-ubody-wholebody-256x192.py b/configs/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-m_8xb64-270e_coco-ubody-wholebody-256x192.py new file mode 100644 index 0000000000..0856fbbe9b --- /dev/null +++ b/configs/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-m_8xb64-270e_coco-ubody-wholebody-256x192.py @@ -0,0 +1,256 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 270 +stage2_num_epochs = 30 +base_lr = 4e-3 +train_batch_size = 64 +val_batch_size = 32 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from epoch 135 to 270 + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, +
by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(192, 256), + sigma=(4.9, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.67, + widen_factor=0.75, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmpose/cspnext-m_udp-aic-coco_210e-256x192-f2f7d6f6_20230130.pth' # noqa: E501 + )), + head=dict( + type='RTMCCHead', + in_channels=768, + out_channels=133, + input_size=codec['input_size'], + in_featuremap_size=(6, 8), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True, )) + +# base dataset settings +dataset_type = 'UBody2dDataset' +data_mode = 'topdown' +data_root = 'data/UBody/' + +backend_args = dict(backend='local') + +scenes = [ + 'Magic_show', 'Entertainment', 'ConductMusic', 'Online_class', 'TalkShow', + 'Speech', 'Fitness', 'Interview', 'Olympic', 'TVShow', 'Singing', + 'SignLanguage', 'Movie', 'LiveVlog', 'VideoConference' +] + +train_datasets = [ + dict( + type='CocoWholeBodyDataset', + data_root='data/coco/', + data_mode=data_mode, + 
ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=[]) +] + +for scene in scenes: + train_dataset = dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file=f'annotations/{scene}/train_annotations.json', + data_prefix=dict(img='images/'), + pipeline=[], + sample_interval=10) + train_datasets.append(train_dataset) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=90), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.5, 1.5], + rotate_factor=90), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + 
dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=train_batch_size, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + datasets=train_datasets, + pipeline=train_pipeline, + test_mode=False, + )) + +val_dataloader = dict( + batch_size=val_batch_size, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type='CocoWholeBodyDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='coco/val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict( + save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoWholeBodyMetric', + ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json') +test_evaluator = val_evaluator diff --git a/configs/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-s_8xb64-270e_coco-ubody-wholebody-256x192.py b/configs/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-s_8xb64-270e_coco-ubody-wholebody-256x192.py new file mode 100644 index 0000000000..66562ee867 --- /dev/null +++ b/configs/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-s_8xb64-270e_coco-ubody-wholebody-256x192.py @@ -0,0 +1,256 @@ +_base_ = 
['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 270 +stage2_num_epochs = 30 +base_lr = 4e-3 +train_batch_size = 64 +val_batch_size = 32 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from epoch 135 to 270 + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(192, 256), + sigma=(4.9, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.33, + widen_factor=0.5, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmpose/cspnext-s_udp-aic-coco_210e-256x192-92f5a029_20230130.pth' # noqa: E501 + )), + head=dict( + type='RTMCCHead', + in_channels=512, + out_channels=133, + input_size=codec['input_size'], + in_featuremap_size=(6, 8), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, +
expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True, )) + +# base dataset settings +dataset_type = 'UBody2dDataset' +data_mode = 'topdown' +data_root = 'data/UBody/' + +backend_args = dict(backend='local') + +scenes = [ + 'Magic_show', 'Entertainment', 'ConductMusic', 'Online_class', 'TalkShow', + 'Speech', 'Fitness', 'Interview', 'Olympic', 'TVShow', 'Singing', + 'SignLanguage', 'Movie', 'LiveVlog', 'VideoConference' +] + +train_datasets = [ + dict( + type='CocoWholeBodyDataset', + data_root='data/coco/', + data_mode=data_mode, + ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=[]) +] + +for scene in scenes: + train_dataset = dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file=f'annotations/{scene}/train_annotations.json', + data_prefix=dict(img='images/'), + pipeline=[], + sample_interval=10) + train_datasets.append(train_dataset) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=90), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', 
input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.5, 1.5], + rotate_factor=90), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=train_batch_size, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + datasets=train_datasets, + pipeline=train_pipeline, + test_mode=False, + )) + +val_dataloader = dict( + batch_size=val_batch_size, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type='CocoWholeBodyDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='coco/val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict( + save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + 
dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoWholeBodyMetric', + ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json') +test_evaluator = val_evaluator diff --git a/configs/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-t_8xb64-270e_coco-ubody-wholebody-256x192.py b/configs/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-t_8xb64-270e_coco-ubody-wholebody-256x192.py new file mode 100644 index 0000000000..beb10b16f3 --- /dev/null +++ b/configs/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-t_8xb64-270e_coco-ubody-wholebody-256x192.py @@ -0,0 +1,256 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 270 +stage2_num_epochs = 30 +base_lr = 4e-3 +train_batch_size = 64 +val_batch_size = 32 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 150 to 300 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(192, 256), + sigma=(4.9, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + 
backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.167, + widen_factor=0.375, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmpose/cspnext-tiny_udp-aic-coco_210e-256x192-cbed682d_20230130.pth' # noqa: E501 + )), + head=dict( + type='RTMCCHead', + in_channels=384, + out_channels=133, + input_size=codec['input_size'], + in_featuremap_size=(6, 8), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True, )) + +# base dataset settings +dataset_type = 'UBody2dDataset' +data_mode = 'topdown' +data_root = 'data/UBody/' + +backend_args = dict(backend='local') + +scenes = [ + 'Magic_show', 'Entertainment', 'ConductMusic', 'Online_class', 'TalkShow', + 'Speech', 'Fitness', 'Interview', 'Olympic', 'TVShow', 'Singing', + 'SignLanguage', 'Movie', 'LiveVlog', 'VideoConference' +] + +train_datasets = [ + dict( + type='CocoWholeBodyDataset', + data_root='data/coco/', + data_mode=data_mode, + ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=[]) +] + +for scene in scenes: + train_dataset = dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file=f'annotations/{scene}/train_annotations.json', + data_prefix=dict(img='images/'), + pipeline=[], + sample_interval=10) + train_datasets.append(train_dataset) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + 
dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=90), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.5, 1.5], + rotate_factor=90), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=train_batch_size, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + datasets=train_datasets, + pipeline=train_pipeline, + test_mode=False, + )) + +val_dataloader = dict( + batch_size=val_batch_size, + num_workers=10, + 
persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type='CocoWholeBodyDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='coco/val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict( + save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoWholeBodyMetric', + ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json') +test_evaluator = val_evaluator diff --git a/configs/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-x_8xb32-270e_coco-ubody-wholebody-384x288.py b/configs/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-x_8xb32-270e_coco-ubody-wholebody-384x288.py new file mode 100644 index 0000000000..695f640897 --- /dev/null +++ b/configs/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-x_8xb32-270e_coco-ubody-wholebody-384x288.py @@ -0,0 +1,260 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# common setting +num_keypoints = 133 +input_size = (288, 384) + +# runtime +max_epochs = 270 +stage2_num_epochs = 30 +base_lr = 4e-3 +train_batch_size = 32 +val_batch_size = 32 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + clip_grad=dict(max_norm=35, norm_type=2), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, 
bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=input_size, + sigma=(6., 6.93), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=1.33, + widen_factor=1.25, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmposev1/cspnext-x_udp-body7_210e-384x288-d28b58e6_20230529.pth' # noqa: E501 + )), + head=dict( + type='RTMCCHead', + in_channels=1280, + out_channels=num_keypoints, + input_size=codec['input_size'], + in_featuremap_size=tuple([s // 32 for s in codec['input_size']]), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True, )) + +# base dataset settings +dataset_type = 'UBody2dDataset' +data_mode = 'topdown' +data_root = 'data/UBody/' + +backend_args = dict(backend='local') + 
+scenes = [ + 'Magic_show', 'Entertainment', 'ConductMusic', 'Online_class', 'TalkShow', + 'Speech', 'Fitness', 'Interview', 'Olympic', 'TVShow', 'Singing', + 'SignLanguage', 'Movie', 'LiveVlog', 'VideoConference' +] + +train_datasets = [ + dict( + type='CocoWholeBodyDataset', + data_root='data/coco/', + data_mode=data_mode, + ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=[]) +] + +for scene in scenes: + train_dataset = dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file=f'annotations/{scene}/train_annotations.json', + data_prefix=dict(img='images/'), + pipeline=[], + sample_interval=10) + train_datasets.append(train_dataset) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=90), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.5, 1.5], + rotate_factor=90), + dict(type='TopdownAffine', 
input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=train_batch_size, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + datasets=train_datasets, + pipeline=train_pipeline, + test_mode=False, + )) + +val_dataloader = dict( + batch_size=val_batch_size, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type='CocoWholeBodyDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='coco/val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict( + save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoWholeBodyMetric', + ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json') +test_evaluator = val_evaluator diff --git 
a/configs/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-x_8xb64-270e_coco-ubody-wholebody-256x192.py b/configs/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-x_8xb64-270e_coco-ubody-wholebody-256x192.py new file mode 100644 index 0000000000..30f1015394 --- /dev/null +++ b/configs/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-x_8xb64-270e_coco-ubody-wholebody-256x192.py @@ -0,0 +1,260 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# common setting +num_keypoints = 133 +input_size = (192, 256) + +# runtime +max_epochs = 270 +stage2_num_epochs = 30 +base_lr = 4e-3 +train_batch_size = 64 +val_batch_size = 32 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + clip_grad=dict(max_norm=35, norm_type=2), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(192, 256), + sigma=(4.9, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=1.33, + widen_factor=1.25, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + 
init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmposev1/cspnext-x_udp-body7_210e-384x288-d28b58e6_20230529.pth' # noqa: E501 + )), + head=dict( + type='RTMCCHead', + in_channels=1280, + out_channels=num_keypoints, + input_size=codec['input_size'], + in_featuremap_size=tuple([s // 32 for s in codec['input_size']]), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True, )) + +# base dataset settings +dataset_type = 'UBody2dDataset' +data_mode = 'topdown' +data_root = 'data/UBody/' + +backend_args = dict(backend='local') + +scenes = [ + 'Magic_show', 'Entertainment', 'ConductMusic', 'Online_class', 'TalkShow', + 'Speech', 'Fitness', 'Interview', 'Olympic', 'TVShow', 'Singing', + 'SignLanguage', 'Movie', 'LiveVlog', 'VideoConference' +] + +train_datasets = [ + dict( + type='CocoWholeBodyDataset', + data_root='data/coco/', + data_mode=data_mode, + ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=[]) +] + +for scene in scenes: + train_dataset = dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file=f'annotations/{scene}/train_annotations.json', + data_prefix=dict(img='images/'), + pipeline=[], + sample_interval=10) + train_datasets.append(train_dataset) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=90), + dict(type='TopdownAffine', 
input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.5, 1.5], + rotate_factor=90), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=train_batch_size, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + datasets=train_datasets, + pipeline=train_pipeline, + test_mode=False, + )) + +val_dataloader = dict( + batch_size=val_batch_size, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type='CocoWholeBodyDataset', + data_root=data_root, + data_mode=data_mode, 
+ ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='coco/val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict( + save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoWholeBodyMetric', + ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json') +test_evaluator = val_evaluator diff --git a/docs/src/papers/algorithms/dwpose.md b/docs/src/papers/algorithms/dwpose.md new file mode 100644 index 0000000000..4fd23effdc --- /dev/null +++ b/docs/src/papers/algorithms/dwpose.md @@ -0,0 +1,30 @@ +# Effective Whole-body Pose Estimation with Two-stages Distillation + + + +
+DWPose (arXiv'2023) + +```bibtex +@article{yang2023effective, + title={Effective Whole-body Pose Estimation with Two-stages Distillation}, + author={Yang, Zhendong and Zeng, Ailing and Yuan, Chun and Li, Yu}, + journal={arXiv preprint arXiv:2307.15880}, + year={2023} +} + +``` + +
+ +## Abstract + + + +Whole-body pose estimation localizes the human body, hand, face, and foot keypoints in an image. This task is challenging due to multi-scale body parts, fine-grained localization for low-resolution regions, and data scarcity. Meanwhile, applying a highly efficient and accurate pose estimator to widely human-centric understanding and generation tasks is urgent. In this work, we present a two-stage pose **D**istillation for **W**hole-body **P**ose estimators, named **DWPose**, to improve their effectiveness and efficiency. The first-stage distillation designs a weight-decay strategy while utilizing a teacher's intermediate feature and final logits with both visible and invisible keypoints to supervise the student from scratch. The second stage distills the student model itself to further improve performance. Different from the previous self-knowledge distillation, this stage finetunes the student's head with only 20% training time as a plug-and-play training strategy. For data limitations, we explore the UBody dataset that contains diverse facial expressions and hand gestures for real-life applications. Comprehensive experiments show the superiority of our proposed simple yet effective methods. We achieve new state-of-the-art performance on COCO-WholeBody, significantly boosting the whole-body AP of RTMPose-l from 64.8% to 66.5%, even surpassing RTMPose-x teacher with 65.3% AP. We release a series of models with different sizes, from tiny to large, for satisfying various downstream tasks. Our code and models are available at https://github.com/IDEA-Research/DWPose. + + + +
+ +
diff --git a/mmpose/models/losses/__init__.py b/mmpose/models/losses/__init__.py
index 523e4df133..57ba98fe46 100644
--- a/mmpose/models/losses/__init__.py
+++ b/mmpose/models/losses/__init__.py
@@ -1,8 +1,10 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 from .ae_loss import AssociativeEmbeddingLoss
 from .classification_loss import BCELoss, JSDiscretLoss, KLDiscretLoss
+from .fea_dis_loss import FeaLoss
 from .heatmap_loss import (AdaptiveWingLoss, KeypointMSELoss,
                            KeypointOHKMMSELoss)
+from .logit_dis_loss import KDLoss
 from .loss_wrappers import CombinedLoss, MultipleLossWrapper
 from .regression_loss import (BoneLoss, L1Loss, MPJPELoss,
                               MPJPEVelocityJointLoss, MSELoss, RLELoss,
@@ -15,5 +17,5 @@
     'SemiSupervisionLoss', 'SoftWingLoss', 'AdaptiveWingLoss', 'RLELoss',
     'KLDiscretLoss', 'MultipleLossWrapper', 'JSDiscretLoss', 'CombinedLoss',
     'AssociativeEmbeddingLoss', 'SoftWeightSmoothL1Loss',
-    'MPJPEVelocityJointLoss'
+    'MPJPEVelocityJointLoss', 'FeaLoss', 'KDLoss'
 ]
diff --git a/mmpose/models/losses/fea_dis_loss.py b/mmpose/models/losses/fea_dis_loss.py
new file mode 100644
index 0000000000..b90ca9d24f
--- /dev/null
+++ b/mmpose/models/losses/fea_dis_loss.py
@@ -0,0 +1,73 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch.nn as nn
+
+from mmpose.registry import MODELS
+
+
+@MODELS.register_module()
+class FeaLoss(nn.Module):
+    """PyTorch version of feature-based distillation from DWPose Modified from
+    the official implementation.
+
+    <https://github.com/IDEA-Research/DWPose>
+    Args:
+        name (str): Key the distiller stores this loss under. Accepted so
+            the config dict can be passed to ``MODELS.build`` unchanged;
+            not read by the loss itself.
+        use_this (bool): Config switch the distiller checks before building
+            the loss; not read by the loss itself.
+        student_channels(int): Number of channels in the student's feature map.
+        teacher_channels(int): Number of channels in the teacher's feature map.
+        alpha_fea (float, optional): Weight of dis_loss. Defaults to 0.00007
+    """
+
+    def __init__(
+        self,
+        name,
+        use_this,
+        student_channels,
+        teacher_channels,
+        alpha_fea=0.00007,
+    ):
+        super(FeaLoss, self).__init__()
+        self.alpha_fea = alpha_fea
+
+        # A 1x1 conv projects student features to the teacher's channel
+        # count; it is skipped when the counts already match.
+        if teacher_channels != student_channels:
+            self.align = nn.Conv2d(
+                student_channels,
+                teacher_channels,
+                kernel_size=1,
+                stride=1,
+                padding=0)
+        else:
+            self.align = None
+
+    def forward(self, preds_S, preds_T):
+        """Forward function.
+
+        Args:
+            preds_S(Tensor): Bs*C*H*W, student's feature map
+            preds_T(Tensor): Bs*C*H*W, teacher's feature map
+
+        Returns:
+            Tensor: Scalar feature distillation loss.
+        """
+
+        if self.align is not None:
+            outs = self.align(preds_S)
+        else:
+            outs = preds_S
+
+        return self.get_dis_loss(outs, preds_T)
+
+    def get_dis_loss(self, preds_S, preds_T):
+        """Sum-reduced MSE between the two feature maps, divided by the batch
+        size and scaled by ``alpha_fea``."""
+        loss_mse = nn.MSELoss(reduction='sum')
+        N = preds_T.shape[0]
+
+        dis_loss = loss_mse(preds_S, preds_T) / N * self.alpha_fea
+
+        return dis_loss
diff --git a/mmpose/models/losses/logit_dis_loss.py b/mmpose/models/losses/logit_dis_loss.py
new file mode 100644
index 0000000000..32906a1c3f
--- /dev/null
+++ b/mmpose/models/losses/logit_dis_loss.py
@@ -0,0 +1,90 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from mmpose.registry import MODELS
+
+
+@MODELS.register_module()
+class KDLoss(nn.Module):
+    """PyTorch version of logit-based distillation from DWPose Modified from
+    the official implementation.
+
+    <https://github.com/IDEA-Research/DWPose>
+    Args:
+        name (str): Key the distiller stores this loss under. Accepted so
+            the config dict can be passed to ``MODELS.build`` unchanged;
+            not read by the loss itself.
+        use_this (bool): Config switch the distiller checks before building
+            the loss; not read by the loss itself.
+        weight (float, optional): Weight of dis_loss. Defaults to 1.0
+    """
+
+    def __init__(
+        self,
+        name,
+        use_this,
+        weight=1.0,
+    ):
+        super(KDLoss, self).__init__()
+
+        self.log_softmax = nn.LogSoftmax(dim=1)
+        self.kl_loss = nn.KLDivLoss(reduction='none')
+        self.weight = weight
+
+    def forward(self, pred, pred_t, beta, target_weight):
+        """Forward function.
+
+        Args:
+            pred (tuple[Tensor, Tensor]): Student logits, unpacked as (x, y).
+            pred_t (tuple[Tensor, Tensor]): Teacher logits, unpacked as
+                (x, y); detached before use.
+            beta (float): Factor the logits are multiplied by before
+                softmax.
+            target_weight (Tensor): Forwarded to :meth:`loss`, which does
+                not read it.
+
+        Returns:
+            Tensor: Sum of both axis losses divided by ``pred[0].size(1)``.
+        """
+        ls_x, ls_y = pred
+        lt_x, lt_y = pred_t
+
+        lt_x = lt_x.detach()
+        lt_y = lt_y.detach()
+
+        num_joints = ls_x.size(1)
+        loss = 0
+
+        loss += (self.loss(ls_x, lt_x, beta, target_weight))
+        loss += (self.loss(ls_y, lt_y, beta, target_weight))
+
+        return loss / num_joints
+
+    def loss(self, logit_s, logit_t, beta, weight):
+        """KL divergence between softened student and teacher logits.
+
+        Inputs of shape (N, K, L) are flattened to (N * K, L); 2-D inputs
+        are treated as K = 1. ``weight`` is accepted but not used.
+        """
+        N = logit_s.shape[0]
+        # Default for 2-D logits; previously K was unbound on that path and
+        # the reshape below raised NameError.
+        K = 1
+
+        if len(logit_s.shape) == 3:
+            K = logit_s.shape[1]
+            logit_s = logit_s.reshape(N * K, -1)
+            logit_t = logit_t.reshape(N * K, -1)
+
+        # N*W(H)
+        s_i = self.log_softmax(logit_s * beta)
+        t_i = F.softmax(logit_t * beta, dim=1)
+
+        # kd
+        loss_all = torch.sum(self.kl_loss(s_i, t_i), dim=1)
+        loss_all = loss_all.reshape(N, K).sum(dim=1).mean()
+        loss_all = self.weight * loss_all
+
+        return loss_all
diff --git a/mmpose/models/pose_estimators/__init__.py b/mmpose/models/pose_estimators/__init__.py
index c5287e0c2c..4cd3884cc2 100644
--- a/mmpose/models/pose_estimators/__init__.py
+++ b/mmpose/models/pose_estimators/__init__.py
@@ -1,6 +1,10 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 from .bottomup import BottomupPoseEstimator
+from .dwpose_distiller import DWPoseDistiller
 from .pose_lifter import PoseLifter
 from .topdown import TopdownPoseEstimator
 
-__all__ = ['TopdownPoseEstimator', 'BottomupPoseEstimator', 'PoseLifter']
+__all__ = [
+    'TopdownPoseEstimator', 'BottomupPoseEstimator', 'PoseLifter',
+    'DWPoseDistiller'
+]
diff --git a/mmpose/models/pose_estimators/dwpose_distiller.py b/mmpose/models/pose_estimators/dwpose_distiller.py
new file mode 100644
index 0000000000..dd57d09991
--- /dev/null
+++ b/mmpose/models/pose_estimators/dwpose_distiller.py
@@ -0,0 +1,231 @@
+# Copyright (c) OpenMMLab. All rights reserved.
from abc import ABCMeta
from typing import Tuple

import torch
import torch.nn as nn
from mmengine.config import Config
from mmengine.logging import MessageHub
from mmengine.model import BaseModel
from mmengine.runner.checkpoint import load_checkpoint
from torch import Tensor

from mmpose.evaluation.functional import simcc_pck_accuracy
from mmpose.models import build_pose_estimator
from mmpose.registry import MODELS
from mmpose.utils.tensor_utils import to_numpy
from mmpose.utils.typing import (ForwardResults, OptConfigType, OptMultiConfig,
                                 OptSampleList, SampleList)


@MODELS.register_module()
class DWPoseDistiller(BaseModel, metaclass=ABCMeta):
    """Distiller that trains a student pose estimator against a frozen
    teacher.

    Both models are built from the ``model`` field of their config files.
    The teacher is put in eval mode and all of its parameters have
    ``requires_grad`` disabled.

    Args:
        teacher_cfg (str): Path of the config file whose ``model`` field
            describes the teacher.
        student_cfg (str): Path of the config file whose ``model`` field
            describes the student.
        two_dis (bool): Whether to run the second distillation stage. When
            ``True`` the student head is fed with the features returned by
            the teacher's ``extract_feat`` and only the distillation losses
            are returned by :meth:`loss`. Defaults to ``False``.
        distill_cfg (list, optional): Distillation settings. Each item must
            expose ``methods``, a list of loss configs that carry ``name``
            and ``use_this``; every loss with a truthy ``use_this`` is built
            through ``MODELS`` and stored under its ``name``.
            Defaults to ``None``.
        teacher_pretrained (str, optional): Checkpoint loaded into the
            teacher by :meth:`init_weights`. Defaults to ``None``.
        train_cfg (dict, optional): Training config. Falls back to the
            student's ``train_cfg`` when empty. Defaults to ``None``.
        data_preprocessor (dict, optional): Forwarded to ``BaseModel``.
            Defaults to ``None``.
        init_cfg (dict, optional): Forwarded to ``BaseModel``.
            Defaults to ``None``.
    """

    def __init__(self,
                 teacher_cfg,
                 student_cfg,
                 two_dis=False,
                 distill_cfg=None,
                 teacher_pretrained=None,
                 train_cfg: OptConfigType = None,
                 data_preprocessor: OptConfigType = None,
                 init_cfg: OptMultiConfig = None):
        super().__init__(
            data_preprocessor=data_preprocessor, init_cfg=init_cfg)

        self.teacher = build_pose_estimator(
            (Config.fromfile(teacher_cfg)).model)
        self.teacher_pretrained = teacher_pretrained
        # The teacher only provides targets: freeze it.
        self.teacher.eval()
        for param in self.teacher.parameters():
            param.requires_grad = False

        self.student = build_pose_estimator(
            (Config.fromfile(student_cfg)).model)

        self.distill_cfg = distill_cfg
        self.distill_losses = nn.ModuleDict()
        if self.distill_cfg is not None:
            for item_loc in distill_cfg:
                for item_loss in item_loc.methods:
                    # Only losses explicitly switched on are instantiated.
                    if item_loss.use_this:
                        self.distill_losses[item_loss.name] = MODELS.build(
                            item_loss)

        self.two_dis = two_dis
        self.train_cfg = train_cfg if train_cfg else self.student.train_cfg
        self.test_cfg = self.student.test_cfg
        self.metainfo = self.student.metainfo

    def init_weights(self):
        """Load the teacher checkpoint (if given) and initialize the
        student."""
        if self.teacher_pretrained is not None:
            load_checkpoint(
                self.teacher, self.teacher_pretrained, map_location='cpu')
        self.student.init_weights()

    def set_epoch(self):
        """Cache ``epoch`` and ``max_epochs`` from the current
        ``MessageHub``."""
        self.message_hub = MessageHub.get_current_instance()
        self.epoch = self.message_hub.get_info('epoch')
        self.max_epochs = self.message_hub.get_info('max_epochs')

    def forward(self,
                inputs: torch.Tensor,
                data_samples: OptSampleList,
                mode: str = 'tensor') -> ForwardResults:
        """Dispatch to :meth:`loss`, :meth:`predict` or :meth:`_forward`.

        Args:
            inputs (Tensor): The batch of input images.
            data_samples (list, optional): The batch data samples.
            mode (str): One of ``'loss'``, ``'predict'`` and ``'tensor'``.
                Defaults to ``'tensor'``.

        Raises:
            RuntimeError: If ``mode`` is not one of the supported values.
        """
        if mode == 'loss':
            return self.loss(inputs, data_samples)
        elif mode == 'predict':
            # use customed metainfo to override the default metainfo
            if self.metainfo is not None:
                for data_sample in data_samples:
                    data_sample.set_metainfo(self.metainfo)
            return self.predict(inputs, data_samples)
        elif mode == 'tensor':
            return self._forward(inputs)
        else:
            raise RuntimeError(f'Invalid mode "{mode}". '
                               'Only supports loss, predict and tensor mode.')

    def _decay(self, loss):
        """Scale a first-stage distillation loss by ``1 - epoch/max_epochs``.

        Second-stage (``two_dis``) losses are returned unchanged.
        """
        if self.two_dis:
            return loss
        return (1 - self.epoch / self.max_epochs) * loss

    def loss(self, inputs: Tensor, data_samples: SampleList) -> dict:
        """Calculate losses from a batch of inputs and data samples.

        Args:
            inputs (Tensor): Inputs with shape (N, C, H, W).
            data_samples (List[:obj:`PoseDataSample`]): The batch
                data samples.

        Returns:
            dict: A dictionary of losses. In the first stage it holds the
            student's own head losses plus the enabled distillation losses;
            in the second stage (``two_dis``) only the distillation losses.

        Raises:
            RuntimeError: If ``loss_fea`` is enabled together with
                ``two_dis``; no student features exist in that stage.
        """
        self.set_epoch()

        losses = dict()

        # Teacher outputs are targets only, so no graph is recorded.
        with torch.no_grad():
            fea_t = self.teacher.extract_feat(inputs)
            lt_x, lt_y = self.teacher.head(fea_t)
            pred_t = (lt_x, lt_y)

        fea_s = None
        if not self.two_dis:
            fea_s = self.student.extract_feat(inputs)
            ori_loss, pred, _, target_weight = self.head_loss(
                fea_s, data_samples, train_cfg=self.train_cfg)
            losses.update(ori_loss)
        else:
            # Second stage: the student head consumes the teacher features
            # and its own head loss is deliberately left out of ``losses``.
            _, pred, _, target_weight = self.head_loss(
                fea_t, data_samples, train_cfg=self.train_cfg)

        all_keys = self.distill_losses.keys()

        if 'loss_fea' in all_keys:
            if fea_s is None:
                raise RuntimeError(
                    '"loss_fea" needs student features, which are not '
                    'computed when two_dis=True.')
            losses['loss_fea'] = self._decay(self.distill_losses['loss_fea'](
                fea_s[-1], fea_t[-1]))

        if 'loss_logit' in all_keys:
            losses['loss_logit'] = self._decay(
                self.distill_losses['loss_logit'](
                    pred, pred_t, self.student.head.loss_module.beta,
                    target_weight))

        return losses

    def predict(self, inputs, data_samples):
        """Predict results from a batch of inputs and data samples.

        In the first stage this simply delegates to the student. In the
        second stage (``two_dis``) features come from :meth:`extract_feat`
        and are decoded by the student head, with optional flip testing
        controlled by ``test_cfg['flip_test']``.
        """
        if self.two_dis:
            assert self.student.with_head, (
                'The model must have head to perform prediction.')

            if self.test_cfg.get('flip_test', False):
                _feats = self.extract_feat(inputs)
                _feats_flip = self.extract_feat(inputs.flip(-1))
                feats = [_feats, _feats_flip]
            else:
                feats = self.extract_feat(inputs)

            preds = self.student.head.predict(
                feats, data_samples, test_cfg=self.student.test_cfg)

            if isinstance(preds, tuple):
                batch_pred_instances, batch_pred_fields = preds
            else:
                batch_pred_instances = preds
                batch_pred_fields = None

            results = self.student.add_pred_to_datasample(
                batch_pred_instances, batch_pred_fields, data_samples)

            return results
        else:
            return self.student.predict(inputs, data_samples)

    def extract_feat(self, inputs: Tensor) -> Tuple[Tensor]:
        """Extract features with the teacher for second-stage inference.

        The student's neck is applied on top when the student has one.
        """
        x = self.teacher.extract_feat(inputs)
        if self.student.with_neck:
            # The distiller itself owns no ``neck``; the module lives on the
            # student. NOTE(review): confirm the teacher's ``extract_feat``
            # does not already apply an equivalent neck.
            x = self.student.neck(x)

        return x

    def head_loss(
        self,
        feats: Tuple[Tensor],
        batch_data_samples: OptSampleList,
        train_cfg: OptConfigType = None,
    ) -> tuple:
        """Run the student head on ``feats`` and compute its loss.

        Args:
            feats (Tuple[Tensor]): Features fed to the student head.
            batch_data_samples (list): Data samples whose
                ``gt_instance_labels`` provide ``keypoint_x_labels``,
                ``keypoint_y_labels`` and ``keypoint_weights``.
            train_cfg (dict, optional): Accepted for interface
                compatibility; not used here. Defaults to ``None``.

        Returns:
            tuple: ``(losses, pred_simcc, gt_simcc, keypoint_weights)``
            where ``losses`` holds ``loss_kpt`` and ``acc_pose``, and the
            two SimCC entries are ``(x, y)`` pairs.
        """

        pred_x, pred_y = self.student.head.forward(feats)

        gt_x = torch.cat([
            d.gt_instance_labels.keypoint_x_labels for d in batch_data_samples
        ],
                         dim=0)
        gt_y = torch.cat([
            d.gt_instance_labels.keypoint_y_labels for d in batch_data_samples
        ],
                         dim=0)
        keypoint_weights = torch.cat(
            [
                d.gt_instance_labels.keypoint_weights
                for d in batch_data_samples
            ],
            dim=0,
        )

        pred_simcc = (pred_x, pred_y)
        gt_simcc = (gt_x, gt_y)

        # calculate losses
        losses = dict()
        loss = self.student.head.loss_module(pred_simcc, gt_simcc,
                                             keypoint_weights)

        losses.update(loss_kpt=loss)

        # calculate accuracy
        _, avg_acc, _ = simcc_pck_accuracy(
            output=to_numpy(pred_simcc),
            target=to_numpy(gt_simcc),
            simcc_split_ratio=self.student.head.simcc_split_ratio,
            mask=to_numpy(keypoint_weights) > 0,
        )

        acc_pose = torch.tensor(avg_acc, device=gt_x.device)
        losses.update(acc_pose=acc_pose)

        return losses, pred_simcc, gt_simcc, keypoint_weights

    def _forward(self, inputs: Tensor):
        """Return the raw outputs of the student's ``_forward``."""

        return self.student._forward(inputs)


# ---------------------------------------------------------------------------
# Patch boundary kept from the original diff:
# diff --git a/tests/test_models/test_pose_estimators/test_dwpose_distiller.py b/tests/test_models/test_pose_estimators/test_dwpose_distiller.py
# new file mode 100644
# index 0000000000..60d2b231e5
# --- /dev/null
# +++ b/tests/test_models/test_pose_estimators/test_dwpose_distiller.py
# @@ -0,0 +1,113 @@
# Copyright (c) OpenMMLab. All rights reserved.
import unittest
from unittest import TestCase

import torch
from mmengine.model.utils import revert_sync_batchnorm
from parameterized import parameterized

from mmpose.structures import PoseDataSample
from mmpose.testing import get_packed_inputs, get_pose_estimator_cfg
from mmpose.utils import register_all_modules

configs = [
    'wholebody_2d_keypoint/dwpose/ubody/'
    's1_dis/dwpose_l_dis_m_coco-ubody-256x192.py',
    'wholebody_2d_keypoint/dwpose/ubody/'
    's2_dis/dwpose_m-mm_coco-ubody-256x192.py',
    'wholebody_2d_keypoint/dwpose/coco-wholebody/'
    's1_dis/dwpose_l_dis_m_coco-256x192.py',
    'wholebody_2d_keypoint/dwpose/coco-wholebody/'
    's2_dis/dwpose_m-mm_coco-256x192.py',
]

configs_with_devices = [(config, ('cpu', 'cuda')) for config in configs]


class TestDWPoseDistiller(TestCase):
    """Smoke tests for the student models referenced by the DWPose
    distillation configs.

    Every test builds the model named by ``student_cfg`` of a distillation
    config and exercises it directly; the distiller wrapper itself is not
    instantiated here.
    """

    def setUp(self) -> None:
        register_all_modules()

    @staticmethod
    def _student_cfg(config):
        """Return the student model config referenced by ``config``."""
        dis_cfg = get_pose_estimator_cfg(config)
        model_cfg = get_pose_estimator_cfg(dis_cfg.student_cfg)
        # presumably avoids fetching pretrained backbone weights -- confirm
        model_cfg.backbone.init_cfg = None
        return model_cfg

    @staticmethod
    def _build_student(model_cfg, device='cpu'):
        """Build the student model on ``device``.

        Raises:
            unittest.SkipTest: If ``device`` is ``'cuda'`` but CUDA is not
                available. ``'cpu'`` is listed first in
                ``configs_with_devices``, so the CPU assertions have already
                run when the skip is raised.
        """
        from mmpose.models import build_pose_estimator

        if device == 'cuda' and not torch.cuda.is_available():
            # ``unittest.skip(...)`` merely returns a decorator; raising
            # SkipTest is what actually reports the test as skipped.
            raise unittest.SkipTest('test requires GPU and torch+cuda')

        model = revert_sync_batchnorm(build_pose_estimator(model_cfg))
        if device == 'cuda':
            model = model.cuda()
        return model

    @parameterized.expand(configs)
    def test_init(self, config):
        model_cfg = self._student_cfg(config)
        model = self._build_student(model_cfg)

        self.assertTrue(model.backbone)
        self.assertTrue(model.head)
        if model_cfg.get('neck', None):
            self.assertTrue(model.neck)

    @parameterized.expand(configs_with_devices)
    def test_forward_loss(self, config, devices):
        model_cfg = self._student_cfg(config)

        for device in devices:
            model = self._build_student(model_cfg, device)

            packed_inputs = get_packed_inputs(2, num_keypoints=133)
            data = model.data_preprocessor(packed_inputs, training=True)
            losses = model.forward(**data, mode='loss')
            self.assertIsInstance(losses, dict)

    @parameterized.expand(configs_with_devices)
    def test_forward_predict(self, config, devices):
        model_cfg = self._student_cfg(config)

        for device in devices:
            model = self._build_student(model_cfg, device)

            packed_inputs = get_packed_inputs(2, num_keypoints=133)
            model.eval()
            with torch.no_grad():
                data = model.data_preprocessor(packed_inputs, training=True)
                batch_results = model.forward(**data, mode='predict')
                self.assertEqual(len(batch_results), 2)
                self.assertIsInstance(batch_results[0], PoseDataSample)

    @parameterized.expand(configs_with_devices)
    def test_forward_tensor(self, config, devices):
        model_cfg = self._student_cfg(config)

        for device in devices:
            model = self._build_student(model_cfg, device)

            packed_inputs = get_packed_inputs(2, num_keypoints=133)
            data = model.data_preprocessor(packed_inputs, training=True)
            batch_results = model.forward(**data, mode='tensor')
            self.assertIsInstance(batch_results, (tuple, torch.Tensor))


# ---------------------------------------------------------------------------
# Patch boundary kept from the original diff:
# diff --git a/tools/misc/pth_transfer.py b/tools/misc/pth_transfer.py
# new file mode 100644
# index 0000000000..ee08fee748
# --- /dev/null
# +++ b/tools/misc/pth_transfer.py
# @@ -0,0 +1,46 @@
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
from collections import OrderedDict

import torch

# Each rule is (prefix to match, prefix to strip, prefix to prepend).
# Rules are tried in order and the first match wins.
_ONE_STAGE_RULES = (('student.', 'student.', ''), )
_TWO_STAGE_RULES = (
    ('teacher.backbone', 'teacher.', ''),
    ('distill_losses.loss_mgd.down', 'distill_losses.loss_mgd.', 'head.'),
    ('student.head', 'student.', ''),
)


def _rename_key(name, rules):
    """Return the converted parameter name, or ``None`` to drop the entry."""
    for match_prefix, strip_prefix, new_prefix in rules:
        if name.startswith(match_prefix):
            return new_prefix + name[len(strip_prefix):]
    return None


def change_model(args):
    """Convert a distiller checkpoint into a plain model checkpoint.

    Args:
        args: Namespace with the attributes

            - ``dis_path``: path of the distiller checkpoint to read.
            - ``output_path``: path the converted checkpoint is written to.
            - ``two_dis``: if truthy, keep ``teacher.backbone*``,
              ``student.head*`` and ``distill_losses.loss_mgd.down*``
              (re-rooted under ``head.``); otherwise keep only the
              ``student.*`` entries. The leading ``teacher.``/``student.``
              component is stripped from the kept names.

    Only the ``meta`` and ``state_dict`` entries of the checkpoint are
    written back; everything else is dropped.
    """
    # NOTE: ``torch.load`` unpickles the file -- only convert checkpoints
    # from a trusted source.
    # Map to CPU so checkpoints saved from a GPU run can be converted on a
    # machine without CUDA.
    dis_model = torch.load(args.dis_path, map_location='cpu')

    rules = _TWO_STAGE_RULES if args.two_dis else _ONE_STAGE_RULES
    state_dict = OrderedDict()
    for name, value in dis_model['state_dict'].items():
        new_name = _rename_key(name, rules)
        if new_name is not None:
            state_dict[new_name] = value
    dis_model['state_dict'] = state_dict

    save_keys = ('meta', 'state_dict')
    for key in list(dis_model.keys()):
        if key not in save_keys:
            dis_model.pop(key, None)

    torch.save(dis_model, args.output_path)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Transfer CKPT')
    parser.add_argument('dis_path', help='dis_model path')
    parser.add_argument('output_path', help='output path')
    parser.add_argument(
        '--two_dis', action='store_true', default=False, help='if two dis')
    args = parser.parse_args()
    change_model(args)