diff --git a/configs/mask2former/mask2former_r101_lsj_8x2_50e_coco_ins.py b/configs/mask2former/mask2former_r101_lsj_8x2_50e_coco_ins.py
new file mode 100644
index 00000000000..ce2a54c307c
--- /dev/null
+++ b/configs/mask2former/mask2former_r101_lsj_8x2_50e_coco_ins.py
@@ -0,0 +1,7 @@
+_base_ = ['./mask2former_r50_lsj_8x2_50e_coco_ins.py']
+
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/configs/mask2former/mask2former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco_ins.py b/configs/mask2former/mask2former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco_ins.py
new file mode 100644
index 00000000000..eb42af012fc
--- /dev/null
+++ b/configs/mask2former/mask2former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco_ins.py
@@ -0,0 +1,5 @@
+_base_ = ['./mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco_ins.py']
+pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window12_384_22k.pth'  # noqa
+
+model = dict(
+    backbone=dict(init_cfg=dict(type='Pretrained', checkpoint=pretrained)))
diff --git a/configs/mask2former/mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco_ins.py b/configs/mask2former/mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco_ins.py
new file mode 100644
index 00000000000..5d7cc63887a
--- /dev/null
+++ b/configs/mask2former/mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco_ins.py
@@ -0,0 +1,42 @@
+_base_ = ['./mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco_ins.py']
+pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window12_384.pth'  # noqa
+
+depths = [2, 2, 18, 2]
+model = dict(
+    backbone=dict(
+        pretrain_img_size=384,
+        embed_dims=128,
+        depths=depths,
+        num_heads=[4, 8, 16, 32],
+        window_size=12,
+        init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
+    panoptic_head=dict(in_channels=[128, 256, 512, 1024]))
+
+# set all layers in backbone to lr_mult=0.1
+# set all norm layers, position embedding,
+# query embedding, level embedding to decay_mult=0.0
+backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0)
+backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0)
+embed_multi = dict(lr_mult=1.0, decay_mult=0.0)
+custom_keys = {
+    'backbone': dict(lr_mult=0.1, decay_mult=1.0),
+    'backbone.patch_embed.norm': backbone_norm_multi,
+    'backbone.norm': backbone_norm_multi,
+    'absolute_pos_embed': backbone_embed_multi,
+    'relative_position_bias_table': backbone_embed_multi,
+    'query_embed': embed_multi,
+    'query_feat': embed_multi,
+    'level_embed': embed_multi
+}
+custom_keys.update({
+    f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi
+    for stage_id, num_blocks in enumerate(depths)
+    for block_id in range(num_blocks)
+})
+custom_keys.update({
+    f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi
+    for stage_id in range(len(depths) - 1)
+})
+# optimizer
+optimizer = dict(
+    paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0))
diff --git a/configs/mask2former/mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco_ins.py b/configs/mask2former/mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco_ins.py
new file mode 100644
index 00000000000..30d4d736081
--- /dev/null
+++ b/configs/mask2former/mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco_ins.py
@@ -0,0 +1,26 @@
+_base_ = ['./mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco_ins.py']
+pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pth'  # noqa
+
+model = dict(
+    backbone=dict(
+        embed_dims=192,
+        num_heads=[6, 12, 24, 48],
+        init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
+    panoptic_head=dict(num_queries=200, in_channels=[192, 384, 768, 1536]))
+
+data = dict(samples_per_gpu=1, workers_per_gpu=1)
+
+lr_config = dict(step=[655556, 710184])
+
+max_iters = 737500
+runner = dict(type='IterBasedRunner', max_iters=max_iters)
+
+# Before the 735001st iteration, we do evaluation every 5000 iterations.
+# After the 735000th iteration, the interval becomes 737500, which means
+# we only evaluate once more, at the end of training.
+interval = 5000
+dynamic_intervals = [(max_iters // interval * interval + 1, max_iters)]
+evaluation = dict(
+    interval=interval,
+    dynamic_intervals=dynamic_intervals,
+    metric=['bbox', 'segm'])
diff --git a/configs/mask2former/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco_ins.py b/configs/mask2former/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco_ins.py
new file mode 100644
index 00000000000..f33ed9b90a4
--- /dev/null
+++ b/configs/mask2former/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco_ins.py
@@ -0,0 +1,37 @@
+_base_ = ['./mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco_ins.py']
+pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_small_patch4_window7_224.pth'  # noqa
+
+depths = [2, 2, 18, 2]
+model = dict(
+    backbone=dict(
+        depths=depths, init_cfg=dict(type='Pretrained',
+                                     checkpoint=pretrained)))
+
+# set all layers in backbone to lr_mult=0.1
+# set all norm layers, position embedding,
+# query embedding, level embedding to decay_mult=0.0
+backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0)
+backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0)
+embed_multi = dict(lr_mult=1.0, decay_mult=0.0)
+custom_keys = {
+    'backbone': dict(lr_mult=0.1, decay_mult=1.0),
+    'backbone.patch_embed.norm': backbone_norm_multi,
+    'backbone.norm': backbone_norm_multi,
+    'absolute_pos_embed': backbone_embed_multi,
+    'relative_position_bias_table': backbone_embed_multi,
+    'query_embed': embed_multi,
+    'query_feat': embed_multi,
+    'level_embed': embed_multi
+}
+custom_keys.update({
+    f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi
+    for stage_id, num_blocks in enumerate(depths)
+    for block_id in range(num_blocks)
+})
+custom_keys.update({
+    f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi
+    for stage_id in range(len(depths) - 1)
+})
+# optimizer
+optimizer = dict(
+    paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0))
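Note for reviewers: the two dict-comprehension `update` calls in the Swin-B/S configs generate a `paramwise_cfg` key for every norm layer in the backbone, so each gets `lr_mult=0.1` and `decay_mult=0.0`. A minimal standalone sketch (plain Python, not part of the diff) of what they expand to for `depths = [2, 2, 18, 2]`:

```python
# Sketch: expand the custom_keys comprehensions from the Swin configs.
depths = [2, 2, 18, 2]
backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0)

custom_keys = {}
# One key per transformer block norm, per stage.
custom_keys.update({
    f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi
    for stage_id, num_blocks in enumerate(depths)
    for block_id in range(num_blocks)
})
# One key per downsample (patch-merging) norm; the last stage has none.
custom_keys.update({
    f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi
    for stage_id in range(len(depths) - 1)
})

print(len(custom_keys))  # 2 + 2 + 18 + 2 block norms + 3 downsample norms = 27
print(sorted(custom_keys)[:2])
# ['backbone.stages.0.blocks.0.norm', 'backbone.stages.0.blocks.1.norm']
```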
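And a quick sanity check of the `dynamic_intervals` arithmetic in the Swin-L 100e config (again a standalone sketch, not part of the diff):

```python
# Sketch: the evaluation-interval switch used in the Swin-L config.
max_iters = 737500
interval = 5000

# Last multiple of `interval` below `max_iters`, plus one: from this
# iteration onward the evaluation interval switches to `max_iters`.
switch_point = max_iters // interval * interval + 1
dynamic_intervals = [(switch_point, max_iters)]

print(switch_point)       # 735001
print(dynamic_intervals)  # [(735001, 737500)]
# So evaluation runs every 5000 iterations up to 735000, then exactly
# once more at iteration 737500, i.e. at the end of training.
```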