From 2b5487350a9c5269ad6866536f5c52558ed9c420 Mon Sep 17 00:00:00 2001
From: Wencheng Wu <41542251+274869388@users.noreply.github.com>
Date: Tue, 1 Mar 2022 20:55:31 +0800
Subject: [PATCH 01/42] [Fix] Adjust the order of get_classes and FileClient.
 (#7276)

---
 mmdet/datasets/custom.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mmdet/datasets/custom.py b/mmdet/datasets/custom.py
index e449150abce..1cda41793fa 100644
--- a/mmdet/datasets/custom.py
+++ b/mmdet/datasets/custom.py
@@ -74,8 +74,8 @@ def __init__(self,
         self.proposal_file = proposal_file
         self.test_mode = test_mode
         self.filter_empty_gt = filter_empty_gt
-        self.CLASSES = self.get_classes(classes)
         self.file_client = mmcv.FileClient(**file_client_args)
+        self.CLASSES = self.get_classes(classes)

         # join paths if data_root is specified
         if self.data_root is not None:

From bf5afe5a78eab8fa42bff9807e02b8e52a94217e Mon Sep 17 00:00:00 2001
From: Yue Zhou <592267829@qq.com>
Date: Tue, 1 Mar 2022 20:59:00 +0800
Subject: [PATCH 02/42] delete -sv (#7277)

Co-authored-by: Wenwei Zhang <40779233+ZwwWayne@users.noreply.github.com>
---
 .github/workflows/build.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 01fdf800fc9..f76428e79d6 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -265,7 +265,7 @@ jobs:
       - name: Build and install
         run: pip install -e .
       - name: Run unittests
-        run: coverage run --branch --source mmdet -m pytest tests -sv
+        run: coverage run --branch --source mmdet -m pytest tests
      - name: Generate coverage report
        run: |
          coverage xml

From 58714f0e38344b97368f623f84871b4682ec7dae Mon Sep 17 00:00:00 2001
From: "MingJian.L" <45811724+matrixgame2018@users.noreply.github.com>
Date: Thu, 3 Mar 2022 14:22:20 +0800
Subject: [PATCH 03/42] [Docs] Add Chinese version of finetune (#7178)

* [Fix] Fix wrong img name in onnx2tensorrt.py (#7157)

* [Docs] fix albumentations installed way (#7143)

* Update finetune.md

Translate the finetune.md doc to Chinese

* Update finetune.md

* Update finetune.md

* Update finetune.md

* fix lint

* fix lint

* fix pr

Co-authored-by: Jamie
Co-authored-by: BigDong
---
 docs/zh_cn/tutorials/finetune.md | 83 ++++++++++++++++++++++++++++++++
 1 file changed, 83 insertions(+)

diff --git a/docs/zh_cn/tutorials/finetune.md b/docs/zh_cn/tutorials/finetune.md
index 72792e04307..e2318187194 100644
--- a/docs/zh_cn/tutorials/finetune.md
+++ b/docs/zh_cn/tutorials/finetune.md
@@ -1 +1,84 @@
 # Tutorial 7: Finetuning Models
+
+Detectors pre-trained on the COCO dataset can serve as good pre-trained models for other datasets, e.g., CityScapes and KITTI.
+This tutorial guides users on how to adapt the models provided in the [ModelZoo](../model_zoo.md) to other datasets and obtain better performance with them.
+
+There are two steps to finetune a model on a new dataset.
+
+- Add support for the new dataset following [Tutorial 2: Customize Datasets](customize_dataset.md).
+- Modify the configs as discussed in this tutorial.
+
+Taking the finetuning process on the Cityscapes dataset as an example, this tutorial walks through the five parts of the config that users need to modify.
+
+## Inherit base configs
+
+To release the burden of writing the whole config and to reduce the number of bugs, MMDetection V2.0 supports inheriting configs from multiple existing configs. To finetune a Mask R-CNN model, the new config needs to inherit `_base_/models/mask_rcnn_r50_fpn.py` to build the basic structure of the model. When using the Cityscapes dataset, the new config can simply inherit `_base_/datasets/cityscapes_instance.py`. For the runtime settings of the training process, the new config needs to inherit `_base_/default_runtime.py`. These config files live in the `configs` directory; users can also choose to write out the whole contents rather than use inheritance.
+
+```python
+_base_ = [
+    '../_base_/models/mask_rcnn_r50_fpn.py',
+    '../_base_/datasets/cityscapes_instance.py', '../_base_/default_runtime.py'
+]
+```
+
+
+## Modify the head
+The new config then needs to modify the head according to the number of classes of the new dataset. Only the `num_classes` in the roi_head needs to be changed; except for the final prediction head, most of the pre-trained model weights are reused.
+
+```python
+model = dict(
+    pretrained=None,
+    roi_head=dict(
+        bbox_head=dict(
+            type='Shared2FCBBoxHead',
+            in_channels=256,
+            fc_out_channels=1024,
+            roi_feat_size=7,
+            num_classes=8,
+            bbox_coder=dict(
+                type='DeltaXYWHBBoxCoder',
+                target_means=[0., 0., 0., 0.],
+                target_stds=[0.1, 0.1, 0.2, 0.2]),
+            reg_class_agnostic=False,
+            loss_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+            loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)),
+        mask_head=dict(
+            type='FCNMaskHead',
+            num_convs=4,
+            in_channels=256,
+            conv_out_channels=256,
+            num_classes=8,
+            loss_mask=dict(
+                type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))))
+```
+
+## Modify the dataset
+Users may also need to prepare the dataset and write the configs about the dataset. MMDetection V2.0 already provides dataset configs for VOC, WIDER FACE, COCO and Cityscapes.
+
+## Modify the training schedule
+The finetuning hyperparameters differ from the default training schedule; finetuning usually requires a smaller learning rate and fewer training epochs.
+
+```python
+# optimizer
+# lr config for a batch size of 8
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(grad_clip=None)
+# learning policy
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=0.001,
+    step=[7])
+# max_epochs and step in lr_config need to be tuned specifically for the custom dataset
+runner = dict(max_epochs=8)
+log_config = dict(interval=100)
+```
+
+## Use pre-trained models
+
+To use a pre-trained model, the new config adds the link of the pre-trained model weights in `load_from`. Users need to download the model weights before training starts to avoid wasting precious time downloading them during training.
+```python
+load_from = 'https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco_bbox_mAP-0.408__segm_mAP-0.37_20200504_163245-42aa3d00.pth'  # noqa
+```
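[Editor's note] Putting the five parts of the tutorial above together, a finetuning config stays very short. The following is a minimal sketch of such an assembled file; the file name and the exact schedule values are illustrative assumptions, not part of the patch:

```python
# configs/cityscapes/mask_rcnn_r50_fpn_finetune_cityscapes.py (hypothetical name)
_base_ = [
    '../_base_/models/mask_rcnn_r50_fpn.py',
    '../_base_/datasets/cityscapes_instance.py', '../_base_/default_runtime.py'
]
# Cityscapes instance segmentation has 8 thing classes
model = dict(
    roi_head=dict(
        bbox_head=dict(num_classes=8), mask_head=dict(num_classes=8)))
# smaller learning-rate schedule than the default, as the tutorial suggests
optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
lr_config = dict(step=[7])
runner = dict(max_epochs=8)
# start from COCO-pretrained Mask R-CNN weights
load_from = 'https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco_bbox_mAP-0.408__segm_mAP-0.37_20200504_163245-42aa3d00.pth'  # noqa
```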
From 2c475fec3717df04b78b7f2cbe6002737dcbb303 Mon Sep 17 00:00:00 2001
From: Cedric Luo <26483343+chhluo@users.noreply.github.com>
Date: Sun, 6 Mar 2022 17:36:35 +0800
Subject: [PATCH 04/42] set unmap_results=True in ssd_head (#7328)

---
 mmdet/models/dense_heads/ssd_head.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mmdet/models/dense_heads/ssd_head.py b/mmdet/models/dense_heads/ssd_head.py
index ee773455d1f..e362fd8016a 100644
--- a/mmdet/models/dense_heads/ssd_head.py
+++ b/mmdet/models/dense_heads/ssd_head.py
@@ -316,7 +316,7 @@ def loss(self,
             gt_bboxes_ignore_list=gt_bboxes_ignore,
             gt_labels_list=gt_labels,
             label_channels=1,
-            unmap_outputs=False)
+            unmap_outputs=True)
         if cls_reg_targets is None:
             return None
         (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,

From 8d7da432af02a52bc5330b30997984335d0930a4 Mon Sep 17 00:00:00 2001
From: Yosuke Shinya <42844407+shinya7y@users.noreply.github.com>
Date: Sun, 6 Mar 2022 18:57:57 +0900
Subject: [PATCH 05/42] Update YOLOX log for non square input (#7235)

---
 configs/yolox/yolox_s_8x8_300e_coco.py    |  2 +-
 configs/yolox/yolox_tiny_8x8_300e_coco.py |  2 +-
 mmdet/datasets/pipelines/transforms.py    | 20 ++++++----
 mmdet/models/detectors/yolox.py           |  6 ++-
 mmdet/utils/__init__.py                   |  4 +-
 mmdet/utils/logger.py                     | 45 ++++++++++++++++++++++
 tests/test_utils/test_logger.py           | 47 +++++++++++++++++++++++
 7 files changed, 112 insertions(+), 14 deletions(-)
 create mode 100644 tests/test_utils/test_logger.py

diff --git a/configs/yolox/yolox_s_8x8_300e_coco.py b/configs/yolox/yolox_s_8x8_300e_coco.py
index cc730513072..2dc80882fcd 100644
--- a/configs/yolox/yolox_s_8x8_300e_coco.py
+++ b/configs/yolox/yolox_s_8x8_300e_coco.py
@@ -1,6 +1,6 @@
 _base_ = ['../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py']

-img_scale = (640, 640)
+img_scale = (640, 640)  # height, width

 # model settings
 model = dict(
diff --git a/configs/yolox/yolox_tiny_8x8_300e_coco.py b/configs/yolox/yolox_tiny_8x8_300e_coco.py
index 3aee99ab0a6..216fbc866dd 100644
--- a/configs/yolox/yolox_tiny_8x8_300e_coco.py
+++ b/configs/yolox/yolox_tiny_8x8_300e_coco.py
@@ -7,7 +7,7 @@
     neck=dict(in_channels=[96, 192, 384], out_channels=96),
     bbox_head=dict(in_channels=96, feat_channels=96))

-img_scale = (640, 640)
+img_scale = (640, 640)  # height, width

 train_pipeline = [
     dict(type='Mosaic', img_scale=img_scale, pad_val=114.0),
diff --git a/mmdet/datasets/pipelines/transforms.py b/mmdet/datasets/pipelines/transforms.py
index fb51922886f..15f14779c46 100644
--- a/mmdet/datasets/pipelines/transforms.py
+++ b/mmdet/datasets/pipelines/transforms.py
@@ -11,6 +11,7 @@

 from mmdet.core import PolygonMasks, find_inside_bboxes
 from mmdet.core.evaluation.bbox_overlaps import bbox_overlaps
+from mmdet.utils import log_img_scale
 from ..builder import PIPELINES

 try:
@@ -1979,9 +1980,10 @@ class Mosaic:

     Args:
         img_scale (Sequence[int]): Image size after mosaic pipeline of single
-            image. Default to (640, 640).
+            image. The shape order should be (height, width).
+            Default to (640, 640).
         center_ratio_range (Sequence[float]): Center ratio range of mosaic
-           output. Default to (0.5, 1.5).
+            output. Default to (0.5, 1.5).
         min_bbox_size (int | float): The minimum pixel for filtering
             invalid bboxes after the mosaic pipeline. Default to 0.
         bbox_clip_border (bool, optional): Whether to clip the objects outside
@@ -2002,6 +2004,7 @@ def __init__(self,
                  skip_filter=True,
                  pad_val=114):
         assert isinstance(img_scale, tuple)
+        log_img_scale(img_scale, skip_square=True)
         self.img_scale = img_scale
         self.center_ratio_range = center_ratio_range
         self.min_bbox_size = min_bbox_size
@@ -2232,7 +2235,7 @@ class MixUp:
     |      pad                     |
     +------------------------------+

-    The mixup transform steps are as follows::
+    The mixup transform steps are as follows:

         1. Another random image is picked by dataset and embedded in
            the top left patch(after padding and resizing)
         2. The target of mixup transform is the weighted average of mixup
            image and origin image.

     Args:
         img_scale (Sequence[int]): Image output size after mixup pipeline.
-            Default: (640, 640).
+            The shape order should be (height, width). Default: (640, 640).
         ratio_range (Sequence[float]): Scale ratio of mixup image.
-           Default: (0.5, 1.5).
+            Default: (0.5, 1.5).
         flip_ratio (float): Horizontal flip ratio of mixup image.
-           Default: 0.5.
+            Default: 0.5.
         pad_val (int): Pad value. Default: 114.
         max_iters (int): The maximum number of iterations. If the number of
-           iterations is greater than `max_iters`, but gt_bbox is still
-           empty, then the iteration is terminated. Default: 15.
+            iterations is greater than `max_iters`, but gt_bbox is still
+            empty, then the iteration is terminated. Default: 15.
         min_bbox_size (float): Width and height threshold to filter bboxes.
             If the height or width of a box is smaller than this value, it
             will be removed. Default: 5.
@@ -2281,6 +2284,7 @@ def __init__(self,
                  bbox_clip_border=True,
                  skip_filter=True):
         assert isinstance(img_scale, tuple)
+        log_img_scale(img_scale, skip_square=True)
         self.dynamic_scale = img_scale
         self.ratio_range = ratio_range
         self.flip_ratio = flip_ratio
diff --git a/mmdet/models/detectors/yolox.py b/mmdet/models/detectors/yolox.py
index 2aba93f68cf..d26dc7349d8 100644
--- a/mmdet/models/detectors/yolox.py
+++ b/mmdet/models/detectors/yolox.py
@@ -6,6 +6,7 @@
 import torch.nn.functional as F
 from mmcv.runner import get_dist_info

+from ...utils import log_img_scale
 from ..builder import DETECTORS
 from .single_stage import SingleStageDetector

@@ -29,8 +30,8 @@ class YOLOX(SingleStageDetector):
            of YOLOX. Default: None.
         pretrained (str, optional): model pretrained path.
            Default: None.
-        input_size (tuple): The model default input image size.
-            Default: (640, 640).
+        input_size (tuple): The model default input image size. The shape
+            order should be (height, width). Default: (640, 640).
         size_multiplier (int): Image size multiplication factor.
            Default: 32.
         random_size_range (tuple): The multi-scale random range during
@@ -56,6 +57,7 @@ def __init__(self,
                  init_cfg=None):
         super(YOLOX, self).__init__(backbone, neck, bbox_head, train_cfg,
                                     test_cfg, pretrained, init_cfg)
+        log_img_scale(input_size, skip_square=True)
         self.rank, self.world_size = get_dist_info()
         self._default_input_size = input_size
         self._input_size = input_size
diff --git a/mmdet/utils/__init__.py b/mmdet/utils/__init__.py
index 4bd1019e9d5..3873ec09c67 100644
--- a/mmdet/utils/__init__.py
+++ b/mmdet/utils/__init__.py
@@ -1,10 +1,10 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 from .collect_env import collect_env
-from .logger import get_root_logger
+from .logger import get_caller_name, get_root_logger, log_img_scale
 from .misc import find_latest_checkpoint
 from .setup_env import setup_multi_processes

 __all__ = [
     'get_root_logger', 'collect_env', 'find_latest_checkpoint',
-    'setup_multi_processes'
+    'setup_multi_processes', 'get_caller_name', 'log_img_scale'
 ]
diff --git a/mmdet/utils/logger.py b/mmdet/utils/logger.py
index 7e66fb6bfff..485f641b709 100644
--- a/mmdet/utils/logger.py
+++ b/mmdet/utils/logger.py
@@ -1,4 +1,5 @@
 # Copyright (c) OpenMMLab. All rights reserved.
+import inspect
 import logging

 from mmcv.utils import get_logger
@@ -18,3 +19,47 @@ def get_root_logger(log_file=None, log_level=logging.INFO):
     logger = get_logger(name='mmdet', log_file=log_file, log_level=log_level)

     return logger
+
+
+def get_caller_name():
+    """Get name of caller method."""
+    # this_func_frame = inspect.stack()[0][0]  # i.e., get_caller_name
+    # callee_frame = inspect.stack()[1][0]  # e.g., log_img_scale
+    caller_frame = inspect.stack()[2][0]  # e.g., caller of log_img_scale
+    caller_method = caller_frame.f_code.co_name
+    try:
+        caller_class = caller_frame.f_locals['self'].__class__.__name__
+        return f'{caller_class}.{caller_method}'
+    except KeyError:  # caller is a function
+        return caller_method
+
+
+def log_img_scale(img_scale, shape_order='hw', skip_square=False):
+    """Log image size.
+
+    Args:
+        img_scale (tuple): Image size to be logged.
+        shape_order (str, optional): The order of image shape.
+            'hw' for (height, width) and 'wh' for (width, height).
+            Defaults to 'hw'.
+        skip_square (bool, optional): Whether to skip logging for square
+            img_scale. Defaults to False.
+
+    Returns:
+        bool: Whether logging has been performed.
+    """
+    if shape_order == 'hw':
+        height, width = img_scale
+    elif shape_order == 'wh':
+        width, height = img_scale
+    else:
+        raise ValueError(f'Invalid shape_order {shape_order}.')
+
+    if skip_square and (height == width):
+        return False
+
+    logger = get_root_logger()
+    caller = get_caller_name()
+    logger.info(f'image shape: height={height}, width={width} in {caller}')
+
+    return True
diff --git a/tests/test_utils/test_logger.py b/tests/test_utils/test_logger.py
new file mode 100644
index 00000000000..900d6b615bf
--- /dev/null
+++ b/tests/test_utils/test_logger.py
@@ -0,0 +1,47 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import pytest
+
+from mmdet.utils import get_caller_name, log_img_scale
+
+
+def callee_func():
+    caller_name = get_caller_name()
+    return caller_name
+
+
+class CallerClassForTest:
+
+    def __init__(self):
+        self.caller_name = callee_func()
+
+
+def test_get_caller_name():
+    # test the case that caller is a function
+    caller_name = callee_func()
+    assert caller_name == 'test_get_caller_name'
+
+    # test the case that caller is a method in a class
+    caller_class = CallerClassForTest()
+    assert caller_class.caller_name == 'CallerClassForTest.__init__'
+
+
+def test_log_img_scale():
+    img_scale = (800, 1333)
+    done_logging = log_img_scale(img_scale)
+    assert done_logging
+
+    img_scale = (1333, 800)
+    done_logging = log_img_scale(img_scale, shape_order='wh')
+    assert done_logging
+
+    with pytest.raises(ValueError):
+        img_scale = (1333, 800)
+        done_logging = log_img_scale(img_scale, shape_order='xywh')
+
+    img_scale = (640, 640)
+    done_logging = log_img_scale(img_scale, skip_square=False)
+    assert done_logging
+
+    img_scale = (640, 640)
+    done_logging = log_img_scale(img_scale, skip_square=True)
+    assert not done_logging

From 2fc25f1d0ee3e03eb7dc2846dae3edd21bad7467 Mon Sep 17 00:00:00 2001
From: Cedric Luo
Date: Mon, 7 Mar 2022 11:39:20 +0800
Subject: [PATCH 06/42] [Enhance] add cpu_num in cocopanoptic for pq computing
 (#7315)

* add cpu_num in cocopanoptic for pq computing

* cpu_num -> nproc

* move nproc to evaluate
---
 .../api_wrappers/panoptic_evaluation.py |  9 ++++++--
 mmdet/datasets/coco_panoptic.py         | 22 +++++++++++++------
 2 files changed, 22 insertions(+), 9 deletions(-)

diff --git a/mmdet/datasets/api_wrappers/panoptic_evaluation.py b/mmdet/datasets/api_wrappers/panoptic_evaluation.py
index 9b34201e1c5..49850e5d52b 100644
--- a/mmdet/datasets/api_wrappers/panoptic_evaluation.py
+++ b/mmdet/datasets/api_wrappers/panoptic_evaluation.py
@@ -170,7 +170,8 @@ def pq_compute_multi_core(matched_annotations_list,
                           gt_folder,
                           pred_folder,
                           categories,
-                          file_client=None):
+                          file_client=None,
+                          nproc=32):
     """Evaluate the metrics of Panoptic Segmentation with multithreading.

     Same as the function with the same name in `panopticapi`.
@@ -184,6 +185,9 @@ def pq_compute_multi_core(matched_annotations_list,
         categories (str): The categories of the dataset.
         file_client (object): The file client of the dataset. If None,
            the backend will be set to `disk`.
+        nproc (int): Number of processes for panoptic quality computing.
+            Defaults to 32. When `nproc` exceeds the number of cpu cores,
+            the number of cpu cores is used.
     """
     if PQStat is None:
         raise RuntimeError(
@@ -195,7 +199,8 @@ def pq_compute_multi_core(matched_annotations_list,
         file_client_args = dict(backend='disk')
         file_client = mmcv.FileClient(**file_client_args)

-    cpu_num = multiprocessing.cpu_count()
+    cpu_num = min(nproc, multiprocessing.cpu_count())
+
     annotations_split = np.array_split(matched_annotations_list, cpu_num)
     print('Number of cores: {}, images per core: {}'.format(
         cpu_num, len(annotations_split[0])))
diff --git a/mmdet/datasets/coco_panoptic.py b/mmdet/datasets/coco_panoptic.py
index cc00c13fcad..91d7ecd8b47 100644
--- a/mmdet/datasets/coco_panoptic.py
+++ b/mmdet/datasets/coco_panoptic.py
@@ -426,7 +426,8 @@ def evaluate_pan_json(self,
                           result_files,
                           outfile_prefix,
                           logger=None,
-                          classwise=False):
+                          classwise=False,
+                          nproc=32):
         """Evaluate PQ according to the panoptic results json file."""
         imgs = self.coco.imgs
         gt_json = self.coco.img_ann_map  # image to annotations
@@ -451,9 +452,13 @@ def evaluate_pan_json(self,
         gt_folder = self.seg_prefix
         pred_folder = os.path.join(os.path.dirname(outfile_prefix), 'panoptic')

-        pq_stat = pq_compute_multi_core(matched_annotations_list, gt_folder,
-                                        pred_folder, self.categories,
-                                        self.file_client)
+        pq_stat = pq_compute_multi_core(
+            matched_annotations_list,
+            gt_folder,
+            pred_folder,
+            self.categories,
+            self.file_client,
+            nproc=nproc)

         metrics = [('All', None), ('Things', True), ('Stuff', False)]
         pq_results = {}
@@ -480,6 +485,7 @@ def evaluate(self,
                  logger=None,
                  jsonfile_prefix=None,
                  classwise=False,
+                 nproc=32,
                  **kwargs):
         """Evaluation in COCO Panoptic protocol.

@@ -494,6 +500,9 @@ def evaluate(self,
                 If not specified, a temp file will be created. Default: None.
             classwise (bool): Whether to print classwise evaluation results.
                 Default: False.
+            nproc (int): Number of processes for panoptic quality computing.
+                Defaults to 32. When `nproc` exceeds the number of cpu cores,
+                the number of cpu cores is used.

         Returns:
             dict[str, float]: COCO Panoptic style evaluation metric.
@@ -512,9 +521,8 @@ def evaluate(self,
         outfile_prefix = os.path.join(tmp_dir.name, 'results') \
             if tmp_dir is not None else jsonfile_prefix
         if 'PQ' in metrics:
-            eval_pan_results = self.evaluate_pan_json(result_files,
-                                                      outfile_prefix, logger,
-                                                      classwise)
+            eval_pan_results = self.evaluate_pan_json(
+                result_files, outfile_prefix, logger, classwise, nproc=nproc)
             eval_results.update(eval_pan_results)

         if tmp_dir is not None:
From e76117bd5e9c20bc13bfc40cbed89397ae648e03 Mon Sep 17 00:00:00 2001
From: Jingwei Zhang
Date: Wed, 9 Mar 2022 19:48:18 +0800
Subject: [PATCH 07/42] [Enhancement] Allow to set channel_order in
 LoadImageFromFile (#7258)

* allow to set channel_order when loading images

* fix lint

* fix unit test

* fix lint
---
 mmdet/datasets/pipelines/loading.py            | 6 +++++-
 tests/test_data/test_pipelines/test_loading.py | 2 +-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/mmdet/datasets/pipelines/loading.py b/mmdet/datasets/pipelines/loading.py
index fc68fc3d22f..735b5573f02 100644
--- a/mmdet/datasets/pipelines/loading.py
+++ b/mmdet/datasets/pipelines/loading.py
@@ -37,9 +37,11 @@ class LoadImageFromFile:
     def __init__(self,
                  to_float32=False,
                  color_type='color',
+                 channel_order='bgr',
                  file_client_args=dict(backend='disk')):
         self.to_float32 = to_float32
         self.color_type = color_type
+        self.channel_order = channel_order
         self.file_client_args = file_client_args.copy()
         self.file_client = None

@@ -63,7 +65,8 @@ def __call__(self, results):
             filename = results['img_info']['filename']

         img_bytes = self.file_client.get(filename)
-        img = mmcv.imfrombytes(img_bytes, flag=self.color_type)
+        img = mmcv.imfrombytes(
+            img_bytes, flag=self.color_type, channel_order=self.channel_order)
         if self.to_float32:
             img = img.astype(np.float32)

@@ -79,6 +82,7 @@ def __repr__(self):
         repr_str = (f'{self.__class__.__name__}('
                     f'to_float32={self.to_float32}, '
                     f"color_type='{self.color_type}', "
+                    f"channel_order='{self.channel_order}', "
                     f'file_client_args={self.file_client_args})')
         return repr_str

diff --git a/tests/test_data/test_pipelines/test_loading.py b/tests/test_data/test_pipelines/test_loading.py
index 760b09a0327..186d28db85e 100644
--- a/tests/test_data/test_pipelines/test_loading.py
+++ b/tests/test_data/test_pipelines/test_loading.py
@@ -27,7 +27,7 @@ def test_load_img(self):
         assert results['img_shape'] == (288, 512, 3)
         assert results['ori_shape'] == (288, 512, 3)
         assert repr(transform) == transform.__class__.__name__ + \
-            "(to_float32=False, color_type='color', " + \
+            "(to_float32=False, color_type='color', channel_order='bgr', " + \
             "file_client_args={'backend': 'disk'})"

         # no img_prefix
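[Editor's note] A config snippet exercising the new option might look like the following sketch; only `channel_order` is new in this patch, the other entries are typical pipeline boilerplate:

```python
# Illustrative pipeline snippet: decode images in RGB order instead of
# the default 'bgr'.
train_pipeline = [
    dict(type='LoadImageFromFile', channel_order='rgb'),
    dict(type='LoadAnnotations', with_bbox=True),
]
```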
From 62feea59e8a6d45c0de13bc9b592646e1e5cb4ca Mon Sep 17 00:00:00 2001
From: jbwang1997
Date: Wed, 9 Mar 2022 19:50:36 +0800
Subject: [PATCH 08/42] [Fix] Force the inputs of `get_bboxes` in yolox_head
 to float32. (#7324)

* Fix softnms bug

* Add force_fp32 in corner_head and centripetal_head
---
 mmdet/models/dense_heads/centripetal_head.py | 3 +++
 mmdet/models/dense_heads/corner_head.py      | 5 ++++-
 mmdet/models/dense_heads/yolox_head.py       | 1 +
 3 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/mmdet/models/dense_heads/centripetal_head.py b/mmdet/models/dense_heads/centripetal_head.py
index fe85794e87a..ebc721b7623 100644
--- a/mmdet/models/dense_heads/centripetal_head.py
+++ b/mmdet/models/dense_heads/centripetal_head.py
@@ -2,6 +2,7 @@
 import torch.nn as nn
 from mmcv.cnn import ConvModule, normal_init
 from mmcv.ops import DeformConv2d
+from mmcv.runner import force_fp32

 from mmdet.core import multi_apply
 from ..builder import HEADS, build_loss
@@ -203,6 +204,7 @@ def forward_single(self, x, lvl_ind):
         ]
         return result_list

+    @force_fp32()
     def loss(self,
              tl_heats,
              br_heats,
@@ -361,6 +363,7 @@ def loss_single(self, tl_hmp, br_hmp, tl_off, br_off, tl_guiding_shift,

         return det_loss, off_loss, guiding_loss, centripetal_loss

+    @force_fp32()
     def get_bboxes(self,
                    tl_heats,
                    br_heats,
diff --git a/mmdet/models/dense_heads/corner_head.py b/mmdet/models/dense_heads/corner_head.py
index 327094bad67..c6a2866f94a 100644
--- a/mmdet/models/dense_heads/corner_head.py
+++ b/mmdet/models/dense_heads/corner_head.py
@@ -6,7 +6,7 @@
 import torch.nn as nn
 from mmcv.cnn import ConvModule, bias_init_with_prob
 from mmcv.ops import CornerPool, batched_nms
-from mmcv.runner import BaseModule
+from mmcv.runner import BaseModule, force_fp32

 from mmdet.core import multi_apply
 from ..builder import HEADS, build_loss
@@ -152,6 +152,7 @@ def __init__(self,
         self.train_cfg = train_cfg
         self.test_cfg = test_cfg

+        self.fp16_enabled = False
         self._init_layers()

     def _make_layers(self, out_channels, in_channels=256, feat_channels=256):
@@ -509,6 +510,7 @@ def get_targets(self,

         return target_result

+    @force_fp32()
     def loss(self,
             tl_heats,
             br_heats,
@@ -649,6 +651,7 @@ def loss_single(self, tl_hmp, br_hmp, tl_emb, br_emb, tl_off, br_off,

         return det_loss, pull_loss, push_loss, off_loss

+    @force_fp32()
     def get_bboxes(self,
                    tl_heats,
                    br_heats,
diff --git a/mmdet/models/dense_heads/yolox_head.py b/mmdet/models/dense_heads/yolox_head.py
index a1811c9415d..de3f93ccd36 100644
--- a/mmdet/models/dense_heads/yolox_head.py
+++ b/mmdet/models/dense_heads/yolox_head.py
@@ -212,6 +212,7 @@ def forward(self, feats):
                            self.multi_level_conv_reg,
                            self.multi_level_conv_obj)

+    @force_fp32(apply_to=('cls_scores', 'bbox_preds', 'objectnesses'))
    def get_bboxes(self,
                    cls_scores,
                    bbox_preds,

From ab16260c5111882bf111bde7b044c0cd76076904 Mon Sep 17 00:00:00 2001
From: Xiangxu-0103
Date: Wed, 9 Mar 2022 19:51:10 +0800
Subject: [PATCH 09/42] [Fix] Fix typo in FPN neck (#7347)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* update readme and pretrained related (#7301)

* [Docs] Add Chinese version of onnx2tensorrt.md (#7219)

* Fix bug of docs

* translate onnx2tensorrt.md

* fix

* fix end-of-file-fixer

* fix some bugs

* Fix link redirection

* Fix link redirection

* Fix link redirection - test 1

* Fix link redirection - test 2

* Fix link redirection - test 2

* Fix link redirection - test 3

* Fix link redirection - test 5

* Fix

Co-authored-by: jbwang1997

* Update useful_tools.md (#7180)

* [Enhancement]: Update colab tutorials (#7310)

* update colab tutorials

* update

* fix

* fix wrong CUDA explanation

* resolve comments

* resolve comments

* fix typo

Co-authored-by: Cedric Luo
Co-authored-by: tripleMu <92794867+q3394101@users.noreply.github.com>
Co-authored-by: jbwang1997
Co-authored-by: kira <39787375+yangrisheng@users.noreply.github.com>
Co-authored-by: Wenwei Zhang <40779233+ZwwWayne@users.noreply.github.com>
---
 mmdet/models/necks/fpn.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/mmdet/models/necks/fpn.py b/mmdet/models/necks/fpn.py
index 9f601386549..f57b8effe65 100644
--- a/mmdet/models/necks/fpn.py
+++ b/mmdet/models/necks/fpn.py
@@ -15,8 +15,8 @@ class FPN(BaseModule):
     Detection `_.

     Args:
-        in_channels (List[int]): Number of input channels per scale.
-        out_channels (int): Number of output channels (used at each scale)
+        in_channels (list[int]): Number of input channels per scale.
+        out_channels (int): Number of output channels (used at each scale).
         num_outs (int): Number of output scales.
         start_level (int): Index of the start input backbone level used to
             build the feature pyramid. Default: 0.
@@ -29,7 +29,7 @@ class FPN(BaseModule):
             Only the following options are allowed

             - 'on_input': Last feat map of neck inputs (i.e. backbone feature).
-            - 'on_lateral':  Last feature map after lateral convs.
+            - 'on_lateral': Last feature map after lateral convs.
             - 'on_output': The last output feature map after fpn convs.
         relu_before_extra_convs (bool): Whether to apply relu before the extra
             conv. Default: False.
@@ -37,10 +37,10 @@ class FPN(BaseModule):
             Default: False.
         conv_cfg (dict): Config dict for convolution layer. Default: None.
         norm_cfg (dict): Config dict for normalization layer. Default: None.
-        act_cfg (str): Config dict for activation layer in ConvModule.
+        act_cfg (dict): Config dict for activation layer in ConvModule.
             Default: None.
         upsample_cfg (dict): Config dict for interpolate layer.
-            Default: `dict(mode='nearest')`
+            Default: dict(mode='nearest').
         init_cfg (dict or list[dict], optional): Initialization config dict.

     Example:

From c546b5044098b71d59a139036a87c5c97bcab4e2 Mon Sep 17 00:00:00 2001
From: Cedric Luo
Date: Mon, 14 Mar 2022 19:06:54 +0800
Subject: [PATCH 10/42] [Fix] fix misplaced arguments in LoadPanopticAnnotations
 (#7388)

---
 mmdet/datasets/pipelines/loading.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/mmdet/datasets/pipelines/loading.py b/mmdet/datasets/pipelines/loading.py
index 735b5573f02..b2e4131b028 100644
--- a/mmdet/datasets/pipelines/loading.py
+++ b/mmdet/datasets/pipelines/loading.py
@@ -442,9 +442,14 @@ def __init__(self,
                 'pip install git+https://github.com/cocodataset/'
                 'panopticapi.git.')

-        super(LoadPanopticAnnotations,
-              self).__init__(with_bbox, with_label, with_mask, with_seg, True,
-                             file_client_args)
+        super(LoadPanopticAnnotations, self).__init__(
+            with_bbox=with_bbox,
+            with_label=with_label,
+            with_mask=with_mask,
+            with_seg=with_seg,
+            poly2mask=True,
+            denorm_bbox=False,
+            file_client_args=file_client_args)

     def _load_masks_and_semantic_segs(self, results):
         """Private function to load mask and semantic segmentation
         annotations.
From c576e5d570bf64a99e2c6817ed7b5c0084a44a55 Mon Sep 17 00:00:00 2001
From: Cedric Luo
Date: Wed, 16 Mar 2022 21:57:36 +0800
Subject: [PATCH 11/42] [Enhance] Take point sample related functions out of
 mask_point_head (#7353)

add point sample replace function in mask_point_head
---
 .../roi_heads/mask_heads/mask_point_head.py | 62 +------------
 mmdet/models/utils/__init__.py              |  4 +-
 mmdet/models/utils/point_sample.py          | 87 +++++++++++++++++++
 3 files changed, 94 insertions(+), 59 deletions(-)
 create mode 100644 mmdet/models/utils/point_sample.py

diff --git a/mmdet/models/roi_heads/mask_heads/mask_point_head.py b/mmdet/models/roi_heads/mask_heads/mask_point_head.py
index 120b8ffa2ba..c022f1fdbc7 100644
--- a/mmdet/models/roi_heads/mask_heads/mask_point_head.py
+++ b/mmdet/models/roi_heads/mask_heads/mask_point_head.py
@@ -8,6 +8,7 @@
 from mmcv.runner import BaseModule

 from mmdet.models.builder import HEADS, build_loss
+from mmdet.models.utils import get_uncertain_point_coords_with_randomness


 @HEADS.register_module()
@@ -185,31 +186,6 @@ def loss(self, point_pred, point_targets, labels):
         loss['loss_point'] = loss_point
         return loss

-    def _get_uncertainty(self, mask_pred, labels):
-        """Estimate uncertainty based on pred logits.
-
-        We estimate uncertainty as L1 distance between 0.0 and the logits
-        prediction in 'mask_pred' for the foreground class in `classes`.
-
-        Args:
-            mask_pred (Tensor): mask prediction logits, shape (num_rois,
-                num_classes, mask_height, mask_width).
-
-            labels (list[Tensor]): Either predicted or ground truth label for
-                each predicted mask, of length num_rois.
-
-        Returns:
-            scores (Tensor): Uncertainty scores with the most uncertain
-                locations having the highest uncertainty score,
-                shape (num_rois, 1, mask_height, mask_width)
-        """
-        if mask_pred.shape[1] == 1:
-            gt_class_logits = mask_pred.clone()
-        else:
-            inds = torch.arange(mask_pred.shape[0], device=mask_pred.device)
-            gt_class_logits = mask_pred[inds, labels].unsqueeze(1)
-        return -torch.abs(gt_class_logits)
-
     def get_roi_rel_points_train(self, mask_pred, labels, cfg):
         """Get ``num_points`` most uncertain points with random points during
         train.
@@ -230,39 +206,9 @@ def get_roi_rel_points_train(self, mask_pred, labels, cfg):
             point_coords (Tensor): A tensor of shape (num_rois, num_points, 2)
                 that contains the coordinates sampled points.
         """
-        num_points = cfg.num_points
-        oversample_ratio = cfg.oversample_ratio
-        importance_sample_ratio = cfg.importance_sample_ratio
-        assert oversample_ratio >= 1
-        assert 0 <= importance_sample_ratio <= 1
-        batch_size = mask_pred.shape[0]
-        num_sampled = int(num_points * oversample_ratio)
-        point_coords = torch.rand(
-            batch_size, num_sampled, 2, device=mask_pred.device)
-        point_logits = point_sample(mask_pred, point_coords)
-        # It is crucial to calculate uncertainty based on the sampled
-        # prediction value for the points. Calculating uncertainties of the
-        # coarse predictions first and sampling them for points leads to
-        # incorrect results. To illustrate this: assume uncertainty func(
-        # logits)=-abs(logits), a sampled point between two coarse
-        # predictions with -1 and 1 logits has 0 logits, and therefore 0
-        # uncertainty value. However, if we calculate uncertainties for the
-        # coarse predictions first, both will have -1 uncertainty,
-        # and sampled point will get -1 uncertainty.
-        point_uncertainties = self._get_uncertainty(point_logits, labels)
-        num_uncertain_points = int(importance_sample_ratio * num_points)
-        num_random_points = num_points - num_uncertain_points
-        idx = torch.topk(
-            point_uncertainties[:, 0, :], k=num_uncertain_points, dim=1)[1]
-        shift = num_sampled * torch.arange(
-            batch_size, dtype=torch.long, device=mask_pred.device)
-        idx += shift[:, None]
-        point_coords = point_coords.view(-1, 2)[idx.view(-1), :].view(
-            batch_size, num_uncertain_points, 2)
-        if num_random_points > 0:
-            rand_roi_coords = torch.rand(
-                batch_size, num_random_points, 2, device=mask_pred.device)
-            point_coords = torch.cat((point_coords, rand_roi_coords), dim=1)
+        point_coords = get_uncertain_point_coords_with_randomness(
+            mask_pred, labels, cfg.num_points, cfg.oversample_ratio,
+            cfg.importance_sample_ratio)
         return point_coords

     def get_roi_rel_points_test(self, mask_pred, pred_label, cfg):
diff --git a/mmdet/models/utils/__init__.py b/mmdet/models/utils/__init__.py
index add5693b60c..6d9c4057a39 100644
--- a/mmdet/models/utils/__init__.py
+++ b/mmdet/models/utils/__init__.py
@@ -10,6 +10,7 @@
 from .misc import interpolate_as, sigmoid_geometric_mean
 from .normed_predictor import NormedConv2d, NormedLinear
 from .panoptic_gt_processing import preprocess_panoptic_gt
+from .point_sample import get_uncertain_point_coords_with_randomness
 from .positional_encoding import (LearnedPositionalEncoding,
                                   SinePositionalEncoding)
 from .res_layer import ResLayer, SimplifiedBasicBlock
@@ -27,5 +28,6 @@
     'SELayer', 'interpolate_as', 'ConvUpsample', 'CSPLayer',
     'adaptive_avg_pool2d', 'AdaptiveAvgPool2d', 'PatchEmbed', 'nchw_to_nlc',
     'nlc_to_nchw', 'pvt_convert', 'sigmoid_geometric_mean',
-    'preprocess_panoptic_gt', 'DyReLU'
+    'preprocess_panoptic_gt', 'DyReLU',
+    'get_uncertain_point_coords_with_randomness'
 ]
diff --git a/mmdet/models/utils/point_sample.py b/mmdet/models/utils/point_sample.py
new file mode 100644
index 00000000000..c2c3cf91cc9
--- /dev/null
+++ b/mmdet/models/utils/point_sample.py
@@ -0,0 +1,87 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+from mmcv.ops import point_sample
+
+
+def get_uncertainty(mask_pred, labels):
+    """Estimate uncertainty based on pred logits.
+
+    We estimate uncertainty as L1 distance between 0.0 and the logits
+    prediction in 'mask_pred' for the foreground class in `classes`.
+
+    Args:
+        mask_pred (Tensor): mask prediction logits, shape (num_rois,
+            num_classes, mask_height, mask_width).
+
+        labels (list[Tensor]): Either predicted or ground truth label for
+            each predicted mask, of length num_rois.
+
+    Returns:
+        scores (Tensor): Uncertainty scores with the most uncertain
+            locations having the highest uncertainty score,
+            shape (num_rois, 1, mask_height, mask_width)
+    """
+    if mask_pred.shape[1] == 1:
+        gt_class_logits = mask_pred.clone()
+    else:
+        inds = torch.arange(mask_pred.shape[0], device=mask_pred.device)
+        gt_class_logits = mask_pred[inds, labels].unsqueeze(1)
+    return -torch.abs(gt_class_logits)
+
+
+def get_uncertain_point_coords_with_randomness(mask_pred, labels, num_points,
+                                               oversample_ratio,
+                                               importance_sample_ratio):
+    """Get ``num_points`` most uncertain points with random points during
+    train.
+
+    Sample points in [0, 1] x [0, 1] coordinate space based on their
+    uncertainty. The uncertainties are calculated for each point using
+    'get_uncertainty()' function that takes point's logit prediction as
+    input.
+
+    Args:
+        mask_pred (Tensor): A tensor of shape (num_rois, num_classes,
+            mask_height, mask_width) for class-specific or class-agnostic
+            prediction.
+        labels (list): The ground truth class for each instance.
+        num_points (int): The number of points to sample.
+        oversample_ratio (int): Oversampling parameter.
+        importance_sample_ratio (float): Ratio of points that are sampled
+            via importance sampling.
+
+    Returns:
+        point_coords (Tensor): A tensor of shape (num_rois, num_points, 2)
+            that contains the coordinates sampled points.
+    """
+    assert oversample_ratio >= 1
+    assert 0 <= importance_sample_ratio <= 1
+    batch_size = mask_pred.shape[0]
+    num_sampled = int(num_points * oversample_ratio)
+    point_coords = torch.rand(
+        batch_size, num_sampled, 2, device=mask_pred.device)
+    point_logits = point_sample(mask_pred, point_coords)
+    # It is crucial to calculate uncertainty based on the sampled
+    # prediction value for the points. Calculating uncertainties of the
+    # coarse predictions first and sampling them for points leads to
+    # incorrect results. To illustrate this: assume uncertainty func(
+    # logits)=-abs(logits), a sampled point between two coarse
+    # predictions with -1 and 1 logits has 0 logits, and therefore 0
+    # uncertainty value. However, if we calculate uncertainties for the
+    # coarse predictions first, both will have -1 uncertainty,
+    # and sampled point will get -1 uncertainty.
+    point_uncertainties = get_uncertainty(point_logits, labels)
+    num_uncertain_points = int(importance_sample_ratio * num_points)
+    num_random_points = num_points - num_uncertain_points
+    idx = torch.topk(
+        point_uncertainties[:, 0, :], k=num_uncertain_points, dim=1)[1]
+    shift = num_sampled * torch.arange(
+        batch_size, dtype=torch.long, device=mask_pred.device)
+    idx += shift[:, None]
+    point_coords = point_coords.view(-1, 2)[idx.view(-1), :].view(
+        batch_size, num_uncertain_points, 2)
+    if num_random_points > 0:
+        rand_roi_coords = torch.rand(
+            batch_size, num_random_points, 2, device=mask_pred.device)
+        point_coords = torch.cat((point_coords, rand_roi_coords), dim=1)
+    return point_coords
From eaf79b6199159c0b1aead6d02b92ee53b52ec064 Mon Sep 17 00:00:00 2001
From: Cedric Luo
Date: Wed, 16 Mar 2022 21:58:45 +0800
Subject: [PATCH 12/42] [Enhance] Add instance evaluation for coco_panoptic
 (#7313)

update comments

rename function and replace condition

rename

add message for proposal_fast when instance segmentation evaluation

set cocoGt as arg

update comments

update comments

update docstring and rename

add unit test

update docstring

add assert for instance eval
---
 mmdet/apis/test.py                          |  14 +++
 mmdet/datasets/coco.py                      | 101 +++++++++----
 mmdet/datasets/coco_panoptic.py             | 117 ++++++++++++++--
 .../test_datasets/test_panoptic_dataset.py  | 117 ++++++++++++++++++
 4 files changed, 315 insertions(+), 34 deletions(-)

diff --git a/mmdet/apis/test.py b/mmdet/apis/test.py
index 39af63045b2..973d3623d6e 100644
--- a/mmdet/apis/test.py
+++ b/mmdet/apis/test.py
@@ -64,6 +64,13 @@ def single_gpu_test(model,
             if isinstance(result[0], tuple):
                 result = [(bbox_results, encode_mask_results(mask_results))
                           for bbox_results, mask_results in result]
+            # This logic is only used in panoptic segmentation test.
+            elif isinstance(result[0], dict) and 'ins_results' in result[0]:
+                for j in range(len(result)):
+                    bbox_results, mask_results = result[j]['ins_results']
+                    result[j]['ins_results'] = (bbox_results,
+                                                encode_mask_results(mask_results))
+
         results.extend(result)

         for _ in range(batch_size):
@@ -104,6 +111,13 @@ def multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False):
             if isinstance(result[0], tuple):
                 result = [(bbox_results, encode_mask_results(mask_results))
                           for bbox_results, mask_results in result]
+            # This logic is only used in panoptic segmentation test.
+            elif isinstance(result[0], dict) and 'ins_results' in result[0]:
+                for j in range(len(result)):
+                    bbox_results, mask_results = result[j]['ins_results']
+                    result[j]['ins_results'] = (
+                        bbox_results, encode_mask_results(mask_results))
+
         results.extend(result)

     if rank == 0:
diff --git a/mmdet/datasets/coco.py b/mmdet/datasets/coco.py
index efd69490a84..46e3a6cbdd6 100644
--- a/mmdet/datasets/coco.py
+++ b/mmdet/datasets/coco.py
@@ -383,19 +383,24 @@ def format_results(self, results, jsonfile_prefix=None, **kwargs):
             result_files = self.results2json(results, jsonfile_prefix)
         return result_files, tmp_dir

-    def evaluate(self,
-                 results,
-                 metric='bbox',
-                 logger=None,
-                 jsonfile_prefix=None,
-                 classwise=False,
-                 proposal_nums=(100, 300, 1000),
-                 iou_thrs=None,
-                 metric_items=None):
-        """Evaluation in COCO protocol.
+    def evaluate_det_segm(self,
+                          results,
+                          result_files,
+                          coco_gt,
+                          metrics,
+                          logger=None,
+                          classwise=False,
+                          proposal_nums=(100, 300, 1000),
+                          iou_thrs=None,
+                          metric_items=None):
+        """Instance segmentation and object detection evaluation in COCO
+        protocol.

         Args:
-            results (list[list | tuple]): Testing results of the dataset.
+            results (list[list | tuple | dict]): Testing results of the
+                dataset.
+            result_files (dict[str, str]): a dict containing the json file
+                paths.
+            coco_gt (COCO): COCO API object with ground truth annotation.
             metric (str | list[str]): Metrics to be evaluated. Options are
                 'bbox', 'segm', 'proposal', 'proposal_fast'.
             logger (logging.Logger | str | None): Logger used for printing
                 related information during evaluation. Default: None.

         Returns:
             dict[str, float]: COCO style evaluation metric.
         """
         if iou_thrs is None:
             iou_thrs = np.linspace(
                 .5, 0.95, int(np.round((0.95 - .5) / .05)) + 1, endpoint=True)
         if metric_items is not None:
             if not isinstance(metric_items, list):
                 metric_items = [metric_items]

-        result_files, tmp_dir = self.format_results(results, jsonfile_prefix)
-
         eval_results = OrderedDict()
-        cocoGt = self.coco
         for metric in metrics:
             msg = f'Evaluating {metric}...'
             if logger is None:
                 msg = '\n' + msg
             print_log(msg, logger=logger)

             if metric == 'proposal_fast':
+                if isinstance(results[0], tuple):
+                    raise KeyError('proposal_fast is not supported for '
+                                   'instance segmentation results.')
                 ar = self.fast_eval_recall(
                     results, proposal_nums, iou_thrs, logger='silent')
                 log_msg = []
                     'of small/medium/large instances since v2.12.0. This '
                     'does not change the overall mAP calculation.',
                     UserWarning)
-                cocoDt = cocoGt.loadRes(predictions)
+                coco_det = coco_gt.loadRes(predictions)
             except IndexError:
                 print_log(
                     'The testing results of the whole dataset is empty.',
                     logger=logger,
                     level=logging.ERROR)
                 break

-            cocoEval = COCOeval(cocoGt, cocoDt, iou_type)
+            cocoEval = COCOeval(coco_gt, coco_det, iou_type)
             cocoEval.params.catIds = self.cat_ids
             cocoEval.params.imgIds = self.img_ids
             cocoEval.params.maxDets = list(proposal_nums)
@@ -590,6 +589,64 @@ def evaluate(self,
                 eval_results[f'{metric}_mAP_copypaste'] = (
                     f'{ap[0]:.3f} {ap[1]:.3f} {ap[2]:.3f} {ap[3]:.3f} '
                     f'{ap[4]:.3f} {ap[5]:.3f}')
+
+        return eval_results
+
+    def evaluate(self,
+                 results,
+                 metric='bbox',
+                 logger=None,
+                 jsonfile_prefix=None,
+                 classwise=False,
+                 proposal_nums=(100, 300, 1000),
+                 iou_thrs=None,
+                 metric_items=None):
+        """Evaluation in COCO protocol.
+
+        Args:
+            results (list[list | tuple]): Testing results of the dataset.
+            metric (str | list[str]): Metrics to be evaluated. Options are
+                'bbox', 'segm', 'proposal', 'proposal_fast'.
+            logger (logging.Logger | str | None): Logger used for printing
+                related information during evaluation. Default: None.
+            jsonfile_prefix (str | None): The prefix of json files. It includes
+                the file path and the prefix of filename, e.g., "a/b/prefix".
+                If not specified, a temp file will be created. Default: None.
+            classwise (bool): Whether to evaluate the AP for each class.
+            proposal_nums (Sequence[int]): Proposal number used for evaluating
+                recalls, such as recall@100, recall@1000.
+                Default: (100, 300, 1000).
+            iou_thrs (Sequence[float], optional): IoU threshold used for
+                evaluating recalls/mAPs. If set to a list, the average of all
+                IoUs will also be computed. If not specified, [0.50, 0.55,
+                0.60, 0.65, 0.70, 0.75, 0.80, 0.85, 0.90, 0.95] will be used.
+                Default: None.
+            metric_items (list[str] | str, optional): Metric items that will
+                be returned. If not specified, ``['AR@100', 'AR@300',
+                'AR@1000', 'AR_s@1000', 'AR_m@1000', 'AR_l@1000' ]`` will be
+                used when ``metric=='proposal'``, ``['mAP', 'mAP_50', 'mAP_75',
+                'mAP_s', 'mAP_m', 'mAP_l']`` will be used when
+                ``metric=='bbox' or metric=='segm'``.
+
+        Returns:
+            dict[str, float]: COCO style evaluation metric.
+        """
+
+        metrics = metric if isinstance(metric, list) else [metric]
+        allowed_metrics = ['bbox', 'segm', 'proposal', 'proposal_fast']
+        for metric in metrics:
+            if metric not in allowed_metrics:
+                raise KeyError(f'metric {metric} is not supported')
+
+        coco_gt = self.coco
+        self.cat_ids = coco_gt.get_cat_ids(cat_names=self.CLASSES)
+
+        result_files, tmp_dir = self.format_results(results, jsonfile_prefix)
+        eval_results = self.evaluate_det_segm(results, result_files, coco_gt,
+                                              metrics, logger, classwise,
+                                              proposal_nums, iou_thrs,
+                                              metric_items)
+
         if tmp_dir is not None:
             tmp_dir.cleanup()
         return eval_results
diff --git a/mmdet/datasets/coco_panoptic.py b/mmdet/datasets/coco_panoptic.py
index 91d7ecd8b47..7afc077cc03 100644
--- a/mmdet/datasets/coco_panoptic.py
+++ b/mmdet/datasets/coco_panoptic.py
@@ -140,6 +140,29 @@ class CocoPanopticDataset(CocoDataset):
         },
         ...
     ]
+
+    Args:
+        ann_file (str): Panoptic segmentation annotation file path.
+        pipeline (list[dict]): Processing pipeline.
+        ins_ann_file (str): Instance segmentation annotation file path.
+            Defaults to None.
+        classes (str | Sequence[str], optional): Specify classes to load.
+            If None, ``cls.CLASSES`` will be used. Defaults to None.
+        data_root (str, optional): Data root for ``ann_file``,
+            ``ins_ann_file``, ``img_prefix``, ``seg_prefix``, ``proposal_file``
+            if specified. Defaults to None.
+        img_prefix (str, optional): Prefix of path to images. Defaults to ''.
+        seg_prefix (str, optional): Prefix of path to segmentation files.
+            Defaults to None.
+        proposal_file (str, optional): Path to proposal file. Defaults to None.
+        test_mode (bool, optional): If set True, annotation will not be loaded.
+            Defaults to False.
+        filter_empty_gt (bool, optional): If set true, images without bounding
+            boxes of the dataset's classes will be filtered out. This option
+            only works when `test_mode=False`, i.e., we never filter images
+            during tests. Defaults to True.
+        file_client_args (:obj:`mmcv.ConfigDict` | dict): file client args.
+            Defaults to dict(backend='disk').
     """
     CLASSES = [
         'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train',
@@ -233,6 +256,31 @@ class CocoPanopticDataset(CocoDataset):
                (206, 186, 171), (152, 161, 64), (116, 112, 0), (0, 114, 143),
                (102, 102, 156), (250, 141, 255)]

+    def __init__(self,
+                 ann_file,
+                 pipeline,
+                 ins_ann_file=None,
+                 classes=None,
+                 data_root=None,
+                 img_prefix='',
+                 seg_prefix=None,
+                 proposal_file=None,
+                 test_mode=False,
+                 filter_empty_gt=True,
+                 file_client_args=dict(backend='disk')):
+        super().__init__(
+            ann_file,
+            pipeline,
+            classes=classes,
+            data_root=data_root,
+            img_prefix=img_prefix,
+            seg_prefix=seg_prefix,
+            proposal_file=proposal_file,
+            test_mode=test_mode,
+            filter_empty_gt=filter_empty_gt,
+            file_client_args=file_client_args)
+        self.ins_ann_file = ins_ann_file
+
     def load_annotations(self, ann_file):
         """Load annotation from COCO Panoptic style annotation file.

@@ -402,23 +450,41 @@ def _pan2json(self, results, outfile_prefix):
         return pan_json_results

     def results2json(self, results, outfile_prefix):
-        """Dump the panoptic results to a COCO panoptic style json file.
+        """Dump the results to a COCO style json file.
+
+        There are 4 types of results: proposals, bbox predictions, mask
+        predictions, panoptic segmentation predictions, and they have
+        different data types. This method will automatically recognize
+        the type, and dump them to json files.

         Args:
             results (dict): Testing results of the dataset.
             outfile_prefix (str): The filename prefix of the json files. If the
                 prefix is "somepath/xxx", the json files will be named
-                "somepath/xxx.panoptic.json"
+                "somepath/xxx.panoptic.json", "somepath/xxx.bbox.json",
+                "somepath/xxx.segm.json"

         Returns:
-            dict[str: str]: The key is 'panoptic' and the value is
-                corresponding filename.
+            dict[str: str]: Possible keys are "panoptic", "bbox", "segm", \
+                "proposal", and values are corresponding filenames.
         """
         result_files = dict()
-        pan_results = [result['pan_results'] for result in results]
-        pan_json_results = self._pan2json(pan_results, outfile_prefix)
-        result_files['panoptic'] = f'{outfile_prefix}.panoptic.json'
-        mmcv.dump(pan_json_results, result_files['panoptic'])
+        # panoptic segmentation results
+        if 'pan_results' in results[0]:
+            pan_results = [result['pan_results'] for result in results]
+            pan_json_results = self._pan2json(pan_results, outfile_prefix)
+            result_files['panoptic'] = f'{outfile_prefix}.panoptic.json'
+            mmcv.dump(pan_json_results, result_files['panoptic'])
+
+        # instance segmentation results
+        if 'ins_results' in results[0]:
+            ins_results = [result['ins_results'] for result in results]
+            bbox_json_results, segm_json_results = self._segm2json(ins_results)
+            result_files['bbox'] = f'{outfile_prefix}.bbox.json'
+            result_files['proposal'] = f'{outfile_prefix}.bbox.json'
+            result_files['segm'] = f'{outfile_prefix}.segm.json'
+            mmcv.dump(bbox_json_results, result_files['bbox'])
+            mmcv.dump(segm_json_results, result_files['segm'])

         return result_files

@@ -476,8 +542,16 @@ def evaluate_pan_json(self,
                 for k, v in zip(self.CLASSES, pq_results['classwise'].values())
             }
         print_panoptic_table(pq_results, classwise_results, logger=logger)
+        results = parse_pq_results(pq_results)
+        results['PQ_copypaste'] = (
+            f'{results["PQ"]:.3f} {results["SQ"]:.3f} '
+            f'{results["RQ"]:.3f} '
+            f'{results["PQ_th"]:.3f} {results["SQ_th"]:.3f} '
+            f'{results["RQ_th"]:.3f} '
+            f'{results["PQ_st"]:.3f} {results["SQ_st"]:.3f} '
+            f'{results["RQ_st"]:.3f}')

-        return parse_pq_results(pq_results)
+        return results

     def evaluate(self,
                  results,
@@ -491,8 +565,8 @@ def evaluate(self,

         Args:
             results (list[dict]): Testing results of the dataset.
-            metric (str | list[str]): Metrics to be evaluated. Only
-                support 'PQ' at present. 'pq' will be regarded as 'PQ'.
+            metric (str | list[str]): Metrics to be evaluated. 'PQ', 'bbox',
+                'segm', 'proposal' are supported. 'pq' will be regarded as 'PQ'.
             logger (logging.Logger | str | None): Logger used for printing
                 related information during evaluation. Default: None.
             jsonfile_prefix (str | None): The prefix of json files. It includes
@@ -510,7 +584,7 @@ def evaluate(self,
         metrics = metric if isinstance(metric, list) else [metric]
         # Compatible with lowercase 'pq'
         metrics = ['PQ' if metric == 'pq' else metric for metric in metrics]
-        allowed_metrics = ['PQ']  # todo: support other metrics like 'bbox'
+        allowed_metrics = ['PQ', 'bbox', 'segm', 'proposal']
         for metric in metrics:
             if metric not in allowed_metrics:
                 raise KeyError(f'metric {metric} is not supported')
@@ -524,6 +598,25 @@ def evaluate(self,
             eval_pan_results = self.evaluate_pan_json(
                 result_files, outfile_prefix, logger, classwise, nproc=nproc)
             eval_results.update(eval_pan_results)
+            metrics.remove('PQ')
+
+        if (('bbox' in metrics) or ('segm' in metrics)
+                or ('proposal' in metrics)):
+
+            assert 'ins_results' in results[0], 'instance segmentation ' \
+                'results are absent from results'
+
+            assert self.ins_ann_file is not None, 'Annotation '\
+                'file for instance segmentation or object detection ' \
+                'should not be None'
+
+            coco_gt = COCO(self.ins_ann_file)
+            self.cat_ids = coco_gt.get_cat_ids(cat_names=self.THING_CLASSES)
+
+            eval_ins_results = self.evaluate_det_segm(results, result_files,
                                                      coco_gt, metrics, logger,
                                                      classwise, **kwargs)
+            eval_results.update(eval_ins_results)

         if tmp_dir is not None:
             tmp_dir.cleanup()
diff --git a/tests/test_data/test_datasets/test_panoptic_dataset.py b/tests/test_data/test_datasets/test_panoptic_dataset.py
index fd571d219d1..d7e2edc919d 100644
--- a/tests/test_data/test_datasets/test_panoptic_dataset.py
+++ b/tests/test_data/test_datasets/test_panoptic_dataset.py
@@ -5,6 +5,7 @@
 import mmcv
 import numpy as np

+from mmdet.core import encode_mask_results
 from mmdet.datasets.api_wrappers import pq_compute_single_core
 from mmdet.datasets.coco_panoptic import INSTANCE_OFFSET, CocoPanopticDataset

@@ -337,3 +338,119 @@ def test_panoptic_evaluation():
     assert np.isclose(pq_all['sq'] * 100, 80.898)
     assert np.isclose(pq_all['rq'] * 100, 83.333)
     assert pq_all['n'] == 3
+
+
+def _create_instance_segmentation_gt_annotations(ann_file):
+    categories = [{
+        'id': 0,
+        'name': 'person',
+        'supercategory': 'person',
+        'isthing': 1
+    }, {
+        'id': 1,
+        'name': 'dog',
+        'supercategory': 'dog',
+        'isthing': 1
+    }, {
+        'id': 2,
+        'name': 'wall',
+        'supercategory': 'wall',
+        'isthing': 0
+    }]
+
+    images = [{
+        'id': 0,
+        'width': 80,
+        'height': 60,
+        'file_name': 'fake_name1.jpg',
+    }]
+
+    person1_polygon = [10, 10, 20, 10, 20, 50, 10, 50, 10, 10]
+    person2_polygon = [30, 10, 40, 10, 40, 50, 30, 50, 30, 10]
+    dog_polygon = [50, 10, 60, 10, 60, 15, 50, 15, 50, 10]
+
+    annotations = [
+        {
+            'id': 0,
+            'image_id': 0,
+            'category_id': 0,
+            'segmentation': [person1_polygon],
+            'area': 400,
+            'bbox': [10, 10, 10, 40],
+            'iscrowd': 0
+        },
+        {
+            'id': 1,
+            'image_id': 0,
+            'category_id': 0,
+            'segmentation': [person2_polygon],
+            'area': 400,
+            'bbox': [30, 10, 10, 40],
+            'iscrowd': 0
+        },
+        {
+            'id': 2,
+            'image_id': 0,
+            'category_id': 1,
+            'segmentation': [dog_polygon],
+            'area': 50,
+            'bbox': [50, 10, 10, 5],
+            'iscrowd': 0
+        },
+    ]
+
+    gt_json = {
+        'images': images,
+        'annotations': annotations,
+        'categories': categories
+    }
+
+    mmcv.dump(gt_json, ann_file)
+
+
+def test_instance_segmentation_evaluation():
+    pred_bbox = [
+        np.array([[11, 10, 20, 50, 0.8], [31, 10, 40, 50, 0.8]]),
+        np.array([[51, 10, 60, 15, 0.7]])
+    ]
+
+    person1_mask = np.zeros((60, 80), dtype=bool)
+    person1_mask[20:50, 11:20] = True
+    person2_mask = np.zeros((60, 80), dtype=bool)
+    person2_mask[20:50, 31:40] = True
+    dog_mask = np.zeros((60, 80), dtype=bool)
+    dog_mask[10:15, 51:60] = True
+
+    pred_mask = [[person1_mask, person2_mask], [
+        dog_mask,
+    ]]
+    results = [{'ins_results': (pred_bbox, encode_mask_results(pred_mask))}]
+
+    tmp_dir = tempfile.TemporaryDirectory()
+    pan_ann_file = osp.join(tmp_dir.name, 'panoptic.json')
+    ins_ann_file = osp.join(tmp_dir.name, 'instance.json')
+    _create_panoptic_gt_annotations(pan_ann_file)
+    _create_instance_segmentation_gt_annotations(ins_ann_file)
+
+    dataset = CocoPanopticDataset(
+        ann_file=pan_ann_file,
+        ins_ann_file=ins_ann_file,
+        seg_prefix=tmp_dir.name,
+        pipeline=[])
+    dataset.THING_CLASSES = ['person', 'dog']
+    dataset.STUFF_CLASSES = ['wall']
+    dataset.CLASSES = dataset.THING_CLASSES + dataset.STUFF_CLASSES
+    parsed_results = dataset.evaluate(results, metric=['segm', 'bbox'])
+
+    # Here are the results for instance segmentation:
+    # {
+    #     'segm_mAP': 0.5, 'segm_mAP_50': 0.626, 'segm_mAP_75': 0.5,
+    #     'segm_mAP_s': 0.5, 'segm_mAP_m': -1.0, 'segm_mAP_l': -1.0,
+    #     'segm_mAP_copypaste': '0.500 0.626 0.500 0.500 -1.000 -1.000',
+    #     'bbox_mAP': 0.564, 'bbox_mAP_50': 0.626, 'bbox_mAP_75': 0.626,
+    #     'bbox_mAP_s': 0.564, 'bbox_mAP_m': -1.0, 'bbox_mAP_l': -1.0,
+    #     'bbox_mAP_copypaste': '0.564 0.626 0.626 0.564 -1.000 -1.000'
+    # }
+
+    assert np.isclose(parsed_results['segm_mAP'], 0.5)
+    assert np.isclose(parsed_results['bbox_mAP'], 0.564)
From 57f63bc0dec539447ab8fea84395a35b5e3fc8ad Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Haian=20Huang=28=E6=B7=B1=E5=BA=A6=E7=9C=B8=29?=
 <1286304229@qq.com>
Date: Wed, 16 Mar 2022 23:23:44 +0800
Subject: [PATCH 13/42] [Feature] Add diff seeds to diff ranks. (#7432)

---
 mmdet/core/utils/__init__.py                 |  4 +--
 mmdet/core/utils/dist_utils.py               | 34 +++++++++++++++++++
 mmdet/datasets/builder.py                    |  2 ++
 .../datasets/samplers/distributed_sampler.py |  8 +++--
 mmdet/datasets/samplers/infinite_sampler.py  | 12 +++++--
 tools/train.py                               |  6 ++++
 6 files changed, 60 insertions(+), 6 deletions(-)

diff --git a/mmdet/core/utils/__init__.py b/mmdet/core/utils/__init__.py
index a95ac048f67..3f0d07081a2 100644
--- a/mmdet/core/utils/__init__.py
+++ b/mmdet/core/utils/__init__.py
@@ -1,6 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 from .dist_utils import (DistOptimizerHook, all_reduce_dict, allreduce_grads,
-                         reduce_mean)
+                         reduce_mean, sync_random_seed)
 from .misc import (center_of_mass, filter_scores_and_topk, flip_tensor,
                    generate_coordinate, mask2ndarray, multi_apply,
                    select_single_mlvl, unmap)
@@ -9,5 +9,5 @@
     'allreduce_grads', 'DistOptimizerHook', 'reduce_mean', 'multi_apply',
     'unmap', 'mask2ndarray', 'flip_tensor', 'all_reduce_dict',
     'center_of_mass', 'generate_coordinate', 'select_single_mlvl',
-    'filter_scores_and_topk'
+    'filter_scores_and_topk', 'sync_random_seed'
 ]
diff --git a/mmdet/core/utils/dist_utils.py b/mmdet/core/utils/dist_utils.py
index 18b6870d05f..27ecbb004c5 100644
--- a/mmdet/core/utils/dist_utils.py
+++ b/mmdet/core/utils/dist_utils.py
@@ -4,6 +4,7 @@
 import warnings
 from collections import OrderedDict

+import numpy as np
 import torch
 import torch.distributed as dist
 from mmcv.runner import OptimizerHook, get_dist_info
@@ -151,3 +152,36 @@ def all_reduce_dict(py_dict, op='sum', group=None, to_float=True):
     if isinstance(py_dict, OrderedDict):
         out_dict = OrderedDict(out_dict)
     return out_dict
+
+
+def sync_random_seed(seed=None, device='cuda'):
+    """Make sure different ranks share the same seed.
+
+    All workers must call this function, otherwise it will deadlock.
+    This method is generally used in `DistributedSampler`,
+    because the seed should be identical across all processes
+    in the distributed group.
+
+    Args:
+        seed (int, Optional): The seed. Default to None.
+        device (str): The device where the seed will be put on.
+            Default to 'cuda'.
+
+    Returns:
+        int: Seed to be used.
+    """
+    if seed is None:
+        seed = np.random.randint(2**31)
+    assert isinstance(seed, int)
+
+    rank, world_size = get_dist_info()
+
+    if world_size == 1:
+        return seed
+
+    if rank == 0:
+        random_num = torch.tensor(seed, dtype=torch.int32, device=device)
+    else:
+        random_num = torch.tensor(0, dtype=torch.int32, device=device)
+    dist.broadcast(random_num, src=0)
+    return random_num.item()
diff --git a/mmdet/datasets/builder.py b/mmdet/datasets/builder.py
index 30e1ee91a05..9cefb820358 100644
--- a/mmdet/datasets/builder.py
+++ b/mmdet/datasets/builder.py
@@ -6,6 +6,7 @@
 from functools import partial

 import numpy as np
+import torch
 from mmcv.parallel import collate
 from mmcv.runner import get_dist_info
 from mmcv.utils import TORCH_VERSION, Registry, build_from_cfg, digit_version
@@ -197,3 +198,4 @@ def worker_init_fn(worker_id, num_workers, rank, seed):
     worker_seed = num_workers * rank + worker_id + seed
     np.random.seed(worker_seed)
     random.seed(worker_seed)
+    torch.manual_seed(worker_seed)
diff --git a/mmdet/datasets/samplers/distributed_sampler.py b/mmdet/datasets/samplers/distributed_sampler.py
index d9c25bacf0c..3ed21bdb2c4 100644
--- a/mmdet/datasets/samplers/distributed_sampler.py
+++ b/mmdet/datasets/samplers/distributed_sampler.py
@@ -4,6 +4,8 @@
 import torch
 from torch.utils.data import DistributedSampler as _DistributedSampler

+from mmdet.core.utils import sync_random_seed
+

 class DistributedSampler(_DistributedSampler):

@@ -15,8 +17,10 @@ def __init__(self,
                  seed=0):
         super().__init__(
             dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle)
-        # for the compatibility from PyTorch 1.3+
-        self.seed = seed if seed is not None else 0
+        # Must be the same across all workers. If None, will use a
+        # random seed shared among workers
+        # (require synchronization among all workers)
+        self.seed = sync_random_seed(seed)

     def __iter__(self):
         # deterministically shuffle based on epoch
diff --git a/mmdet/datasets/samplers/infinite_sampler.py b/mmdet/datasets/samplers/infinite_sampler.py
index 421c0de3369..cfea01a345d 100644
--- a/mmdet/datasets/samplers/infinite_sampler.py
+++ b/mmdet/datasets/samplers/infinite_sampler.py
@@ -6,6 +6,8 @@
 from mmcv.runner import get_dist_info
 from torch.utils.data.sampler import Sampler

+from mmdet.core.utils import sync_random_seed
+

 class InfiniteGroupBatchSampler(Sampler):
     """Similar to `BatchSampler` wrapping a `GroupSampler`. It is designed for
@@ -48,7 +50,10 @@ def __init__(self,
         self.world_size = world_size
         self.dataset = dataset
         self.batch_size = batch_size
-        self.seed = seed if seed is not None else 0
+        # Must be the same across all workers. If None, will use a
+        # random seed shared among workers
+        # (require synchronization among all workers)
+        self.seed = sync_random_seed(seed)
         self.shuffle = shuffle

         assert hasattr(self.dataset, 'flag')
@@ -133,7 +138,10 @@ def __init__(self,
         self.world_size = world_size
         self.dataset = dataset
         self.batch_size = batch_size
-        self.seed = seed if seed is not None else 0
+        # Must be the same across all workers. If None, will use a
+        # random seed shared among workers
+        # (require synchronization among all workers)
+        self.seed = sync_random_seed(seed)
         self.shuffle = shuffle
         self.size = len(dataset)
         self.indices = self._indices_of_rank()
diff --git a/tools/train.py b/tools/train.py
index b9e99815e41..2ccc1c88f84 100644
--- a/tools/train.py
+++ b/tools/train.py
@@ -8,6 +8,7 @@

 import mmcv
 import torch
+import torch.distributed as dist
 from mmcv import Config, DictAction
 from mmcv.runner import get_dist_info, init_dist
 from mmcv.utils import get_git_hash
@@ -52,6 +53,10 @@ def parse_args():
         help='id of gpu to use '
         '(only applicable to non-distributed training)')
     parser.add_argument('--seed', type=int, default=None, help='random seed')
+    parser.add_argument(
+        '--diff-seed',
+        action='store_true',
+        help='Whether or not to set different seeds for different ranks')
     parser.add_argument(
         '--deterministic',
         action='store_true',
@@ -169,6 +174,7 @@ def main():

     # set random seeds
     seed = init_random_seed(args.seed)
+    seed = seed + dist.get_rank() if args.diff_seed else seed
     logger.info(f'Set random seed to {seed}, '
                 f'deterministic: {args.deterministic}')
     set_random_seed(seed, deterministic=args.deterministic)
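[Editor's note] A launch command exercising the new `--diff-seed` flag could look like the following sketch; the config path and GPU count are illustrative, and it assumes the usual mmdetection behavior of `dist_train.sh` forwarding extra arguments to `tools/train.py`:

```shell
# Illustrative only: fix the base seed but derive a distinct seed per rank.
bash ./tools/dist_train.sh configs/yolox/yolox_s_8x8_300e_coco.py 8 \
    --seed 0 --diff-seed
```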
If None, will use a + # random seed shared among workers + # (require synchronization among all workers) + self.seed = sync_random_seed(seed) self.shuffle = shuffle self.size = len(dataset) self.indices = self._indices_of_rank() diff --git a/tools/train.py b/tools/train.py index b9e99815e41..2ccc1c88f84 100644 --- a/tools/train.py +++ b/tools/train.py @@ -8,6 +8,7 @@ import mmcv import torch +import torch.distributed as dist from mmcv import Config, DictAction from mmcv.runner import get_dist_info, init_dist from mmcv.utils import get_git_hash @@ -52,6 +53,10 @@ def parse_args(): help='id of gpu to use ' '(only applicable to non-distributed training)') parser.add_argument('--seed', type=int, default=None, help='random seed') + parser.add_argument( + '--diff-seed', + action='store_true', + help='Whether or not set different seeds for different ranks') parser.add_argument( '--deterministic', action='store_true', @@ -169,6 +174,7 @@ def main(): # set random seeds seed = init_random_seed(args.seed) + seed = seed + dist.get_rank() if args.diff_seed else seed logger.info(f'Set random seed to {seed}, ' f'deterministic: {args.deterministic}') set_random_seed(seed, deterministic=args.deterministic) From a23b6b16ef7287d5ac53a9ee5deb6735992e230b Mon Sep 17 00:00:00 2001 From: Yue Zhou <592267829@qq.com> Date: Wed, 16 Mar 2022 23:24:31 +0800 Subject: [PATCH 14/42] Enhance the robustness of analyze_logs.py (#7407) --- tools/analysis_tools/analyze_logs.py | 29 +++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) mode change 100644 => 100755 tools/analysis_tools/analyze_logs.py diff --git a/tools/analysis_tools/analyze_logs.py b/tools/analysis_tools/analyze_logs.py old mode 100644 new mode 100755 index 8ca81d38062..451f1d64d2c --- a/tools/analysis_tools/analyze_logs.py +++ b/tools/analysis_tools/analyze_logs.py @@ -17,6 +17,10 @@ def cal_train_time(log_dicts, args): all_times.append(log_dict[epoch]['time']) else: all_times.append(log_dict[epoch]['time'][1:]) + if not all_times: + raise KeyError( + 'Please reduce the log interval in the config so that' + 'interval is less than iterations of one epoch.') all_times = np.array(all_times) epoch_ave_time = all_times.mean(-1) slowest_epoch = epoch_ave_time.argmax() @@ -50,12 +54,21 @@ def plot_curve(log_dicts, args): epochs = list(log_dict.keys()) for j, metric in enumerate(metrics): print(f'plot curve of {args.json_logs[i]}, metric is {metric}') - if metric not in log_dict[epochs[0]]: + if metric not in log_dict[epochs[int(args.start_epoch) - 1]]: + if 'mAP' in metric: + raise KeyError( + f'{args.json_logs[i]} does not contain metric ' + f'{metric}. Please check if "--no-validate" is ' + 'specified when you trained the model.') raise KeyError( - f'{args.json_logs[i]} does not contain metric {metric}') + f'{args.json_logs[i]} does not contain metric {metric}. 
'
+                    'Please reduce the log interval in the config so that '
+                    'interval is less than iterations of one epoch.')
 
             if 'mAP' in metric:
-                xs = np.arange(1, max(epochs) + 1)
+                xs = np.arange(
+                    int(args.start_epoch),
+                    max(epochs) + 1, int(args.eval_interval))
                 ys = []
                 for epoch in epochs:
                     ys += log_dict[epoch][metric]
@@ -104,6 +117,16 @@ def add_plot_parser(subparsers):
         nargs='+',
         default=['bbox_mAP'],
         help='the metric that you want to plot')
+    parser_plt.add_argument(
+        '--start-epoch',
+        type=str,
+        default='1',
+        help='the epoch that you want to start')
+    parser_plt.add_argument(
+        '--eval-interval',
+        type=str,
+        default='1',
+        help='the eval interval when training')
     parser_plt.add_argument('--title', type=str, help='title of figure')
     parser_plt.add_argument(
         '--legend',

From 95f199ccfe2b1838bbbeed00dbd9d0eb0f3a82c6 Mon Sep 17 00:00:00 2001
From: Yue Zhou <592267829@qq.com>
Date: Wed, 16 Mar 2022 23:24:57 +0800
Subject: [PATCH 15/42] [Feature] Add multi machine dist_train (#7415)

---
 docs/en/1_exist_data_model.md    | 19 ++++++++++++++++---
 docs/zh_cn/1_exist_data_model.md | 20 +++++++++++++++++---
 tools/dist_test.sh               | 16 ++++++++++++++--
 tools/dist_train.sh              | 15 +++++++++++++--
 4 files changed, 60 insertions(+), 10 deletions(-)

diff --git a/docs/en/1_exist_data_model.md b/docs/en/1_exist_data_model.md
index ce1be7bf8ad..02c13e0e3e5 100644
--- a/docs/en/1_exist_data_model.md
+++ b/docs/en/1_exist_data_model.md
@@ -584,10 +584,23 @@ CUDA_VISIBLE_DEVICES=0,1,2,3 PORT=29500 ./tools/dist_train.sh ${CONFIG_FILE} 4
 CUDA_VISIBLE_DEVICES=4,5,6,7 PORT=29501 ./tools/dist_train.sh ${CONFIG_FILE} 4
 ```
 
-### Training on multiple nodes
+### Train with multiple machines
 
-MMDetection relies on `torch.distributed` package for distributed training.
-Thus, as a basic usage, one can launch distributed training via PyTorch's [launch utility](https://pytorch.org/docs/stable/distributed.html#launch-utility).
+If you launch with multiple machines simply connected with Ethernet, you can run the following commands:
+
+On the first machine:
+
+```shell
+NNODES=2 NODE_RANK=0 PORT=$MASTER_PORT MASTER_ADDR=$MASTER_ADDR sh tools/dist_train.sh $CONFIG $GPUS
+```
+
+On the second machine:
+
+```shell
+NNODES=2 NODE_RANK=1 PORT=$MASTER_PORT MASTER_ADDR=$MASTER_ADDR sh tools/dist_train.sh $CONFIG $GPUS
+```
+
+Usually it is slow if you do not have high-speed networking like InfiniBand.
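+
+As a concrete sketch (the IP address and GPU count below are hypothetical), suppose both machines have 8 GPUs and the first machine is reachable at 10.1.1.10; the two launches would then be:
+
+```shell
+# on the first machine (rank 0), which owns the master address 10.1.1.10
+NNODES=2 NODE_RANK=0 PORT=29500 MASTER_ADDR=10.1.1.10 sh tools/dist_train.sh configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py 8
+
+# on the second machine (rank 1), pointing at the same master address and port
+NNODES=2 NODE_RANK=1 PORT=29500 MASTER_ADDR=10.1.1.10 sh tools/dist_train.sh configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py 8
+```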
### Manage jobs with Slurm diff --git a/docs/zh_cn/1_exist_data_model.md b/docs/zh_cn/1_exist_data_model.md index 9b1e9cc86d8..05322604517 100644 --- a/docs/zh_cn/1_exist_data_model.md +++ b/docs/zh_cn/1_exist_data_model.md @@ -566,11 +566,25 @@ CUDA_VISIBLE_DEVICES=0,1,2,3 PORT=29500 ./tools/dist_train.sh ${CONFIG_FILE} 4 CUDA_VISIBLE_DEVICES=4,5,6,7 PORT=29501 ./tools/dist_train.sh ${CONFIG_FILE} 4 ``` -#### 在多个节点上训练 +### 使用多台机器训练 -MMDetection 是依赖 `torch.distributed` 包进行分布式训练的。因此,我们可以通过 PyTorch 的 [启动工具](https://pytorch.org/docs/stable/distributed.html#launch-utility) 来进行基本地使用。 +如果您想使用由 ethernet 连接起来的多台机器, 您可以使用以下命令: -#### 使用 Slurm 来管理任务 +在第一台机器上: + +```shell +NNODES=2 NODE_RANK=0 PORT=$MASTER_PORT MASTER_ADDR=$MASTER_ADDR sh tools/dist_train.sh $CONFIG $GPUS +``` + +在第二台机器上: + +```shell +NNODES=2 NODE_RANK=1 PORT=$MASTER_PORT MASTER_ADDR=$MASTER_ADDR sh tools/dist_train.sh $CONFIG $GPUS +``` + +但是,如果您不使用高速网路连接这几台机器的话,训练将会非常慢。 + +### 使用 Slurm 来管理任务 Slurm 是一个常见的计算集群调度系统。在 Slurm 管理的集群上,你可以使用 `slurm.sh` 来开启训练任务。它既支持单节点训练也支持多节点训练。 diff --git a/tools/dist_test.sh b/tools/dist_test.sh index 3c74ec6ecd1..dea131b43ea 100755 --- a/tools/dist_test.sh +++ b/tools/dist_test.sh @@ -3,8 +3,20 @@ CONFIG=$1 CHECKPOINT=$2 GPUS=$3 +NNODES=${NNODES:-1} +NODE_RANK=${NODE_RANK:-0} PORT=${PORT:-29500} +MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ -python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \ - $(dirname "$0")/test.py $CONFIG $CHECKPOINT --launcher pytorch ${@:4} +python -m torch.distributed.launch \ + --nnodes=$NNODES \ + --node_rank=$NODE_RANK \ + --master_addr=$MASTER_ADDR \ + --nproc_per_node=$GPUS \ + --master_port=$PORT \ + $(dirname "$0")/test.py \ + $CONFIG \ + $CHECKPOINT \ + --launcher pytorch \ + ${@:4} diff --git a/tools/dist_train.sh b/tools/dist_train.sh index 5b43fffbf28..aa71bf4ae98 100755 --- a/tools/dist_train.sh +++ b/tools/dist_train.sh @@ -2,8 +2,19 @@ CONFIG=$1 GPUS=$2 +NNODES=${NNODES:-1} +NODE_RANK=${NODE_RANK:-0} PORT=${PORT:-29500} +MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ -python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \ - $(dirname "$0")/train.py $CONFIG --launcher pytorch ${@:3} +python -m torch.distributed.launch \ + --nnodes=$NNODES \ + --node_rank=$NODE_RANK \ + --master_addr=$MASTER_ADDR \ + --nproc_per_node=$GPUS \ + --master_port=$PORT \ + $(dirname "$0")/train.py \ + $CONFIG \ + --seed 0 \ + --launcher pytorch ${@:3} From 86c7d8d375566064ac449d8ff7043db89f722b56 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Haian=20Huang=28=E6=B7=B1=E5=BA=A6=E7=9C=B8=29?= <1286304229@qq.com> Date: Fri, 18 Mar 2022 15:07:45 +0800 Subject: [PATCH 16/42] [Enchance] Supplementary notes of sync_random_seed (#7440) * Supplementary Notes * update * update * update --- mmdet/core/utils/dist_utils.py | 6 ++++++ mmdet/datasets/samplers/distributed_sampler.py | 14 +++++++++++--- mmdet/datasets/samplers/infinite_sampler.py | 18 ++++++++++++------ 3 files changed, 29 insertions(+), 9 deletions(-) diff --git a/mmdet/core/utils/dist_utils.py b/mmdet/core/utils/dist_utils.py index 27ecbb004c5..8760774fd90 100644 --- a/mmdet/core/utils/dist_utils.py +++ b/mmdet/core/utils/dist_utils.py @@ -162,6 +162,12 @@ def sync_random_seed(seed=None, device='cuda'): because the seed should be identical across all processes in the distributed group. + In distributed sampling, different ranks should sample non-overlapped + data in the dataset. 
Therefore, this function is used to make sure that + each rank shuffles the data indices in the same order based + on the same seed. Then different ranks could use different indices + to select non-overlapped data from the same data list. + Args: seed (int, Optional): The seed. Default to None. device (str): The device where the seed will be put on. diff --git a/mmdet/datasets/samplers/distributed_sampler.py b/mmdet/datasets/samplers/distributed_sampler.py index 3ed21bdb2c4..ab544a9c469 100644 --- a/mmdet/datasets/samplers/distributed_sampler.py +++ b/mmdet/datasets/samplers/distributed_sampler.py @@ -17,15 +17,23 @@ def __init__(self, seed=0): super().__init__( dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle) - # Must be the same across all workers. If None, will use a - # random seed shared among workers - # (require synchronization among all workers) + + # In distributed sampling, different ranks should sample + # non-overlapped data in the dataset. Therefore, this function + # is used to make sure that each rank shuffles the data indices + # in the same order based on the same seed. Then different ranks + # could use different indices to select non-overlapped data from the + # same data list. self.seed = sync_random_seed(seed) def __iter__(self): # deterministically shuffle based on epoch if self.shuffle: g = torch.Generator() + # When :attr:`shuffle=True`, this ensures all replicas + # use a different random ordering for each epoch. + # Otherwise, the next iteration of this sampler will + # yield the same ordering. g.manual_seed(self.epoch + self.seed) indices = torch.randperm(len(self.dataset), generator=g).tolist() else: diff --git a/mmdet/datasets/samplers/infinite_sampler.py b/mmdet/datasets/samplers/infinite_sampler.py index cfea01a345d..d42487e6ac0 100644 --- a/mmdet/datasets/samplers/infinite_sampler.py +++ b/mmdet/datasets/samplers/infinite_sampler.py @@ -50,9 +50,12 @@ def __init__(self, self.world_size = world_size self.dataset = dataset self.batch_size = batch_size - # Must be the same across all workers. If None, will use a - # random seed shared among workers - # (require synchronization among all workers) + # In distributed sampling, different ranks should sample + # non-overlapped data in the dataset. Therefore, this function + # is used to make sure that each rank shuffles the data indices + # in the same order based on the same seed. Then different ranks + # could use different indices to select non-overlapped data from the + # same data list. self.seed = sync_random_seed(seed) self.shuffle = shuffle @@ -138,9 +141,12 @@ def __init__(self, self.world_size = world_size self.dataset = dataset self.batch_size = batch_size - # Must be the same across all workers. If None, will use a - # random seed shared among workers - # (require synchronization among all workers) + # In distributed sampling, different ranks should sample + # non-overlapped data in the dataset. Therefore, this function + # is used to make sure that each rank shuffles the data indices + # in the same order based on the same seed. Then different ranks + # could use different indices to select non-overlapped data from the + # same data list. 
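+        # For example (an illustrative sketch, not part of the upstream
+        # comment): with world_size=2 and a shared shuffled order
+        # [3, 0, 2, 1], rank 0 slices out indices [3, 2] and rank 1 slices
+        # out [0, 1], so the two ranks consume disjoint halves of the same
+        # shuffled list.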
self.seed = sync_random_seed(seed) self.shuffle = shuffle self.size = len(dataset) From 86037650f243a1ab0a515a22e831ea5dcddd6a7d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Haian=20Huang=28=E6=B7=B1=E5=BA=A6=E7=9C=B8=29?= <1286304229@qq.com> Date: Sat, 19 Mar 2022 17:39:24 +0800 Subject: [PATCH 17/42] Unified name of orig as ori (#7456) --- mmdet/datasets/pipelines/instaboost.py | 4 ++-- mmdet/datasets/pipelines/transforms.py | 6 +++--- tools/analysis_tools/get_flops.py | 6 +++--- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/mmdet/datasets/pipelines/instaboost.py b/mmdet/datasets/pipelines/instaboost.py index 08ff5417bcd..ca10c4c751f 100644 --- a/mmdet/datasets/pipelines/instaboost.py +++ b/mmdet/datasets/pipelines/instaboost.py @@ -98,7 +98,7 @@ def _parse_anns(self, results, anns, img): def __call__(self, results): img = results['img'] - orig_type = img.dtype + ori_type = img.dtype anns = self._load_anns(results) if np.random.choice([0, 1], p=[1 - self.aug_ratio, self.aug_ratio]): try: @@ -109,7 +109,7 @@ def __call__(self, results): anns, img = instaboost.get_new_data( anns, img.astype(np.uint8), self.cfg, background=None) - results = self._parse_anns(results, anns, img.astype(orig_type)) + results = self._parse_anns(results, anns, img.astype(ori_type)) return results def __repr__(self): diff --git a/mmdet/datasets/pipelines/transforms.py b/mmdet/datasets/pipelines/transforms.py index 15f14779c46..2aedbb3ea4b 100644 --- a/mmdet/datasets/pipelines/transforms.py +++ b/mmdet/datasets/pipelines/transforms.py @@ -520,9 +520,9 @@ def __call__(self, results): random_shift_y = random.randint(-self.max_shift_px, self.max_shift_px) new_x = max(0, random_shift_x) - orig_x = max(0, -random_shift_x) + ori_x = max(0, -random_shift_x) new_y = max(0, random_shift_y) - orig_y = max(0, -random_shift_y) + ori_y = max(0, -random_shift_y) # TODO: support mask and semantic segmentation maps. for key in results.get('bbox_fields', []): @@ -558,7 +558,7 @@ def __call__(self, results): new_h = img_h - np.abs(random_shift_y) new_w = img_w - np.abs(random_shift_x) new_img[new_y:new_y + new_h, new_x:new_x + new_w] \ - = img[orig_y:orig_y + new_h, orig_x:orig_x + new_w] + = img[ori_y:ori_y + new_h, ori_x:ori_x + new_w] results[key] = new_img return results diff --git a/tools/analysis_tools/get_flops.py b/tools/analysis_tools/get_flops.py index 0ac59a50049..4df87323b4a 100644 --- a/tools/analysis_tools/get_flops.py +++ b/tools/analysis_tools/get_flops.py @@ -52,7 +52,7 @@ def main(): h, w = args.shape else: raise ValueError('invalid input shape') - orig_shape = (3, h, w) + ori_shape = (3, h, w) divisor = args.size_divisor if divisor > 0: h = int(np.ceil(h / divisor)) * divisor @@ -83,9 +83,9 @@ def main(): split_line = '=' * 30 if divisor > 0 and \ - input_shape != orig_shape: + input_shape != ori_shape: print(f'{split_line}\nUse size divisor set input shape ' - f'from {orig_shape} to {input_shape}\n') + f'from {ori_shape} to {input_shape}\n') print(f'{split_line}\nInput shape: {input_shape}\n' f'Flops: {flops}\nParams: {params}\n{split_line}') print('!!!Please be cautious if you use the results in papers. 
' From 6e9f1579437433acfaea3a558b8deb3f6c6ef7b2 Mon Sep 17 00:00:00 2001 From: Shilong Zhang <61961338+jshilong@users.noreply.github.com> Date: Tue, 22 Mar 2022 15:30:51 +0800 Subject: [PATCH 18/42] Remove duplicate link (#7480) --- docs/en/projects.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/en/projects.md b/docs/en/projects.md index 220825dab68..7149b7e4f7a 100644 --- a/docs/en/projects.md +++ b/docs/en/projects.md @@ -50,7 +50,6 @@ Methods already supported and maintained by MMDetection are not listed. - A Ranking-based, Balanced Loss Function Unifying Classification and Localisation in Object Detection, NeurIPS2020 [[paper]](https://arxiv.org/abs/2009.13592)[[github]](https://github.com/kemaloksuz/aLRPLoss) - RelationNet++: Bridging Visual Representations for Object Detection via Transformer Decoder, NeurIPS2020 [[paper]](https://arxiv.org/abs/2010.15831)[[github]](https://github.com/microsoft/RelationNet2) - Generalized Focal Loss V2: Learning Reliable Localization Quality Estimation for Dense Object Detection, CVPR2021[[paper]](https://arxiv.org/abs/2011.12885)[[github]](https://github.com/implus/GFocalV2) -- Instances as Queries, ICCV2021[[paper]](http://arxiv.org/abs/2105.01928)[[github]](https://github.com/hustvl/QueryInst) - Swin Transformer: Hierarchical Vision Transformer using Shifted Windows, ICCV2021[[paper]](https://arxiv.org/abs/2103.14030)[[github]](https://github.com/SwinTransformer/) - Focal Transformer: Focal Self-attention for Local-Global Interactions in Vision Transformers, NeurIPS2021[[paper]](https://arxiv.org/abs/2107.00641)[[github]](https://github.com/microsoft/Focal-Transformer) - End-to-End Semi-Supervised Object Detection with Soft Teacher, ICCV2021[[paper]](https://arxiv.org/abs/2106.09018)[[github]](https://github.com/microsoft/SoftTeacher) From 4a960ece1fb50f06de3f1ac535e159800b0c7865 Mon Sep 17 00:00:00 2001 From: jbwang1997 Date: Tue, 22 Mar 2022 23:12:53 +0800 Subject: [PATCH 19/42] [Fix] Adding comments for MaxIoUAssigner (#7464) * Add comments * update url * Add description in docstring * Update sphinx * update commit --- mmdet/core/bbox/assigners/max_iou_assigner.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mmdet/core/bbox/assigners/max_iou_assigner.py b/mmdet/core/bbox/assigners/max_iou_assigner.py index 28d9ba3a892..676421f7653 100644 --- a/mmdet/core/bbox/assigners/max_iou_assigner.py +++ b/mmdet/core/bbox/assigners/max_iou_assigner.py @@ -23,6 +23,11 @@ class MaxIoUAssigner(BaseAssigner): min_pos_iou (float): Minimum iou for a bbox to be considered as a positive bbox. Positive samples can have smaller IoU than pos_iou_thr due to the 4th step (assign max IoU sample to each gt). + `min_pos_iou` is set to avoid assigning bboxes that have extremely + small iou with GT as positive samples. It brings about 0.3 mAP + improvements in 1x schedule but does not affect the performance of + 3x schedule. More comparisons can be found in + `PR #7464 `_. gt_max_assign_all (bool): Whether to assign all bboxes with the same highest overlap with some gt to that gt. 
ignore_iof_thr (float): IoF threshold for ignoring bboxes (if From 7f5849ccb1a64e8bb0e6769ef7a6bf338381e9a7 Mon Sep 17 00:00:00 2001 From: Cedric Luo Date: Tue, 22 Mar 2022 23:14:46 +0800 Subject: [PATCH 20/42] [Enhance] Update docstring of cross entropy loss (#7472) * update docstring of ce loss update docstring * update docstring --- mmdet/models/losses/cross_entropy_loss.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/mmdet/models/losses/cross_entropy_loss.py b/mmdet/models/losses/cross_entropy_loss.py index f3aca80f7a4..5777aebd290 100644 --- a/mmdet/models/losses/cross_entropy_loss.py +++ b/mmdet/models/losses/cross_entropy_loss.py @@ -81,8 +81,12 @@ def binary_cross_entropy(pred, """Calculate the binary CrossEntropy loss. Args: - pred (torch.Tensor): The prediction with shape (N, 1). - label (torch.Tensor): The learning label of the prediction. + pred (torch.Tensor): The prediction with shape (N, 1) or (N, ). + When the shape of pred is (N, 1), label will be expanded to + one-hot format, and when the shape of pred is (N, ), label + will not be expanded to one-hot format. + label (torch.Tensor): The learning label of the prediction, + with shape (N, ). weight (torch.Tensor, optional): Sample-wise loss weight. reduction (str, optional): The method used to reduce the loss. Options are "none", "mean" and "sum". From 661356ebe99ea68ef91945f0e612f89606097c56 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Haian=20Huang=28=E6=B7=B1=E5=BA=A6=E7=9C=B8=29?= <1286304229@qq.com> Date: Tue, 22 Mar 2022 23:15:57 +0800 Subject: [PATCH 21/42] =?UTF-8?q?[Enchance]=20Added=20documentation=20on?= =?UTF-8?q?=20mmdet=20using=20mmcls=E2=80=98s=20backbone=20(#7438)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add how-to docs * update * update index.rst * update * update --- docs/en/tutorials/how_to.md | 68 ++++++++++++++++++++++++++++++++++ docs/en/tutorials/index.rst | 1 + docs/zh_cn/tutorials/how_to.md | 66 +++++++++++++++++++++++++++++++++ docs/zh_cn/tutorials/index.rst | 2 + 4 files changed, 137 insertions(+) create mode 100644 docs/en/tutorials/how_to.md create mode 100644 docs/zh_cn/tutorials/how_to.md diff --git a/docs/en/tutorials/how_to.md b/docs/en/tutorials/how_to.md new file mode 100644 index 00000000000..c2043a46b54 --- /dev/null +++ b/docs/en/tutorials/how_to.md @@ -0,0 +1,68 @@ +# Tutorial 11: How to xxx + +This tutorial collects answers to any `How to xxx with MMDetection`. Feel free to update this doc if you meet new questions about `How to` and find the answers! + +## Use backbone network through MMClassification + +The model registry in MMDet, MMCls, MMSeg all inherit from the root registry in MMCV. This allows these repositories to directly use the modules already implemented by each other. Therefore, users can use backbone networks from MMClassification in MMDetection without implementing a network that already exists in MMClassification. + +### Use backbone network implemented in MMClassification + +Suppose you want to use `MobileNetV3-small` as the backbone network of `RetinaNet`, the example config is as the following. 
+ +```python +_base_ = [ + '../_base_/models/retinanet_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +# please install mmcls>=0.20.0 +# import mmcls.models to trigger register_module in mmcls +custom_imports = dict(imports=['mmcls.models'], allow_failed_imports=False) +pretrained = 'https://download.openmmlab.com/mmclassification/v0/mobilenet_v3/convert/mobilenet_v3_small-8427ecf0.pth' +model = dict( + backbone=dict( + _delete_=True, # Delete the backbone field in _base_ + type='mmcls.MobileNetV3', # Using MobileNetV3 from mmcls + arch='small', + out_indices=(3, 8, 11), # Modify out_indices + init_cfg=dict( + type='Pretrained', + checkpoint=pretrained, + prefix='backbone.')), # The pre-trained weights of backbone network in MMCls have prefix='backbone.'. The prefix in the keys will be removed so that these weights can be normally loaded. + # Modify in_channels + neck=dict(in_channels=[24, 48, 96], start_level=0)) +``` + +### Use backbone network in TIMM through MMClassification + +MMClassification also provides a wrapper for the PyTorch Image Models (timm) backbone network, users can directly use the backbone network in timm through MMClassification. Suppose you want to use EfficientNet-B1 as the backbone network of RetinaNet, the example config is as the following. + +```python +# https://github.com/open-mmlab/mmdetection/blob/master/configs/timm_example/retinanet_timm_efficientnet_b1_fpn_1x_coco.py + +_base_ = [ + '../_base_/models/retinanet_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +# please install mmcls>=0.20.0 +# import mmcls.models to trigger register_module in mmcls +custom_imports = dict(imports=['mmcls.models'], allow_failed_imports=False) +model = dict( + backbone=dict( + _delete_=True, # Delete the backbone field in _base_ + type='mmcls.TIMMBackbone', # Using timm from mmcls + model_name='efficientnet_b1', + features_only=True, + pretrained=True, + out_indices=(1, 2, 3, 4)), # Modify out_indices + neck=dict(in_channels=[24, 40, 112, 320])) # Modify in_channels + +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) +``` + +`type='mmcls.TIMMBackbone'` means use the `TIMMBackbone` class from MMClassification in MMDetection, and the model used is `EfficientNet-B1`, where `mmcls` means the MMClassification repo and `TIMMBackbone` means the TIMMBackbone wrapper implemented in MMClassification. + +For the principle of the Hierarchy Registry, please refer to the [MMCV document](https://github.com/open-mmlab/mmcv/blob/master/docs/en/understand_mmcv/registry.md#hierarchy-registry). For how to use other backbones in MMClassification, you can refer to the [MMClassification document](https://github.com/open-mmlab/mmclassification/blob/master/docs/en/tutorials/config.md). diff --git a/docs/en/tutorials/index.rst b/docs/en/tutorials/index.rst index fc5ba3aaace..cb9f6eb714e 100644 --- a/docs/en/tutorials/index.rst +++ b/docs/en/tutorials/index.rst @@ -12,3 +12,4 @@ pytorch2onnx.md onnx2tensorrt.md init_cfg.md + how_to.md diff --git a/docs/zh_cn/tutorials/how_to.md b/docs/zh_cn/tutorials/how_to.md new file mode 100644 index 00000000000..af020552644 --- /dev/null +++ b/docs/zh_cn/tutorials/how_to.md @@ -0,0 +1,66 @@ +# 教程 11: How to xxx +本教程收集了任何如何使用 MMDetection 进行 xxx 的答案。 如果您遇到有关`如何做`的问题及答案,请随时更新此文档! 
+
+## 使用 MMClassification 的骨干网络
+
+MMDet、MMCls、MMSeg 中的模型注册表都继承自 MMCV 中的根注册表,允许这些存储库直接使用彼此已经实现的模块。 因此用户可以在 MMDetection 中使用来自 MMClassification 的骨干网络,而无需实现 MMClassification 中已经存在的网络。
+
+### 使用在 MMClassification 中实现的骨干网络
+
+假设想将 `MobileNetV3-small` 作为 `RetinaNet` 的骨干网络,则配置文件如下。
+
+```python
+_base_ = [
+    '../_base_/models/retinanet_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+# please install mmcls>=0.20.0
+# import mmcls.models to trigger register_module in mmcls
+custom_imports = dict(imports=['mmcls.models'], allow_failed_imports=False)
+pretrained = 'https://download.openmmlab.com/mmclassification/v0/mobilenet_v3/convert/mobilenet_v3_small-8427ecf0.pth'
+model = dict(
+    backbone=dict(
+        _delete_=True, # 将 _base_ 中关于 backbone 的字段删除
+        type='mmcls.MobileNetV3', # 使用 mmcls 中的 MobileNetV3
+        arch='small',
+        out_indices=(3, 8, 11), # 修改 out_indices
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint=pretrained,
+            prefix='backbone.')), # MMCls 中骨干网络的预训练权重含有 prefix='backbone.',为了正常加载权重,需要把这个 prefix 去掉。
+    # 修改 in_channels
+    neck=dict(in_channels=[24, 48, 96], start_level=0))
+```
+
+### 通过 MMClassification 使用 TIMM 中实现的骨干网络
+
+由于 MMClassification 提供了 Py**T**orch **Im**age **M**odels (`timm`) 骨干网络的封装,用户也可以通过 MMClassification 直接使用 `timm` 中的骨干网络。假设想将 [`EfficientNet-B1`](https://github.com/open-mmlab/mmdetection/blob/master/configs/timm_example/retinanet_timm_efficientnet_b1_fpn_1x_coco.py) 作为 `RetinaNet` 的骨干网络,则配置文件如下。
+
+```python
+# https://github.com/open-mmlab/mmdetection/blob/master/configs/timm_example/retinanet_timm_efficientnet_b1_fpn_1x_coco.py
+_base_ = [
+    '../_base_/models/retinanet_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+# please install mmcls>=0.20.0
+# import mmcls.models to trigger register_module in mmcls
+custom_imports = dict(imports=['mmcls.models'], allow_failed_imports=False)
+model = dict(
+    backbone=dict(
+        _delete_=True, # 将 _base_ 中关于 backbone 的字段删除
+        type='mmcls.TIMMBackbone', # 使用 mmcls 中 timm 骨干网络
+        model_name='efficientnet_b1',
+        features_only=True,
+        pretrained=True,
+        out_indices=(1, 2, 3, 4)), # 修改 out_indices
+    neck=dict(in_channels=[24, 40, 112, 320])) # 修改 in_channels
+
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
+```
+
+`type='mmcls.TIMMBackbone'` 表示在 MMDetection 中使用 MMClassification 中的 `TIMMBackbone` 类,并且使用的模型为 `EfficientNet-B1`,其中 `mmcls` 表示 MMClassification 库,而 `TIMMBackbone` 表示 MMClassification 中实现的 TIMMBackbone 包装器。
+
+关于层次注册器的具体原理可以参考 [MMCV 文档](https://github.com/open-mmlab/mmcv/blob/master/docs/zh_cn/understand_mmcv/registry.md#%E6%B3%A8%E5%86%8C%E5%99%A8%E5%B1%82%E7%BB%93%E6%9E%84),关于如何使用 MMClassification 中的其他 backbone,可以参考 [MMClassification 文档](https://github.com/open-mmlab/mmclassification/blob/master/docs/zh_CN/tutorials/config.md)。
diff --git a/docs/zh_cn/tutorials/index.rst b/docs/zh_cn/tutorials/index.rst
index 659a5cb6da6..eaf49074007 100644
--- a/docs/zh_cn/tutorials/index.rst
+++ b/docs/zh_cn/tutorials/index.rst
@@ -10,3 +10,5 @@
    finetune.md
    pytorch2onnx.md
    onnx2tensorrt.md
+   init_cfg.md
+   how_to.md

From b857faa7e471ce3cbf19f144b13b421ef542b3c6 Mon Sep 17 00:00:00 2001
From: Osman F Bayram <74963545+osbm@users.noreply.github.com>
Date: Wed, 23 Mar 2022 03:53:52 +0300
Subject: [PATCH 22/42] Update README.md (#7498)

---
 README.md | 8 ++++----
 1 file changed, 4 insertions(+), 4
deletions(-) diff --git a/README.md b/README.md index 98db6045355..d739ac5504e 100644 --- a/README.md +++ b/README.md @@ -2,17 +2,17 @@
 
[README.md hunk body unrecoverable: the original HTML markup for the "OpenMMLab website" (HOT) and "OpenMMLab platform" (TRY IT OUT) header badges was lost in extraction; only the badge link texts survive]
From a03fc76228ee38169cfeb13b8b0fca356e56906e Mon Sep 17 00:00:00 2001 From: Diego Date: Wed, 23 Mar 2022 00:49:47 -0700 Subject: [PATCH 23/42] fix typos (#7510) --- configs/selfsup_pretrain/README.md | 2 +- docs/en/tutorials/config.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/selfsup_pretrain/README.md b/configs/selfsup_pretrain/README.md index 2d9530db695..cff4cfc9b61 100644 --- a/configs/selfsup_pretrain/README.md +++ b/configs/selfsup_pretrain/README.md @@ -43,7 +43,7 @@ To use the ResNet-50 backbone released by SwAV, you can download it from [here]( ### Modify config -The backbone requires SyncBN and the `fronzen_stages` need to be changed. A config that use the moco backbone is as below +The backbone requires SyncBN and the `frozen_stages` need to be changed. A config that use the moco backbone is as below ```python _base_ = [ diff --git a/docs/en/tutorials/config.md b/docs/en/tutorials/config.md index c229cd646b7..71f0540f4b1 100644 --- a/docs/en/tutorials/config.md +++ b/docs/en/tutorials/config.md @@ -104,7 +104,7 @@ model = dict( depth=50, # The depth of backbone, usually it is 50 or 101 for ResNet and ResNext backbones. num_stages=4, # Number of stages of the backbone. out_indices=(0, 1, 2, 3), # The index of output feature maps produced in each stages - frozen_stages=1, # The weights in the first 1 stage are fronzen + frozen_stages=1, # The weights in the first 1 stage are frozen norm_cfg=dict( # The config of normalization layers. type='BN', # Type of norm layer, usually it is BN or GN requires_grad=True), # Whether to train the gamma and beta in BN From eb86b6538f2f52198c34860b24b517f9e78ae13d Mon Sep 17 00:00:00 2001 From: Noah <37148740+bunge-bedstraw-herb@users.noreply.github.com> Date: Wed, 23 Mar 2022 15:53:13 +0800 Subject: [PATCH 24/42] update-mosaic-to-how-to (#7507) --- docs/en/tutorials/how_to.md | 46 ++++++++++++++++++++++++++++++++++ docs/zh_cn/tutorials/how_to.md | 46 ++++++++++++++++++++++++++++++++++ 2 files changed, 92 insertions(+) diff --git a/docs/en/tutorials/how_to.md b/docs/en/tutorials/how_to.md index c2043a46b54..71b6a1de8f9 100644 --- a/docs/en/tutorials/how_to.md +++ b/docs/en/tutorials/how_to.md @@ -66,3 +66,49 @@ optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) `type='mmcls.TIMMBackbone'` means use the `TIMMBackbone` class from MMClassification in MMDetection, and the model used is `EfficientNet-B1`, where `mmcls` means the MMClassification repo and `TIMMBackbone` means the TIMMBackbone wrapper implemented in MMClassification. For the principle of the Hierarchy Registry, please refer to the [MMCV document](https://github.com/open-mmlab/mmcv/blob/master/docs/en/understand_mmcv/registry.md#hierarchy-registry). For how to use other backbones in MMClassification, you can refer to the [MMClassification document](https://github.com/open-mmlab/mmclassification/blob/master/docs/en/tutorials/config.md). + +## Use Mosaic augmentation + +If you want to use `Mosaic` in training, please make sure that you use `MultiImageMixDataset` at the same time. 
Taking the 'Faster R-CNN' algorithm as an example, you should modify the values of `train_pipeline` and `train_dataset` in the config as below:
+
+```python
+# Open configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py directly and add the following fields
+data_root = 'data/coco/'
+dataset_type = 'CocoDataset'
+img_scale=(1333, 800)
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+
+train_pipeline = [
+    dict(type='Mosaic', img_scale=img_scale, pad_val=114.0),
+    dict(
+        type='RandomAffine',
+        scaling_ratio_range=(0.1, 2),
+        border=(-img_scale[0] // 2, -img_scale[1] // 2)), # The image will be enlarged by 4 times after Mosaic processing, so we use affine transformation to restore the image size.
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels'])
+]
+
+train_dataset = dict(
+    _delete_ = True, # remove unnecessary settings
+    type='MultiImageMixDataset',
+    dataset=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_train2017.json',
+        img_prefix=data_root + 'train2017/',
+        pipeline=[
+            dict(type='LoadImageFromFile'),
+            dict(type='LoadAnnotations', with_bbox=True)
+        ],
+        filter_empty_gt=False,
+    ),
+    pipeline=train_pipeline
+    )
+
+data = dict(
+    train=train_dataset
+    )
+```
diff --git a/docs/zh_cn/tutorials/how_to.md b/docs/zh_cn/tutorials/how_to.md
index af020552644..2fa749cc6a4 100644
--- a/docs/zh_cn/tutorials/how_to.md
+++ b/docs/zh_cn/tutorials/how_to.md
@@ -64,3 +64,49 @@ optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
 `type='mmcls.TIMMBackbone'` 表示在 MMDetection 中使用 MMClassification 中的 `TIMMBackbone` 类,并且使用的模型为 `EfficientNet-B1`,其中 `mmcls` 表示 MMClassification 库,而 `TIMMBackbone` 表示 MMClassification 中实现的 TIMMBackbone 包装器。
 
 关于层次注册器的具体原理可以参考 [MMCV 文档](https://github.com/open-mmlab/mmcv/blob/master/docs/zh_cn/understand_mmcv/registry.md#%E6%B3%A8%E5%86%8C%E5%99%A8%E5%B1%82%E7%BB%93%E6%9E%84),关于如何使用 MMClassification 中的其他 backbone,可以参考 [MMClassification 文档](https://github.com/open-mmlab/mmclassification/blob/master/docs/zh_CN/tutorials/config.md)。
+
+## 使用马赛克数据增强
+
+如果你想在训练中使用 `Mosaic`,那么请确保你同时使用 `MultiImageMixDataset`。以 `Faster R-CNN` 算法为例,你可以通过如下做法实现:
+
+```python
+# 直接打开 configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py,增添如下字段
+data_root = 'data/coco/'
+dataset_type = 'CocoDataset'
+img_scale=(1333, 800)
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+
+train_pipeline = [
+    dict(type='Mosaic', img_scale=img_scale, pad_val=114.0),
+    dict(
+        type='RandomAffine',
+        scaling_ratio_range=(0.1, 2),
+        border=(-img_scale[0] // 2, -img_scale[1] // 2)), # 图像经过马赛克处理后会放大 4 倍,所以我们使用仿射变换来恢复图像的大小。
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels'])
+]
+
+train_dataset = dict(
+    _delete_ = True, # 删除不必要的设置
+    type='MultiImageMixDataset',
+    dataset=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_train2017.json',
+        img_prefix=data_root + 'train2017/',
+        pipeline=[
+            dict(type='LoadImageFromFile'),
+            dict(type='LoadAnnotations', with_bbox=True)
+        ],
+        filter_empty_gt=False,
+    ),
+    pipeline=train_pipeline
+    )
+
+data = dict(
+    train=train_dataset
+    )
+```
From
fb0ca872896e9d7430d9a0eb19e56b70f54594ff Mon Sep 17 00:00:00 2001 From: BigDong Date: Wed, 23 Mar 2022 22:57:21 +0800 Subject: [PATCH 25/42] [Fix] Fix fpg link (#7478) --- configs/fpg/README.md | 14 ++++++++------ configs/fpg/metafile.yml | 22 +++++++++++----------- 2 files changed, 19 insertions(+), 17 deletions(-) diff --git a/configs/fpg/README.md b/configs/fpg/README.md index 3e884fb74a4..9d89510fa57 100644 --- a/configs/fpg/README.md +++ b/configs/fpg/README.md @@ -19,12 +19,14 @@ All backbones are Resnet-50 in pytorch style. | Method | Neck | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | |:------------:|:-----------:|:-------:|:--------:|:--------------:|:------:|:-------:|:-------:|:--------:| -| Faster R-CNN | FPG | 50e | 20.0 | - | 42.2 | - |[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/fpg/faster_rcnn_r50_fpg_crop640_50e_coco.py) |[model](https://download.openmmlab.com/mmdetection/v2.0/fpg/faster_rcnn_r50_fpg_crop640_50e_coco/faster_rcnn_r50_fpg_crop640_50e_coco-76220505.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/fpg/faster_rcnn_r50_fpg_crop640_50e_coco/20210218_223520.log.json) | -| Faster R-CNN | FPG-chn128 | 50e | 11.9 | - | 41.2 | - |[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/fpg/faster_rcnn_r50_fpg-chn128_crop640_50e_coco.py) |[model](https://download.openmmlab.com/mmdetection/v2.0/fpg/faster_rcnn_r50_fpg-chn128_crop640_50e_coco/faster_rcnn_r50_fpg-chn128_crop640_50e_coco-24257de9.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/fpg/faster_rcnn_r50_fpg-chn128_crop640_50e_coco/20210218_221412.log.json) | -| Mask R-CNN | FPG | 50e | 23.2 | - | 42.7 | 37.8 |[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/fpg/mask_rcnn_r50_fpg_crop640_50e_coco.py) |[model](https://download.openmmlab.com/mmdetection/v2.0/fpg/mask_rcnn_r50_fpg_crop640_50e_coco/mask_rcnn_r50_fpg_crop640_50e_coco-c5860453.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/fpg/mask_rcnn_r50_fpg_crop640_50e_coco/20210222_205447.log.json) | -| Mask R-CNN | FPG-chn128 | 50e | 15.3 | - | 41.7 | 36.9 |[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/fpg/mask_rcnn_r50_fpg-chn128_crop640_50e_coco.py) |[model](https://download.openmmlab.com/mmdetection/v2.0/fpg/mask_rcnn_r50_fpg-chn128_crop640_50e_coco/mask_rcnn_r50_fpg-chn128_crop640_50e_coco-5c6ea10d.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/fpg/mask_rcnn_r50_fpg-chn128_crop640_50e_coco/20210223_025039.log.json) | -| RetinaNet | FPG | 50e | 20.8 | - | 40.5 | - |[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/fpg/retinanet_r50_fpg_crop640_50e_coco.py) |[model](https://download.openmmlab.com/mmdetection/v2.0/fpg/retinanet_r50_fpg_crop640_50e_coco/retinanet_r50_fpg_crop640_50e_coco-46fdd1c6.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/fpg/retinanet_r50_fpg_crop640_50e_coco/20210225_143957.log.json) | -| RetinaNet | FPG-chn128 | 50e | 19.9 | - | 40.3 | - |[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/fpg/retinanet_r50_fpg-chn128_crop640_50e_coco.py) |[model](https://download.openmmlab.com/mmdetection/v2.0/fpg/retinanet_r50_fpg-chn128_crop640_50e_coco/retinanet_r50_fpg-chn128_crop640_50e_coco-5cf33c76.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/fpg/retinanet_r50_fpg-chn128_crop640_50e_coco/20210225_184328.log.json) | +| Faster R-CNN | FPG | 50e | 20.0 | - | 42.3 | - 
|[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/fpg/faster_rcnn_r50_fpg_crop640_50e_coco.py) |[model](https://download.openmmlab.com/mmdetection/v2.0/fpg/faster_rcnn_r50_fpg_crop640_50e_coco/faster_rcnn_r50_fpg_crop640_50e_coco_20220311_011856-74109f42.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/fpg/faster_rcnn_r50_fpg_crop640_50e_coco/faster_rcnn_r50_fpg_crop640_50e_coco_20220311_011856.log.json) | +| Faster R-CNN | FPG-chn128 | 50e | 11.9 | - | 41.2 | - |[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/fpg/faster_rcnn_r50_fpg-chn128_crop640_50e_coco.py) |[model](https://download.openmmlab.com/mmdetection/v2.0/fpg/faster_rcnn_r50_fpg-chn128_crop640_50e_coco/faster_rcnn_r50_fpg-chn128_crop640_50e_coco_20220311_011857-9376aa9d.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/fpg/faster_rcnn_r50_fpg-chn128_crop640_50e_coco/faster_rcnn_r50_fpg-chn128_crop640_50e_coco_20220311_011857.log.json) | +| Faster R-CNN | FPN | 50e | 20.0 | - | 38.9 | - |[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/fpg/faster_rcnn_r50_fpn_crop640_50e_coco.py) |[model](https://download.openmmlab.com/mmdetection/v2.0/fpg/faster_rcnn_r50_fpn_crop640_50e_coco/faster_rcnn_r50_fpn_crop640_50e_coco_20220311_011857-be7c9f42.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/fpg/faster_rcnn_r50_fpn_crop640_50e_coco/faster_rcnn_r50_fpn_crop640_50e_coco_20220311_011857.log.json) | +| Mask R-CNN | FPG | 50e | 23.2 | - | 43.0 | 38.1 |[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/fpg/mask_rcnn_r50_fpg_crop640_50e_coco.py) |[model](https://download.openmmlab.com/mmdetection/v2.0/fpg/mask_rcnn_r50_fpg_crop640_50e_coco/mask_rcnn_r50_fpg_crop640_50e_coco_20220311_011857-233b8334.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/fpg/mask_rcnn_r50_fpg_crop640_50e_coco/mask_rcnn_r50_fpg_crop640_50e_coco_20220311_011857.log.json) | +| Mask R-CNN | FPG-chn128 | 50e | 15.3 | - | 41.7 | 37.1 |[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/fpg/mask_rcnn_r50_fpg-chn128_crop640_50e_coco.py) |[model](https://download.openmmlab.com/mmdetection/v2.0/fpg/mask_rcnn_r50_fpg-chn128_crop640_50e_coco/mask_rcnn_r50_fpg-chn128_crop640_50e_coco_20220311_011859-043c9b4e.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/fpg/mask_rcnn_r50_fpg-chn128_crop640_50e_coco/mask_rcnn_r50_fpg-chn128_crop640_50e_coco_20220311_011859.log.json) | +| Mask R-CNN | FPN | 50e | 23.2 | - | 49.6 | 35.6 |[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/fpg/mask_rcnn_r50_fpn_crop640_50e_coco.py) |[model](https://download.openmmlab.com/mmdetection/v2.0/fpg/mask_rcnn_r50_fpn_crop640_50e_coco/mask_rcnn_r50_fpn_crop640_50e_coco_20220311_011855-a756664a.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/fpg/mask_rcnn_r50_fpn_crop640_50e_coco/mask_rcnn_r50_fpn_crop640_50e_coco_20220311_011855.log.json) | +| RetinaNet | FPG | 50e | 20.8 | - | 40.5 | - |[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/fpg/retinanet_r50_fpg_crop640_50e_coco.py) |[model](https://download.openmmlab.com/mmdetection/v2.0/fpg/retinanet_r50_fpg_crop640_50e_coco/retinanet_r50_fpg_crop640_50e_coco_20220311_110809-b0bcf5f4.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/fpg/retinanet_r50_fpg_crop640_50e_coco/retinanet_r50_fpg_crop640_50e_coco_20220311_110809.log.json) | +| RetinaNet | FPG-chn128 | 50e | 19.9 | - | 
39.9 | - |[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/fpg/retinanet_r50_fpg-chn128_crop640_50e_coco.py) |[model](https://download.openmmlab.com/mmdetection/v2.0/fpg/retinanet_r50_fpg-chn128_crop640_50e_coco/retinanet_r50_fpg-chn128_crop640_50e_coco_20220313_104829-ee99a686.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/fpg/retinanet_r50_fpg-chn128_crop640_50e_coco/retinanet_r50_fpg-chn128_crop640_50e_coco_20220313_104829.log.json) | **Note**: Chn128 means to decrease the number of channels of features and convs from 256 (default) to 128 in Neck and BBox Head, which can greatly decrease memory consumption without sacrificing much precision. diff --git a/configs/fpg/metafile.yml b/configs/fpg/metafile.yml index 885d8573631..6b0a6a796d3 100644 --- a/configs/fpg/metafile.yml +++ b/configs/fpg/metafile.yml @@ -27,8 +27,8 @@ Models: - Task: Object Detection Dataset: COCO Metrics: - box AP: 42.2 - Weights: https://download.openmmlab.com/mmdetection/v2.0/fpg/faster_rcnn_r50_fpg_crop640_50e_coco/faster_rcnn_r50_fpg_crop640_50e_coco-76220505.pth + box AP: 42.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/fpg/faster_rcnn_r50_fpg_crop640_50e_coco/faster_rcnn_r50_fpg_crop640_50e_coco_20220311_011856-74109f42.pth - Name: faster_rcnn_r50_fpg-chn128_crop640_50e_coco In Collection: Feature Pyramid Grids @@ -41,7 +41,7 @@ Models: Dataset: COCO Metrics: box AP: 41.2 - Weights: https://download.openmmlab.com/mmdetection/v2.0/fpg/faster_rcnn_r50_fpg-chn128_crop640_50e_coco/faster_rcnn_r50_fpg-chn128_crop640_50e_coco-24257de9.pth + Weights: https://download.openmmlab.com/mmdetection/v2.0/fpg/faster_rcnn_r50_fpg-chn128_crop640_50e_coco/faster_rcnn_r50_fpg-chn128_crop640_50e_coco_20220311_011857-9376aa9d.pth - Name: mask_rcnn_r50_fpg_crop640_50e_coco In Collection: Feature Pyramid Grids @@ -53,12 +53,12 @@ Models: - Task: Object Detection Dataset: COCO Metrics: - box AP: 42.7 + box AP: 43.0 - Task: Instance Segmentation Dataset: COCO Metrics: - mask AP: 37.8 - Weights: https://download.openmmlab.com/mmdetection/v2.0/fpg/mask_rcnn_r50_fpg_crop640_50e_coco/mask_rcnn_r50_fpg_crop640_50e_coco-c5860453.pth + mask AP: 38.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/fpg/mask_rcnn_r50_fpg_crop640_50e_coco/mask_rcnn_r50_fpg_crop640_50e_coco_20220311_011857-233b8334.pth - Name: mask_rcnn_r50_fpg-chn128_crop640_50e_coco In Collection: Feature Pyramid Grids @@ -74,8 +74,8 @@ Models: - Task: Instance Segmentation Dataset: COCO Metrics: - mask AP: 36.9 - Weights: https://download.openmmlab.com/mmdetection/v2.0/fpg/mask_rcnn_r50_fpg-chn128_crop640_50e_coco/mask_rcnn_r50_fpg-chn128_crop640_50e_coco-5c6ea10d.pth + mask AP: 37.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/fpg/mask_rcnn_r50_fpg-chn128_crop640_50e_coco/mask_rcnn_r50_fpg-chn128_crop640_50e_coco_20220311_011859-043c9b4e.pth - Name: retinanet_r50_fpg_crop640_50e_coco In Collection: Feature Pyramid Grids @@ -88,7 +88,7 @@ Models: Dataset: COCO Metrics: box AP: 40.5 - Weights: https://download.openmmlab.com/mmdetection/v2.0/fpg/retinanet_r50_fpg_crop640_50e_coco/retinanet_r50_fpg_crop640_50e_coco-46fdd1c6.pth + Weights: https://download.openmmlab.com/mmdetection/v2.0/fpg/retinanet_r50_fpg_crop640_50e_coco/retinanet_r50_fpg_crop640_50e_coco_20220311_110809-b0bcf5f4.pth - Name: retinanet_r50_fpg-chn128_crop640_50e_coco In Collection: Feature Pyramid Grids @@ -100,5 +100,5 @@ Models: - Task: Object Detection Dataset: COCO Metrics: - box AP: 40.3 - Weights: 
https://download.openmmlab.com/mmdetection/v2.0/fpg/retinanet_r50_fpg-chn128_crop640_50e_coco/retinanet_r50_fpg-chn128_crop640_50e_coco-5cf33c76.pth + box AP: 39.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/fpg/retinanet_r50_fpg-chn128_crop640_50e_coco/retinanet_r50_fpg-chn128_crop640_50e_coco_20220313_104829-ee99a686.pth From f2552e0421de1dd4fd8dc5c298001fe3cdf0ecb9 Mon Sep 17 00:00:00 2001 From: Noah <37148740+bunge-bedstraw-herb@users.noreply.github.com> Date: Wed, 23 Mar 2022 22:58:23 +0800 Subject: [PATCH 26/42] [Doc] Mosaic documentation enhancements (#7470) * Mosaic documentation enhancements - Added Mosaic and MultiImageMixDataset must be used together in FAQ document - Added the demo configuration of Faster R-CNN using Mosaic * Mosaic doc enhancements correct some errors as suggested * Remove unnecessary Settings a little change * update some details * update-mosaic-to-how-to * Mosaic documentation enhancements - Added Mosaic and MultiImageMixDataset must be used together in FAQ document - Added the demo configuration of Faster R-CNN using Mosaic * Mosaic doc enhancements correct some errors as suggested * Remove unnecessary Settings a little change * update some details * rebase dev branch delete the mosaic in fap.md since it is in how_to.md * rebase dev * Mosaic documentation enhancements - Added Mosaic and MultiImageMixDataset must be used together in FAQ document - Added the demo configuration of Faster R-CNN using Mosaic * Mosaic doc enhancements correct some errors as suggested * Remove unnecessary Settings a little change * update some details * rebase dev branch delete the mosaic in fap.md since it is in how_to.md * Mosaic documentation enhancements - Added Mosaic and MultiImageMixDataset must be used together in FAQ document - Added the demo configuration of Faster R-CNN using Mosaic * Mosaic doc enhancements correct some errors as suggested * Remove unnecessary Settings a little change * update some details * rebase dev --- docs/en/faq.md | 2 +- docs/zh_cn/faq.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/en/faq.md b/docs/en/faq.md index 6ebc7d64106..e656ecd8dc4 100644 --- a/docs/en/faq.md +++ b/docs/en/faq.md @@ -75,7 +75,7 @@ We list some common troubles faced by many users and their corresponding solutio 3. Extend the warmup iterations: some models are sensitive to the learning rate at the start of the training. You can extend the warmup iterations, e.g., change the `warmup_iters` from 500 to 1000 or 2000. 4. Add gradient clipping: some models requires gradient clipping to stabilize the training process. The default of `grad_clip` is `None`, you can add gradient clippint to avoid gradients that are too large, i.e., set `optimizer_config=dict(_delete_=True, grad_clip=dict(max_norm=35, norm_type=2))` in your config file. If your config does not inherits from any basic config that contains `optimizer_config=dict(grad_clip=None)`, you can simply add `optimizer_config=dict(grad_clip=dict(max_norm=35, norm_type=2))`. -- ’GPU out of memory" +- "GPU out of memory" 1. There are some scenarios when there are large amount of ground truth boxes, which may cause OOM during target assignment. You can set `gpu_assign_thr=N` in the config of assigner thus the assigner will calculate box overlaps through CPU when there are more than N GT boxes. 2. Set `with_cp=True` in the backbone. This uses the sublinear strategy in PyTorch to reduce GPU memory cost in the backbone. 3. 
Try mixed precision training following the examples in `config/fp16`. The `loss_scale` might need further tuning for different models.

diff --git a/docs/zh_cn/faq.md b/docs/zh_cn/faq.md
index 1e5bcd9ee67..3376ce74830 100644
--- a/docs/zh_cn/faq.md
+++ b/docs/zh_cn/faq.md
@@ -77,7 +77,7 @@
     3. 延长 warm up 的时间:一些模型在训练初始时对学习率很敏感,您可以把 `warmup_iters` 从 500 更改为 1000 或 2000。
     4. 添加 gradient clipping: 一些模型需要梯度裁剪来稳定训练过程。 默认的 `grad_clip` 是 `None`, 你可以在 config 设置 `optimizer_config=dict(_delete_=True, grad_clip=dict(max_norm=35, norm_type=2))` 如果你的 config 没有继承任何包含 `optimizer_config=dict(grad_clip=None)`, 你可以直接设置`optimizer_config=dict(grad_clip=dict(max_norm=35, norm_type=2))`.
 
-’GPU out of memory"
+"GPU out of memory"
    1. 存在大量 ground truth boxes 或者大量 anchor 的场景,可能在 assigner 会 OOM。 您可以在 assigner 的配置中设置 `gpu_assign_thr=N`,这样当超过 N 个 GT boxes 时,assigner 会通过 CPU 计算 IOU。
    2. 在 backbone 中设置 `with_cp=True`。 这使用 PyTorch 中的 `sublinear strategy` 来降低 backbone 占用的 GPU 显存。
    3. 使用 `config/fp16` 中的示例尝试混合精度训练。`loss_scale` 可能需要针对不同模型进行调整。

From 04906e8623f53b556ab4d6ad61733e2959a43ee9 Mon Sep 17 00:00:00 2001
From: Youth-Got <53550307+Youth-Got@users.noreply.github.com>
Date: Thu, 24 Mar 2022 19:06:59 +0800
Subject: [PATCH 27/42] Bug fix: RuntimeError: Expected all tensors to be on
 the same device, but found at least two devices, cuda:0 and cuda:2! (#7521)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fixes a bug where the anchors computed during FreeAnchor RetinaNet training could land on a different CUDA device from `pre_bboxes`.
---
 mmdet/models/dense_heads/free_anchor_retina_head.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mmdet/models/dense_heads/free_anchor_retina_head.py b/mmdet/models/dense_heads/free_anchor_retina_head.py
index 644b7b37adc..fa4238974da 100644
--- a/mmdet/models/dense_heads/free_anchor_retina_head.py
+++ b/mmdet/models/dense_heads/free_anchor_retina_head.py
@@ -78,8 +78,8 @@ def loss(self,
         """
         featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
         assert len(featmap_sizes) == self.prior_generator.num_levels
-
-        anchor_list, _ = self.get_anchors(featmap_sizes, img_metas)
+        device = cls_scores[0].device
+        anchor_list, _ = self.get_anchors(featmap_sizes, img_metas, device=device)
 
         anchors = [torch.cat(anchor) for anchor in anchor_list]
         # concatenate each level

From 1a9087b4df2d071032399ecf574870eb92efeb68 Mon Sep 17 00:00:00 2001
From: jbwang1997
Date: Thu, 24 Mar 2022 19:32:34 +0800
Subject: [PATCH 28/42] [Feature] Torchvision high-precision ResNet model
 (#7489)

* Update tnr model

* Update README

* Update README

* Update comments

* Update metafile.yml

* Update model URL
---
 configs/faster_rcnn/README.md                 |  8 ++++++++
 ...aster_rcnn_r50_fpn_tnr-pretrain_1x_coco.py | 17 ++++++++++++++++
 configs/faster_rcnn/metafile.yml              | 20 +++++++++++++++++++
 3 files changed, 45 insertions(+)
 create mode 100644 configs/faster_rcnn/faster_rcnn_r50_fpn_tnr-pretrain_1x_coco.py

diff --git a/configs/faster_rcnn/README.md b/configs/faster_rcnn/README.md
index 66eb3e4c072..5fd1cec45dd 100644
--- a/configs/faster_rcnn/README.md
+++ b/configs/faster_rcnn/README.md
@@ -64,6 +64,14 @@ We further finetune some pre-trained models on the COCO subsets, which only cont
 | [R-50-FPN](./faster_rcnn_r50_caffe_fpn_mstrain_1x_coco-person.py) | caffe | person | [R-50-FPN-Caffe-3x](./faster_rcnn_r50_caffe_fpn_mstrain_3x_coco.py) | 3.7 | 55.8 | [config](./faster_rcnn_r50_caffe_fpn_mstrain_1x_coco-person.py) |
[model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco-person/faster_rcnn_r50_fpn_1x_coco-person_20201216_175929-d022e227.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco-person/faster_rcnn_r50_fpn_1x_coco-person_20201216_175929.log.json) | | [R-50-FPN](./faster_rcnn_r50_caffe_fpn_mstrain_1x_coco-person-bicycle-car.py) | caffe | person-bicycle-car | [R-50-FPN-Caffe-3x](./faster_rcnn_r50_caffe_fpn_mstrain_3x_coco.py) | 3.7 | 44.1 | [config](./faster_rcnn_r50_caffe_fpn_mstrain_1x_coco-person-bicycle-car.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco-person-bicycle-car/faster_rcnn_r50_fpn_1x_coco-person-bicycle-car_20201216_173117-6eda6d92.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco-person-bicycle-car/faster_rcnn_r50_fpn_1x_coco-person-bicycle-car_20201216_173117.log.json) | +## Torchvision New Receipe (TNR) + +Torchvision released its high-precision ResNet models. The training details can be found on the [Pytorch website](https://pytorch.org/blog/how-to-train-state-of-the-art-models-using-torchvision-latest-primitives/). Here, we have done grid searches on learning rate and weight decay and found the optimal hyper-parameter on the detection task. + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :-------------: | :-----: | :-----: | :------: | :------------: | :----: | :------: | :--------: | +| [R-50-TNR](./faster_rcnn_r50_fpn_tnr-pretrain_1x_coco.py) | pytorch | 1x | - | | 40.2 | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/faster_rcnn/faster_rcnn_r50_fpn_tnr-pretrain_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_tnr-pretrain_1x_coco/faster_rcnn_r50_fpn_tnr-pretrain_1x_coco_20220320_085147-efedfda4.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_tnr-pretrain_1x_coco/faster_rcnn_r50_fpn_tnr-pretrain_1x_coco_20220320_085147-efedfda4.log.json) | + ## Citation ```latex diff --git a/configs/faster_rcnn/faster_rcnn_r50_fpn_tnr-pretrain_1x_coco.py b/configs/faster_rcnn/faster_rcnn_r50_fpn_tnr-pretrain_1x_coco.py new file mode 100644 index 00000000000..ecbfb928d8a --- /dev/null +++ b/configs/faster_rcnn/faster_rcnn_r50_fpn_tnr-pretrain_1x_coco.py @@ -0,0 +1,17 @@ +_base_ = [ + '../_base_/models/faster_rcnn_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +checkpoint = 'https://download.pytorch.org/models/resnet50-11ad3fa6.pth' +model = dict( + backbone=dict(init_cfg=dict(type='Pretrained', checkpoint=checkpoint))) + +# `lr` and `weight_decay` have been searched to be optimal. 
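+# In `paramwise_cfg` below, `norm_decay_mult=0.` disables weight decay for
+# normalization-layer parameters, and `bypass_duplicate=True` keeps
+# parameters shared between modules from being added to the optimizer twice.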
+optimizer = dict( + _delete_=True, + type='AdamW', + lr=0.0001, + weight_decay=0.1, + paramwise_cfg=dict(norm_decay_mult=0., bypass_duplicate=True)) diff --git a/configs/faster_rcnn/metafile.yml b/configs/faster_rcnn/metafile.yml index 5178a3b47ad..063b0762ab3 100644 --- a/configs/faster_rcnn/metafile.yml +++ b/configs/faster_rcnn/metafile.yml @@ -405,3 +405,23 @@ Models: Metrics: box AP: 43.1 Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_x101_64x4d_fpn_mstrain_3x_coco/faster_rcnn_x101_64x4d_fpn_mstrain_3x_coco_20210524_124528-26c63de6.pth + + - Name: faster_rcnn_r50_fpn_tnr-pretrain_1x_coco + In Collection: Faster R-CNN + Config: configs/faster_rcnn/faster_rcnn_r50_fpn_tnr-pretrain_1x_coco.py + Metadata: + Training Memory (GB): 4.0 + inference time (ms/im): + - value: 46.73 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_tnr-pretrain_1x_coco/faster_rcnn_r50_fpn_tnr-pretrain_1x_coco_20220320_085147-efedfda4.pth From 0932ab787d58eead15b5f823fbcca5351ceb90f7 Mon Sep 17 00:00:00 2001 From: Cedric Luo Date: Fri, 25 Mar 2022 17:13:55 +0800 Subject: [PATCH 29/42] add msdeformattn pixel decoder (#7466) fix typo rm img_metas rename in pixel_decoder update comments rename fix typo generae points with MlvlPointGenerator --- mmdet/models/plugins/__init__.py | 6 +- .../plugins/msdeformattn_pixel_decoder.py | 269 ++++++++++++++++++ mmdet/models/plugins/pixel_decoder.py | 20 +- tests/test_models/test_plugins.py | 60 +++- 4 files changed, 342 insertions(+), 13 deletions(-) create mode 100644 mmdet/models/plugins/msdeformattn_pixel_decoder.py diff --git a/mmdet/models/plugins/__init__.py b/mmdet/models/plugins/__init__.py index 940d94e884a..a455c07bb99 100644 --- a/mmdet/models/plugins/__init__.py +++ b/mmdet/models/plugins/__init__.py @@ -1,5 +1,9 @@ # Copyright (c) OpenMMLab. All rights reserved. from .dropblock import DropBlock +from .msdeformattn_pixel_decoder import MSDeformAttnPixelDecoder from .pixel_decoder import PixelDecoder, TransformerEncoderPixelDecoder -__all__ = ['DropBlock', 'PixelDecoder', 'TransformerEncoderPixelDecoder'] +__all__ = [ + 'DropBlock', 'PixelDecoder', 'TransformerEncoderPixelDecoder', + 'MSDeformAttnPixelDecoder' +] diff --git a/mmdet/models/plugins/msdeformattn_pixel_decoder.py b/mmdet/models/plugins/msdeformattn_pixel_decoder.py new file mode 100644 index 00000000000..d553582baef --- /dev/null +++ b/mmdet/models/plugins/msdeformattn_pixel_decoder.py @@ -0,0 +1,269 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import (PLUGIN_LAYERS, Conv2d, ConvModule, caffe2_xavier_init, + normal_init, xavier_init) +from mmcv.cnn.bricks.transformer import (build_positional_encoding, + build_transformer_layer_sequence) +from mmcv.runner import BaseModule, ModuleList + +from mmdet.core.anchor import MlvlPointGenerator +from mmdet.models.utils.transformer import MultiScaleDeformableAttention + + +@PLUGIN_LAYERS.register_module() +class MSDeformAttnPixelDecoder(BaseModule): + """Pixel decoder with multi-scale deformable attention. + + Args: + in_channels (list[int] | tuple[int]): Number of channels in the + input feature maps. + strides (list[int] | tuple[int]): Output strides of feature from + backbone. 
+ feat_channels (int): Number of channels for feature. + out_channels (int): Number of channels for output. + num_outs (int): Number of output scales. + norm_cfg (:obj:`mmcv.ConfigDict` | dict): Config for normalization. + Defaults to dict(type='GN', num_groups=32). + act_cfg (:obj:`mmcv.ConfigDict` | dict): Config for activation. + Defaults to dict(type='ReLU'). + encoder (:obj:`mmcv.ConfigDict` | dict): Config for transformer + encoder. Defaults to `DetrTransformerEncoder`. + positional_encoding (:obj:`mmcv.ConfigDict` | dict): Config for + transformer encoder position encoding. Defaults to + dict(type='SinePositionalEncoding', num_feats=128, + normalize=True). + init_cfg (:obj:`mmcv.ConfigDict` | dict): Initialization config dict. + """ + + def __init__(self, + in_channels=[256, 512, 1024, 2048], + strides=[4, 8, 16, 32], + feat_channels=256, + out_channels=256, + num_outs=3, + norm_cfg=dict(type='GN', num_groups=32), + act_cfg=dict(type='ReLU'), + encoder=dict( + type='DetrTransformerEncoder', + num_layers=6, + transformerlayers=dict( + type='BaseTransformerLayer', + attn_cfgs=dict( + type='MultiScaleDeformableAttention', + embed_dims=256, + num_heads=8, + num_levels=3, + num_points=4, + im2col_step=64, + dropout=0.0, + batch_first=False, + norm_cfg=None, + init_cfg=None), + feedforward_channels=1024, + ffn_dropout=0.0, + operation_order=('self_attn', 'norm', 'ffn', 'norm')), + init_cfg=None), + positional_encoding=dict( + type='SinePositionalEncoding', + num_feats=128, + normalize=True), + init_cfg=None): + super().__init__(init_cfg=init_cfg) + self.strides = strides + self.num_input_levels = len(in_channels) + self.num_encoder_levels = \ + encoder.transformerlayers.attn_cfgs.num_levels + assert self.num_encoder_levels >= 1, \ + 'num_levels in attn_cfgs must be at least one' + input_conv_list = [] + # from top to down (low to high resolution) + for i in range(self.num_input_levels - 1, + self.num_input_levels - self.num_encoder_levels - 1, + -1): + input_conv = ConvModule( + in_channels[i], + feat_channels, + kernel_size=1, + norm_cfg=norm_cfg, + act_cfg=None, + bias=True) + input_conv_list.append(input_conv) + self.input_convs = ModuleList(input_conv_list) + + self.encoder = build_transformer_layer_sequence(encoder) + self.postional_encoding = build_positional_encoding( + positional_encoding) + # high resolution to low resolution + self.level_encoding = nn.Embedding(self.num_encoder_levels, + feat_channels) + + # fpn-like structure + self.lateral_convs = ModuleList() + self.output_convs = ModuleList() + self.use_bias = norm_cfg is None + # from top to down (low to high resolution) + # fpn for the rest features that didn't pass in encoder + for i in range(self.num_input_levels - self.num_encoder_levels - 1, -1, + -1): + lateral_conv = ConvModule( + in_channels[i], + feat_channels, + kernel_size=1, + bias=self.use_bias, + norm_cfg=norm_cfg, + act_cfg=None) + output_conv = ConvModule( + feat_channels, + feat_channels, + kernel_size=3, + stride=1, + padding=1, + bias=self.use_bias, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.lateral_convs.append(lateral_conv) + self.output_convs.append(output_conv) + + self.mask_feature = Conv2d( + feat_channels, out_channels, kernel_size=1, stride=1, padding=0) + + self.num_outs = num_outs + self.point_generator = MlvlPointGenerator(strides) + + def init_weights(self): + """Initialize weights.""" + for i in range(0, self.num_encoder_levels): + xavier_init( + self.input_convs[i].conv, + gain=1, + bias=0, + distribution='uniform') + + for i in range(0, 
self.num_input_levels - self.num_encoder_levels): + caffe2_xavier_init(self.lateral_convs[i].conv, bias=0) + caffe2_xavier_init(self.output_convs[i].conv, bias=0) + + caffe2_xavier_init(self.mask_feature, bias=0) + + normal_init(self.level_encoding, mean=0, std=1) + for p in self.encoder.parameters(): + if p.dim() > 1: + nn.init.xavier_normal_(p) + + # init_weights defined in MultiScaleDeformableAttention + for layer in self.encoder.layers: + for attn in layer.attentions: + if isinstance(attn, MultiScaleDeformableAttention): + attn.init_weights() + + def forward(self, feats): + """ + Args: + feats (list[Tensor]): Feature maps of each level. Each has + shape of (batch_size, c, h, w). + + Returns: + tuple: A tuple containing the following: + + - mask_feature (Tensor): shape (batch_size, c, h, w). + - multi_scale_features (list[Tensor]): Multi scale \ + features, each in shape (batch_size, c, h, w). + """ + # generate padding mask for each level, for each image + batch_size = feats[0].shape[0] + encoder_input_list = [] + padding_mask_list = [] + level_positional_encoding_list = [] + spatial_shapes = [] + reference_points_list = [] + for i in range(self.num_encoder_levels): + level_idx = self.num_input_levels - i - 1 + feat = feats[level_idx] + feat_projected = self.input_convs[i](feat) + h, w = feat.shape[-2:] + + # no padding + padding_mask_resized = feat.new_zeros( + (batch_size, ) + feat.shape[-2:], dtype=torch.bool) + pos_embed = self.postional_encoding(padding_mask_resized) + level_embed = self.level_encoding.weight[i] + level_pos_embed = level_embed.view(1, -1, 1, 1) + pos_embed + # (h_i * w_i, 2) + reference_points = self.point_generator.single_level_grid_priors( + feat.shape[-2:], level_idx, device=feat.device) + # normalize + factor = feat.new_tensor([[w, h]]) * self.strides[level_idx] + reference_points = reference_points / factor + + # shape (batch_size, c, h_i, w_i) -> (h_i * w_i, batch_size, c) + feat_projected = feat_projected.flatten(2).permute(2, 0, 1) + level_pos_embed = level_pos_embed.flatten(2).permute(2, 0, 1) + padding_mask_resized = padding_mask_resized.flatten(1) + + encoder_input_list.append(feat_projected) + padding_mask_list.append(padding_mask_resized) + level_positional_encoding_list.append(level_pos_embed) + spatial_shapes.append(feat.shape[-2:]) + reference_points_list.append(reference_points) + # shape (batch_size, total_num_query), + # total_num_query=sum([., h_i * w_i,.]) + padding_masks = torch.cat(padding_mask_list, dim=1) + # shape (total_num_query, batch_size, c) + encoder_inputs = torch.cat(encoder_input_list, dim=0) + level_positional_encodings = torch.cat( + level_positional_encoding_list, dim=0) + device = encoder_inputs.device + # shape (num_encoder_levels, 2), from low + # resolution to high resolution + spatial_shapes = torch.as_tensor( + spatial_shapes, dtype=torch.long, device=device) + # shape (0, h_0*w_0, h_0*w_0+h_1*w_1, ...) 
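+        # Editorial note (not part of the original patch): the tensor built
+        # below is the exclusive cumulative sum of the per-level query
+        # counts h_i * w_i, i.e. the offset at which each level's queries
+        # start in the flattened query sequence.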
+ level_start_index = torch.cat((spatial_shapes.new_zeros( + (1, )), spatial_shapes.prod(1).cumsum(0)[:-1])) + reference_points = torch.cat(reference_points_list, dim=0) + reference_points = reference_points[None, :, None].repeat( + batch_size, 1, self.num_encoder_levels, 1) + valid_radios = reference_points.new_ones( + (batch_size, self.num_encoder_levels, 2)) + # shape (num_total_query, batch_size, c) + memory = self.encoder( + query=encoder_inputs, + key=None, + value=None, + query_pos=level_positional_encodings, + key_pos=None, + attn_masks=None, + key_padding_mask=None, + query_key_padding_mask=padding_masks, + spatial_shapes=spatial_shapes, + reference_points=reference_points, + level_start_index=level_start_index, + valid_radios=valid_radios) + # (num_total_query, batch_size, c) -> (batch_size, c, num_total_query) + memory = memory.permute(1, 2, 0) + + # from low resolution to high resolution + num_query_per_level = [e[0] * e[1] for e in spatial_shapes] + outs = torch.split(memory, num_query_per_level, dim=-1) + outs = [ + x.reshape(batch_size, -1, spatial_shapes[i][0], + spatial_shapes[i][1]) for i, x in enumerate(outs) + ] + + for i in range(self.num_input_levels - self.num_encoder_levels - 1, -1, + -1): + x = feats[i] + cur_feat = self.lateral_convs[i](x) + y = cur_feat + F.interpolate( + outs[-1], + size=cur_feat.shape[-2:], + mode='bilinear', + align_corners=False) + y = self.output_convs[i](y) + outs.append(y) + multi_scale_features = outs[:self.num_outs] + + mask_feature = self.mask_feature(outs[-1]) + return mask_feature, multi_scale_features diff --git a/mmdet/models/plugins/pixel_decoder.py b/mmdet/models/plugins/pixel_decoder.py index d1193551ddd..537a187dc5c 100644 --- a/mmdet/models/plugins/pixel_decoder.py +++ b/mmdet/models/plugins/pixel_decoder.py @@ -45,14 +45,14 @@ def __init__(self, self.output_convs = ModuleList() self.use_bias = norm_cfg is None for i in range(0, self.num_inputs - 1): - l_conv = ConvModule( + lateral_conv = ConvModule( in_channels[i], feat_channels, kernel_size=1, bias=self.use_bias, norm_cfg=norm_cfg, act_cfg=None) - o_conv = ConvModule( + output_conv = ConvModule( feat_channels, feat_channels, kernel_size=3, @@ -61,8 +61,8 @@ def __init__(self, bias=self.use_bias, norm_cfg=norm_cfg, act_cfg=act_cfg) - self.lateral_convs.append(l_conv) - self.output_convs.append(o_conv) + self.lateral_convs.append(lateral_conv) + self.output_convs.append(output_conv) self.last_feat_conv = ConvModule( in_channels[-1], @@ -102,9 +102,9 @@ def forward(self, feats, img_metas): y = self.last_feat_conv(feats[-1]) for i in range(self.num_inputs - 2, -1, -1): x = feats[i] - cur_fpn = self.lateral_convs[i](x) - y = cur_fpn + \ - F.interpolate(y, size=cur_fpn.shape[-2:], mode='nearest') + cur_feat = self.lateral_convs[i](x) + y = cur_feat + \ + F.interpolate(y, size=cur_feat.shape[-2:], mode='nearest') y = self.output_convs[i](y) mask_feature = self.mask_feature(y) @@ -234,9 +234,9 @@ def forward(self, feats, img_metas): y = self.encoder_out_proj(memory) for i in range(self.num_inputs - 2, -1, -1): x = feats[i] - cur_fpn = self.lateral_convs[i](x) - y = cur_fpn + \ - F.interpolate(y, size=cur_fpn.shape[-2:], mode='nearest') + cur_feat = self.lateral_convs[i](x) + y = cur_feat + \ + F.interpolate(y, size=cur_feat.shape[-2:], mode='nearest') y = self.output_convs[i](y) mask_feature = self.mask_feature(y) diff --git a/tests/test_models/test_plugins.py b/tests/test_models/test_plugins.py index b115fbd73f2..8afd1f9403a 100644 --- a/tests/test_models/test_plugins.py +++ 
b/tests/test_models/test_plugins.py @@ -31,7 +31,7 @@ def test_dropblock(): DropBlock(0.5, 3, -1) -def test_pixeldecoder(): +def test_pixel_decoder(): base_channels = 64 pixel_decoder_cfg = ConfigDict( dict( @@ -53,7 +53,7 @@ def test_pixeldecoder(): assert mask_feature.shape == feats[0].shape -def test_transformerencoderpixeldecoer(): +def test_transformer_encoder_pixel_decoder(): base_channels = 64 pixel_decoder_cfg = ConfigDict( dict( @@ -109,3 +109,59 @@ def test_transformerencoderpixeldecoer(): assert memory.shape[-2:] == feats[-1].shape[-2:] assert mask_feature.shape == feats[0].shape + + +def test_msdeformattn_pixel_decoder(): + base_channels = 64 + pixel_decoder_cfg = ConfigDict( + dict( + type='MSDeformAttnPixelDecoder', + in_channels=[base_channels * 2**i for i in range(4)], + strides=[4, 8, 16, 32], + feat_channels=base_channels, + out_channels=base_channels, + num_outs=3, + norm_cfg=dict(type='GN', num_groups=32), + act_cfg=dict(type='ReLU'), + encoder=dict( + type='DetrTransformerEncoder', + num_layers=6, + transformerlayers=dict( + type='BaseTransformerLayer', + attn_cfgs=dict( + type='MultiScaleDeformableAttention', + embed_dims=base_channels, + num_heads=8, + num_levels=3, + num_points=4, + im2col_step=64, + dropout=0.0, + batch_first=False, + norm_cfg=None, + init_cfg=None), + ffn_cfgs=dict( + type='FFN', + embed_dims=base_channels, + feedforward_channels=base_channels * 4, + num_fcs=2, + ffn_drop=0.0, + act_cfg=dict(type='ReLU', inplace=True)), + operation_order=('self_attn', 'norm', 'ffn', 'norm')), + init_cfg=None), + positional_encoding=dict( + type='SinePositionalEncoding', + num_feats=base_channels // 2, + normalize=True), + init_cfg=None), ) + self = build_plugin_layer(pixel_decoder_cfg)[1] + feats = [ + torch.rand((2, base_channels * 2**i, 4 * 2**(3 - i), 5 * 2**(3 - i))) + for i in range(4) + ] + mask_feature, multi_scale_features = self(feats) + + assert mask_feature.shape == feats[0].shape + assert len(multi_scale_features) == 3 + multi_scale_features = multi_scale_features[::-1] + for i in range(3): + assert multi_scale_features[i].shape[-2:] == feats[i + 1].shape[-2:] From 4bb184bae070f37febb10f82bee3a217dc1ad7c5 Mon Sep 17 00:00:00 2001 From: Cedric Luo Date: Fri, 25 Mar 2022 17:34:56 +0800 Subject: [PATCH 30/42] [Enhance] MaskFormer refactor (#7471) * maskformer refactor update docstring update docstring update unit test update unit test update unit test * remove redundant code * update unit test --- .../maskformer_r50_mstrain_16x1_75e_coco.py | 32 ++- mmdet/core/mask/__init__.py | 4 +- mmdet/core/mask/utils.py | 25 ++ mmdet/models/dense_heads/maskformer_head.py | 149 ++--------- mmdet/models/detectors/maskformer.py | 70 ++++- .../panoptic_fusion_heads/__init__.py | 1 + .../maskformer_fusion_head.py | 241 ++++++++++++++++++ .../test_dense_heads/test_maskformer_head.py | 11 +- .../test_maskformer_fusion_head.py | 53 ++++ tests/test_utils/test_masks.py | 26 +- 10 files changed, 454 insertions(+), 158 deletions(-) create mode 100644 mmdet/models/seg_heads/panoptic_fusion_heads/maskformer_fusion_head.py create mode 100644 tests/test_models/test_seg_heads/test_maskformer_fusion_head.py diff --git a/configs/maskformer/maskformer_r50_mstrain_16x1_75e_coco.py b/configs/maskformer/maskformer_r50_mstrain_16x1_75e_coco.py index c9d92450570..46b3c135dd8 100644 --- a/configs/maskformer/maskformer_r50_mstrain_16x1_75e_coco.py +++ b/configs/maskformer/maskformer_r50_mstrain_16x1_75e_coco.py @@ -1,7 +1,9 @@ _base_ = [ '../_base_/datasets/coco_panoptic.py', 
'../_base_/default_runtime.py'
 ]
-
+num_things_classes = 80
+num_stuff_classes = 53
+num_classes = num_things_classes + num_stuff_classes
 model = dict(
     type='MaskFormer',
     backbone=dict(
@@ -19,8 +21,8 @@
         in_channels=[256, 512, 1024, 2048],  # pass to pixel_decoder inside
         feat_channels=256,
         out_channels=256,
-        num_things_classes=80,
-        num_stuff_classes=53,
+        num_things_classes=num_things_classes,
+        num_stuff_classes=num_stuff_classes,
         num_queries=100,
         pixel_decoder=dict(
             type='TransformerEncoderPixelDecoder',
@@ -87,11 +89,10 @@
             init_cfg=None),
         loss_cls=dict(
             type='CrossEntropyLoss',
-            bg_cls_weight=0.1,
             use_sigmoid=False,
             loss_weight=1.0,
             reduction='mean',
-            class_weight=1.0),
+            class_weight=[1.0] * num_classes + [0.1]),
         loss_mask=dict(
             type='FocalLoss',
             use_sigmoid=True,
@@ -107,6 +108,12 @@
             naive_dice=True,
             eps=1.0,
             loss_weight=1.0)),
+    panoptic_fusion_head=dict(
+        type='MaskFormerFusionHead',
+        num_things_classes=num_things_classes,
+        num_stuff_classes=num_stuff_classes,
+        loss_panoptic=None,
+        init_cfg=None),
     train_cfg=dict(
         assigner=dict(
             type='MaskHungarianAssigner',
@@ -116,8 +123,19 @@
             dice_cost=dict(
                 type='DiceCost', weight=1.0, pred_act=True, eps=1.0)),
         sampler=dict(type='MaskPseudoSampler')),
-    test_cfg=dict(object_mask_thr=0.8, iou_thr=0.8),
-    # pretrained=None,
+    test_cfg=dict(
+        panoptic_on=True,
+        # For now, the dataset does not support
+        # evaluating semantic segmentation metrics.
+        semantic_on=False,
+        instance_on=False,
+        # max_per_image is for instance segmentation.
+        max_per_image=100,
+        object_mask_thr=0.8,
+        iou_thr=0.8,
+        # In MaskFormer's panoptic postprocessing,
+        # it will not filter masks whose score is smaller than 0.5.
+        filter_low_score=False),
     init_cfg=None)
 
 # dataset settings
diff --git a/mmdet/core/mask/__init__.py b/mmdet/core/mask/__init__.py
index 2083af20251..644a9b1d9b4 100644
--- a/mmdet/core/mask/__init__.py
+++ b/mmdet/core/mask/__init__.py
@@ -1,9 +1,9 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 from .mask_target import mask_target
 from .structures import BaseInstanceMasks, BitmapMasks, PolygonMasks
-from .utils import encode_mask_results, split_combined_polys
+from .utils import encode_mask_results, mask2bbox, split_combined_polys
 
 __all__ = [
     'split_combined_polys', 'mask_target', 'BaseInstanceMasks', 'BitmapMasks',
-    'PolygonMasks', 'encode_mask_results'
+    'PolygonMasks', 'encode_mask_results', 'mask2bbox'
 ]
diff --git a/mmdet/core/mask/utils.py b/mmdet/core/mask/utils.py
index 8e95f72b364..90544b34f49 100644
--- a/mmdet/core/mask/utils.py
+++ b/mmdet/core/mask/utils.py
@@ -2,6 +2,7 @@
 import mmcv
 import numpy as np
 import pycocotools.mask as mask_util
+import torch
 
 
 def split_combined_polys(polys, poly_lens, polys_per_mask):
@@ -62,3 +63,27 @@ def encode_mask_results(mask_results):
         return encoded_mask_results, cls_mask_scores
     else:
         return encoded_mask_results
+
+
+def mask2bbox(masks):
+    """Obtain tight bounding boxes of binary masks.
+
+    Args:
+        masks (Tensor): Binary mask of shape (n, h, w).
+
+    Returns:
+        Tensor: Bboxes with shape (n, 4) of \
+            positive regions in binary masks.
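+
+    Example (editorial note, not part of the original patch): a single True
+    pixel at row 2, column 4 maps to the one-pixel box ``[4., 2., 5., 3.]``
+    in (x1, y1, x2, y2) order.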
+    """
+    N = masks.shape[0]
+    bboxes = masks.new_zeros((N, 4), dtype=torch.float32)
+    x_any = torch.any(masks, dim=1)
+    y_any = torch.any(masks, dim=2)
+    for i in range(N):
+        x = torch.where(x_any[i, :])[0]
+        y = torch.where(y_any[i, :])[0]
+        if len(x) > 0 and len(y) > 0:
+            bboxes[i, :] = bboxes.new_tensor(
+                [x[0], y[0], x[-1] + 1, y[-1] + 1])
+
+    return bboxes
diff --git a/mmdet/models/dense_heads/maskformer_head.py b/mmdet/models/dense_heads/maskformer_head.py
index 7d7a644c7e1..4541e018c0d 100644
--- a/mmdet/models/dense_heads/maskformer_head.py
+++ b/mmdet/models/dense_heads/maskformer_head.py
@@ -8,7 +8,6 @@
 from mmcv.runner import force_fp32
 
 from mmdet.core import build_assigner, build_sampler, multi_apply, reduce_mean
-from mmdet.core.evaluation import INSTANCE_OFFSET
 from mmdet.models.utils import preprocess_panoptic_gt
 from ..builder import HEADS, build_loss
 from .anchor_free_head import AnchorFreeHead
@@ -64,10 +63,9 @@ def __init__(self,
                  positional_encoding=None,
                  loss_cls=dict(
                      type='CrossEntropyLoss',
-                     bg_cls_weight=0.1,
                      use_sigmoid=False,
                      loss_weight=1.0,
-                     class_weight=1.0),
+                     class_weight=[1.0] * 133 + [0.1]),
                  loss_mask=dict(
                      type='FocalLoss',
                      use_sigmoid=True,
@@ -118,32 +116,10 @@ def __init__(self,
         self.test_cfg = test_cfg
         self.train_cfg = train_cfg
         if train_cfg:
-            assert 'assigner' in train_cfg, 'assigner should be provided '\
-                'when train_cfg is set.'
-            assigner = train_cfg['assigner']
-            self.assigner = build_assigner(assigner)
-            sampler_cfg = dict(type='MaskPseudoSampler')
-            self.sampler = build_sampler(sampler_cfg, context=self)
-
-        self.bg_cls_weight = 0
-        class_weight = loss_cls.get('class_weight', None)
-        if class_weight is not None and (self.__class__ is MaskFormerHead):
-            assert isinstance(class_weight, float), 'Expected ' \
-                'class_weight to have type float. Found ' \
-                f'{type(class_weight)}.'
-            # NOTE following the official MaskFormerHead repo, bg_cls_weight
-            # means relative classification weight of the VOID class.
-            bg_cls_weight = loss_cls.get('bg_cls_weight', class_weight)
-            assert isinstance(bg_cls_weight, float), 'Expected ' \
-                'bg_cls_weight to have type float. Found ' \
-                f'{type(bg_cls_weight)}.'
-            class_weight = torch.ones(self.num_classes + 1) * class_weight
-            # set VOID class as the last indice
-            class_weight[self.num_classes] = bg_cls_weight
-            loss_cls.update({'class_weight': class_weight})
-            if 'bg_cls_weight' in loss_cls:
-                loss_cls.pop('bg_cls_weight')
-            self.bg_cls_weight = bg_cls_weight
+            self.assigner = build_assigner(train_cfg.assigner)
+            self.sampler = build_sampler(train_cfg.sampler, context=self)
+
+        self.class_weight = loss_cls.class_weight
         self.loss_cls = build_loss(loss_cls)
         self.loss_mask = build_loss(loss_mask)
         self.loss_dice = build_loss(loss_dice)
@@ -304,7 +280,8 @@ def loss(self, all_cls_scores, all_mask_preds, gt_labels_list,
         Args:
             all_cls_scores (Tensor): Classification scores for all decoder
                 layers with shape (num_decoder, batch_size, num_queries,
-                cls_out_channels).
+                cls_out_channels). Note `cls_out_channels` should include
+                background.
             all_mask_preds (Tensor): Mask scores for all decoder layers with
                 shape (num_decoder, batch_size, num_queries, h, w).
             gt_labels_list (list[Tensor]): Ground truth class indices for each
@@ -347,7 +324,8 @@ def loss_single(self, cls_scores, mask_preds, gt_labels_list,
         Args:
             cls_scores (Tensor): Mask score logits from a single decoder layer
                 for all images. Shape (batch_size, num_queries,
-                cls_out_channels).
+                cls_out_channels). Note `cls_out_channels` should include
+                background.
 mask_preds (Tensor): Mask logits for a pixel decoder for all
                 images. Shape (batch_size, num_queries, h, w).
             gt_labels_list (list[Tensor]): Ground truth class indices for each
@@ -385,8 +363,7 @@ def loss_single(self, cls_scores, mask_preds, gt_labels_list,
         labels = labels.flatten(0, 1)
         label_weights = label_weights.flatten(0, 1)
 
-        class_weight = cls_scores.new_ones(self.num_classes + 1)
-        class_weight[-1] = self.bg_cls_weight
+        class_weight = cls_scores.new_tensor(self.class_weight)
         loss_cls = self.loss_cls(
             cls_scores,
             labels,
@@ -544,30 +521,22 @@ def forward_train(self,
 
         return losses
 
-    def simple_test(self, feats, img_metas, rescale=False):
-        """Test segment without test-time aumengtation.
-
-        Only the output of last decoder layers was used.
+    def simple_test(self, feats, img_metas, **kwargs):
+        """Test without augmentation.
 
         Args:
             feats (list[Tensor]): Multi-level features from the upstream
                 network, each is a 4D-tensor.
             img_metas (list[dict]): List of image information.
-            rescale (bool, optional): If True, return boxes in
-                original image space. Default False.
 
         Returns:
-            list[dict[str, np.array]]: semantic segmentation results\
-                and panoptic segmentation results for each image.
+            tuple: A tuple containing two tensors.
 
-        .. code-block:: none
-
-            [
-                {
-                    'pan_results': , # shape = [h, w]
-                },
-                ...
-            ]
+            - mask_cls_results (Tensor): Mask classification logits,\
+                shape (batch_size, num_queries, cls_out_channels).
+                Note `cls_out_channels` should include background.
+            - mask_pred_results (Tensor): Mask logits, shape \
+                (batch_size, num_queries, h, w).
         """
         all_cls_scores, all_mask_preds = self(feats, img_metas)
         mask_cls_results = all_cls_scores[-1]
@@ -581,84 +550,4 @@ def simple_test(self, feats, img_metas, rescale=False):
             mode='bilinear',
             align_corners=False)
 
-        results = []
-        for mask_cls_result, mask_pred_result, meta in zip(
-                mask_cls_results, mask_pred_results, img_metas):
-            # remove padding
-            img_height, img_width = meta['img_shape'][:2]
-            mask_pred_result = mask_pred_result[:, :img_height, :img_width]
-
-            if rescale:
-                # return result in original resolution
-                ori_height, ori_width = meta['ori_shape'][:2]
-                mask_pred_result = F.interpolate(mask_pred_result.unsqueeze(1),
-                                                 size=(ori_height, ori_width),
-                                                 mode='bilinear',
-                                                 align_corners=False)\
-                    .squeeze(1)
-
-            mask = self.post_process(mask_cls_result, mask_pred_result)
-            results.append(mask)
-
-        return results
-
-    def post_process(self, mask_cls, mask_pred):
-        """Panoptic segmengation inference.
-
-        This implementation is modified from `MaskFormer
-        `_.
-
-        Args:
-            mask_cls (Tensor): Classfication outputs for a image.
-                shape = (num_queries, cls_out_channels).
-            mask_pred (Tensor): Mask outputs for a image.
-                shape = (num_queries, h, w).
-
-        Returns:
-            Tensor: panoptic segment result of shape (h, w),\
-                each element in Tensor means:
-                segment_id = _cls + instance_id * INSTANCE_OFFSET.
- """ - object_mask_thr = self.test_cfg.get('object_mask_thr', 0.8) - iou_thr = self.test_cfg.get('iou_thr', 0.8) - - scores, labels = F.softmax(mask_cls, dim=-1).max(-1) - mask_pred = mask_pred.sigmoid() - - keep = labels.ne(self.num_classes) & (scores > object_mask_thr) - cur_scores = scores[keep] - cur_classes = labels[keep] - cur_masks = mask_pred[keep] - - cur_prob_masks = cur_scores.view(-1, 1, 1) * cur_masks - - h, w = cur_masks.shape[-2:] - panoptic_seg = torch.full((h, w), - self.num_classes, - dtype=torch.int32, - device=cur_masks.device) - if cur_masks.shape[0] == 0: - # We didn't detect any mask :( - pass - else: - cur_mask_ids = cur_prob_masks.argmax(0) - instance_id = 1 - for k in range(cur_classes.shape[0]): - pred_class = int(cur_classes[k].item()) - isthing = pred_class < self.num_things_classes - mask = cur_mask_ids == k - mask_area = mask.sum().item() - original_area = (cur_masks[k] >= 0.5).sum().item() - if mask_area > 0 and original_area > 0: - if mask_area / original_area < iou_thr: - continue - - if not isthing: - # different stuff regions of same class will be - # merged here, and stuff share the instance_id 0. - panoptic_seg[mask] = pred_class - else: - panoptic_seg[mask] = ( - pred_class + instance_id * INSTANCE_OFFSET) - instance_id += 1 - return panoptic_seg + return mask_cls_results, mask_pred_results diff --git a/mmdet/models/detectors/maskformer.py b/mmdet/models/detectors/maskformer.py index f7257d2547d..b626e070813 100644 --- a/mmdet/models/detectors/maskformer.py +++ b/mmdet/models/detectors/maskformer.py @@ -2,7 +2,7 @@ import mmcv import numpy as np -from mmdet.core import INSTANCE_OFFSET +from mmdet.core import INSTANCE_OFFSET, bbox2result from mmdet.core.visualization import imshow_det_bboxes from ..builder import DETECTORS, build_backbone, build_head, build_neck from .single_stage import SingleStageDetector @@ -18,6 +18,7 @@ def __init__(self, backbone, neck=None, panoptic_head=None, + panoptic_fusion_head=None, train_cfg=None, test_cfg=None, init_cfg=None): @@ -25,9 +26,15 @@ def __init__(self, self.backbone = build_backbone(backbone) if neck is not None: self.neck = build_neck(neck) - panoptic_head.update(train_cfg=train_cfg) - panoptic_head.update(test_cfg=test_cfg) - self.panoptic_head = build_head(panoptic_head) + + panoptic_head_ = panoptic_head.deepcopy() + panoptic_head_.update(train_cfg=train_cfg) + panoptic_head_.update(test_cfg=test_cfg) + self.panoptic_head = build_head(panoptic_head_) + + panoptic_fusion_head_ = panoptic_fusion_head.deepcopy() + panoptic_fusion_head_.update(test_cfg=test_cfg) + self.panoptic_fusion_head = build_head(panoptic_fusion_head_) self.num_things_classes = self.panoptic_head.num_things_classes self.num_stuff_classes = self.panoptic_head.num_stuff_classes @@ -96,16 +103,53 @@ def forward_train(self, return losses - def simple_test(self, img, img_metas, **kwargs): - """Test without augmentation.""" - feat = self.extract_feat(img) - mask_results = self.panoptic_head.simple_test(feat, img_metas, - **kwargs) + def simple_test(self, imgs, img_metas, **kwargs): + """Test without augmentation. + + Args: + imgs (Tensor): A batch of images. + img_metas (list[dict]): List of image information. - results = [] - for mask in mask_results: - result = {'pan_results': mask.detach().cpu().numpy()} - results.append(result) + Returns: + list[dict[str, np.array | tuple]]: Semantic segmentation \ + results and panoptic segmentation results for each \ + image. + + .. 
code-block:: none
+
+            [
+                {
+                    'pan_results': np.array, # shape = [h, w]
+                    'ins_results': tuple[list],
+                    # semantic segmentation results are not supported yet
+                    'sem_results': np.array
+                },
+                ...
+            ]
+        """
+        feats = self.extract_feat(imgs)
+        mask_cls_results, mask_pred_results = self.panoptic_head.simple_test(
+            feats, img_metas, **kwargs)
+        results = self.panoptic_fusion_head.simple_test(
+            mask_cls_results, mask_pred_results, img_metas, **kwargs)
+        for i in range(len(results)):
+            if 'pan_results' in results[i]:
+                results[i]['pan_results'] = results[i]['pan_results'].detach(
+                ).cpu().numpy()
+
+            if 'ins_results' in results[i]:
+                labels_per_image, bboxes, mask_pred_binary = results[i][
+                    'ins_results']
+                bbox_results = bbox2result(bboxes, labels_per_image,
+                                           self.num_things_classes)
+                mask_results = [[] for _ in range(self.num_things_classes)]
+                for j, label in enumerate(labels_per_image):
+                    mask = mask_pred_binary[j].detach().cpu().numpy()
+                    mask_results[label].append(mask)
+                results[i]['ins_results'] = bbox_results, mask_results
+
+            assert 'sem_results' not in results[i], 'semantic segmentation '\
+                'results are not supported yet.'
 
         return results
diff --git a/mmdet/models/seg_heads/panoptic_fusion_heads/__init__.py b/mmdet/models/seg_heads/panoptic_fusion_heads/__init__.py
index d14a33c317a..41625a61d6d 100644
--- a/mmdet/models/seg_heads/panoptic_fusion_heads/__init__.py
+++ b/mmdet/models/seg_heads/panoptic_fusion_heads/__init__.py
@@ -2,3 +2,4 @@
 from .base_panoptic_fusion_head import \
     BasePanopticFusionHead  # noqa: F401,F403
 from .heuristic_fusion_head import HeuristicFusionHead  # noqa: F401,F403
+from .maskformer_fusion_head import MaskFormerFusionHead  # noqa: F401,F403
diff --git a/mmdet/models/seg_heads/panoptic_fusion_heads/maskformer_fusion_head.py b/mmdet/models/seg_heads/panoptic_fusion_heads/maskformer_fusion_head.py
new file mode 100644
index 00000000000..5b59ce4deae
--- /dev/null
+++ b/mmdet/models/seg_heads/panoptic_fusion_heads/maskformer_fusion_head.py
@@ -0,0 +1,241 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn.functional as F
+
+from mmdet.core.evaluation.panoptic_utils import INSTANCE_OFFSET
+from mmdet.core.mask import mask2bbox
+from mmdet.models.builder import HEADS
+from .base_panoptic_fusion_head import BasePanopticFusionHead
+
+
+@HEADS.register_module()
+class MaskFormerFusionHead(BasePanopticFusionHead):
+
+    def __init__(self,
+                 num_things_classes=80,
+                 num_stuff_classes=53,
+                 test_cfg=None,
+                 loss_panoptic=None,
+                 init_cfg=None,
+                 **kwargs):
+        super().__init__(num_things_classes, num_stuff_classes, test_cfg,
+                         loss_panoptic, init_cfg, **kwargs)
+
+    def forward_train(self, **kwargs):
+        """MaskFormerFusionHead has no training loss."""
+        return dict()
+
+    def panoptic_postprocess(self, mask_cls, mask_pred):
+        """Panoptic segmentation inference.
+
+        Args:
+            mask_cls (Tensor): Classification outputs of shape
+                (num_queries, cls_out_channels) for an image.
+                Note `cls_out_channels` should include
+                background.
+            mask_pred (Tensor): Mask outputs of shape
+                (num_queries, h, w) for an image.
+
+        Returns:
+            Tensor: Panoptic segment result of shape \
+                (h, w), each element in Tensor means: \
+                ``segment_id = _cls + instance_id * INSTANCE_OFFSET``.
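+
+        Example (editorial note, not part of the original patch): with
+        mmdet's ``INSTANCE_OFFSET = 1000``, the second thing instance of
+        class 17 is encoded as ``17 + 2 * 1000 = 2017``.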
+        """
+        object_mask_thr = self.test_cfg.get('object_mask_thr', 0.8)
+        iou_thr = self.test_cfg.get('iou_thr', 0.8)
+        filter_low_score = self.test_cfg.get('filter_low_score', False)
+
+        scores, labels = F.softmax(mask_cls, dim=-1).max(-1)
+        mask_pred = mask_pred.sigmoid()
+
+        keep = labels.ne(self.num_classes) & (scores > object_mask_thr)
+        cur_scores = scores[keep]
+        cur_classes = labels[keep]
+        cur_masks = mask_pred[keep]
+
+        cur_prob_masks = cur_scores.view(-1, 1, 1) * cur_masks
+
+        h, w = cur_masks.shape[-2:]
+        panoptic_seg = torch.full((h, w),
+                                  self.num_classes,
+                                  dtype=torch.int32,
+                                  device=cur_masks.device)
+        if cur_masks.shape[0] == 0:
+            # We didn't detect any mask :(
+            pass
+        else:
+            cur_mask_ids = cur_prob_masks.argmax(0)
+            instance_id = 1
+            for k in range(cur_classes.shape[0]):
+                pred_class = int(cur_classes[k].item())
+                isthing = pred_class < self.num_things_classes
+                mask = cur_mask_ids == k
+                mask_area = mask.sum().item()
+                original_area = (cur_masks[k] >= 0.5).sum().item()
+
+                if filter_low_score:
+                    mask = mask & (cur_masks[k] >= 0.5)
+
+                if mask_area > 0 and original_area > 0:
+                    if mask_area / original_area < iou_thr:
+                        continue
+
+                    if not isthing:
+                        # different stuff regions of same class will be
+                        # merged here, and stuff share the instance_id 0.
+                        panoptic_seg[mask] = pred_class
+                    else:
+                        panoptic_seg[mask] = (
+                            pred_class + instance_id * INSTANCE_OFFSET)
+                        instance_id += 1
+
+        return panoptic_seg
+
+    def semantic_postprocess(self, mask_cls, mask_pred):
+        """Semantic segmentation postprocess.
+
+        Args:
+            mask_cls (Tensor): Classification outputs of shape
+                (num_queries, cls_out_channels) for an image.
+                Note `cls_out_channels` should include
+                background.
+            mask_pred (Tensor): Mask outputs of shape
+                (num_queries, h, w) for an image.
+
+        Returns:
+            Tensor: Semantic segment result of shape \
+                (cls_out_channels, h, w).
+        """
+        # TODO add semantic segmentation result
+        raise NotImplementedError
+
+    def instance_postprocess(self, mask_cls, mask_pred):
+        """Instance segmentation postprocess.
+
+        Args:
+            mask_cls (Tensor): Classification outputs of shape
+                (num_queries, cls_out_channels) for an image.
+                Note `cls_out_channels` should include
+                background.
+            mask_pred (Tensor): Mask outputs of shape
+                (num_queries, h, w) for an image.
+
+        Returns:
+            tuple[Tensor]: Instance segmentation results.
+
+            - labels_per_image (Tensor): Predicted labels,\
+                shape (n, ).
+            - bboxes (Tensor): Bboxes and scores with shape (n, 5) of \
+                positive regions in binary masks; last column is the score.
+            - mask_pred_binary (Tensor): Instance masks of \
+                shape (n, h, w).
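+
+        Note (editorial, not part of the original patch): the ranking below
+        takes a top-k over all flattened (query, class) pairs, so a single
+        query can contribute more than one labeled instance.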
+        """
+        max_per_image = self.test_cfg.get('max_per_image', 100)
+        num_queries = mask_cls.shape[0]
+        # shape (num_queries, num_class)
+        scores = F.softmax(mask_cls, dim=-1)[:, :-1]
+        # shape (num_queries * num_class, )
+        labels = torch.arange(self.num_classes, device=mask_cls.device).\
+            unsqueeze(0).repeat(num_queries, 1).flatten(0, 1)
+        scores_per_image, top_indices = scores.flatten(0, 1).topk(
+            max_per_image, sorted=False)
+        labels_per_image = labels[top_indices]
+
+        query_indices = top_indices // self.num_classes
+        mask_pred = mask_pred[query_indices]
+
+        # extract things
+        is_thing = labels_per_image < self.num_things_classes
+        scores_per_image = scores_per_image[is_thing]
+        labels_per_image = labels_per_image[is_thing]
+        mask_pred = mask_pred[is_thing]
+
+        mask_pred_binary = (mask_pred > 0).float()
+        mask_scores_per_image = (mask_pred.sigmoid() *
+                                 mask_pred_binary).flatten(1).sum(1) / (
+                                     mask_pred_binary.flatten(1).sum(1) + 1e-6)
+        det_scores = scores_per_image * mask_scores_per_image
+        mask_pred_binary = mask_pred_binary.bool()
+        bboxes = mask2bbox(mask_pred_binary)
+        bboxes = torch.cat([bboxes, det_scores[:, None]], dim=-1)
+
+        return labels_per_image, bboxes, mask_pred_binary
+
+    def simple_test(self,
+                    mask_cls_results,
+                    mask_pred_results,
+                    img_metas,
+                    rescale=False,
+                    **kwargs):
+        """Test segmentation without test-time augmentation.
+
+        Only the output of the last decoder layer is used.
+
+        Args:
+            mask_cls_results (Tensor): Mask classification logits,
+                shape (batch_size, num_queries, cls_out_channels).
+                Note `cls_out_channels` should include background.
+            mask_pred_results (Tensor): Mask logits, shape
+                (batch_size, num_queries, h, w).
+            img_metas (list[dict]): List of image information.
+            rescale (bool, optional): If True, return boxes in
+                original image space. Default False.
+
+        Returns:
+            list[dict[str, Tensor | tuple[Tensor]]]: Semantic segmentation \
+                results and panoptic segmentation results for each \
+                image.
+
+        .. code-block:: none
+
+            [
+                {
+                    'pan_results': Tensor, # shape = [h, w]
+                    'ins_results': tuple[Tensor],
+                    # semantic segmentation results are not supported yet
+                    'sem_results': Tensor
+                },
+                ...
+            ]
+        """
+        panoptic_on = self.test_cfg.get('panoptic_on', True)
+        semantic_on = self.test_cfg.get('semantic_on', False)
+        instance_on = self.test_cfg.get('instance_on', False)
+        assert not semantic_on, 'semantic segmentation '\
+ + results = [] + for mask_cls_result, mask_pred_result, meta in zip( + mask_cls_results, mask_pred_results, img_metas): + # remove padding + img_height, img_width = meta['img_shape'][:2] + mask_pred_result = mask_pred_result[:, :img_height, :img_width] + + if rescale: + # return result in original resolution + ori_height, ori_width = meta['ori_shape'][:2] + mask_pred_result = F.interpolate( + mask_pred_result[:, None], + size=(ori_height, ori_width), + mode='bilinear', + align_corners=False)[:, 0] + + result = dict() + if panoptic_on: + pan_results = self.panoptic_postprocess( + mask_cls_result, mask_pred_result) + result['pan_results'] = pan_results + + if instance_on: + ins_results = self.instance_postprocess( + mask_cls_result, mask_pred_result) + result['ins_results'] = ins_results + + if semantic_on: + sem_results = self.semantic_postprocess( + mask_cls_result, mask_pred_result) + result['sem_results'] = sem_results + + results.append(result) + + return results diff --git a/tests/test_models/test_dense_heads/test_maskformer_head.py b/tests/test_models/test_dense_heads/test_maskformer_head.py index e70f09afe3f..f9cf3b2326f 100644 --- a/tests/test_models/test_dense_heads/test_maskformer_head.py +++ b/tests/test_models/test_dense_heads/test_maskformer_head.py @@ -23,15 +23,17 @@ def test_maskformer_head_loss(): torch.rand((2, 64 * 2**i, 4 * 2**(3 - i), 5 * 2**(3 - i))) for i in range(4) ] - + num_things_classes = 80 + num_stuff_classes = 53 + num_classes = num_things_classes + num_stuff_classes config = ConfigDict( dict( type='MaskFormerHead', in_channels=[base_channels * 2**i for i in range(4)], feat_channels=base_channels, out_channels=base_channels, - num_things_classes=80, - num_stuff_classes=53, + num_things_classes=num_things_classes, + num_stuff_classes=num_stuff_classes, num_queries=100, pixel_decoder=dict( type='TransformerEncoderPixelDecoder', @@ -102,11 +104,10 @@ def test_maskformer_head_loss(): init_cfg=None), loss_cls=dict( type='CrossEntropyLoss', - bg_cls_weight=0.1, use_sigmoid=False, loss_weight=1.0, reduction='mean', - class_weight=1.0), + class_weight=[1.0] * num_classes + [0.1]), loss_mask=dict( type='FocalLoss', use_sigmoid=True, diff --git a/tests/test_models/test_seg_heads/test_maskformer_fusion_head.py b/tests/test_models/test_seg_heads/test_maskformer_fusion_head.py new file mode 100644 index 00000000000..8d5131f9a60 --- /dev/null +++ b/tests/test_models/test_seg_heads/test_maskformer_fusion_head.py @@ -0,0 +1,53 @@ +import pytest +import torch +from mmcv import ConfigDict + +from mmdet.models.seg_heads.panoptic_fusion_heads import MaskFormerFusionHead + + +def test_maskformer_fusion_head(): + img_metas = [ + { + 'batch_input_shape': (128, 160), + 'img_shape': (126, 160, 3), + 'ori_shape': (63, 80, 3), + 'pad_shape': (128, 160, 3) + }, + ] + num_things_classes = 80 + num_stuff_classes = 53 + num_classes = num_things_classes + num_stuff_classes + config = ConfigDict( + type='MaskFormerFusionHead', + num_things_classes=num_things_classes, + num_stuff_classes=num_stuff_classes, + loss_panoptic=None, + test_cfg=dict( + panoptic_on=True, + semantic_on=False, + instance_on=True, + max_per_image=100, + object_mask_thr=0.8, + iou_thr=0.8, + filter_low_score=False), + init_cfg=None) + + self = MaskFormerFusionHead(**config) + + # test forward_train + assert self.forward_train() == dict() + + mask_cls_results = torch.rand((1, 100, num_classes + 1)) + mask_pred_results = torch.rand((1, 100, 128, 160)) + + # test panoptic_postprocess and instance_postprocess + results = 
self.simple_test(mask_cls_results, mask_pred_results, img_metas) + assert 'ins_results' in results[0] and 'pan_results' in results[0] + + # test semantic_postprocess + config.test_cfg.semantic_on = True + with pytest.raises(AssertionError): + self.simple_test(mask_cls_results, mask_pred_results, img_metas) + + with pytest.raises(NotImplementedError): + self.semantic_postprocess(mask_cls_results, mask_pred_results) diff --git a/tests/test_utils/test_masks.py b/tests/test_utils/test_masks.py index 7061046a377..226ca61efbb 100644 --- a/tests/test_utils/test_masks.py +++ b/tests/test_utils/test_masks.py @@ -3,7 +3,7 @@ import pytest import torch -from mmdet.core import BitmapMasks, PolygonMasks +from mmdet.core import BitmapMasks, PolygonMasks, mask2bbox def dummy_raw_bitmap_masks(size): @@ -687,3 +687,27 @@ def test_polygon_mask_iter(): polygon_masks = PolygonMasks(raw_masks, 28, 28) for i, polygon_mask in enumerate(polygon_masks): assert np.equal(polygon_mask, raw_masks[i]).all() + + +def test_mask2bbox(): + # no instance + masks = torch.zeros((1, 20, 15), dtype=torch.bool) + bboxes_empty_gt = torch.tensor([[0, 0, 0, 0]]).float() + bboxes = mask2bbox(masks) + assert torch.allclose(bboxes_empty_gt.float(), bboxes) + + # the entire mask is an instance + bboxes_full_gt = torch.tensor([[0, 0, 15, 20]]).float() + masks = torch.ones((1, 20, 15), dtype=torch.bool) + bboxes = mask2bbox(masks) + assert torch.allclose(bboxes_full_gt, bboxes) + + # a pentagon-shaped instance + bboxes_gt = torch.tensor([[2, 2, 7, 6]]).float() + masks = torch.zeros((1, 20, 15), dtype=torch.bool) + masks[0, 2, 4] = True + masks[0, 3, 3:6] = True + masks[0, 4, 2:7] = True + masks[0, 5, 2:7] = True + bboxes = mask2bbox(masks) + assert torch.allclose(bboxes_gt, bboxes) From 3f0f2a059743593fd07b629c261b609bd9a767e6 Mon Sep 17 00:00:00 2001 From: jbwang1997 Date: Fri, 25 Mar 2022 17:37:49 +0800 Subject: [PATCH 31/42] [Feature] Support efficientnet in mmdetection. (#7514) * Initial implementation * Add missing import * Add MemoryEfficientSwishImplementation. 
Add docstrings * Add efficientnet2mmdet tool * Add config folder * Flake8 * Flake8 * Flake8 * Fix config * Requested changes * docformatter * Update train config from https://github.com/google/automl/blob/master/efficientdet * Run pre-commit * Fix schedule * Set by_epoch=False in scheduler * Train 80 epochs * Remove duplicated arg * Update README.md * efficient3 efficient0 * efficientNet imports * efficientNet * config edit path for eff3 and dropout for eff0 * efficientnet review2 * fix model_converter location and drop path * fix model converter and efficientnet import * register memoryefficietnswish * eff0, eff3 * fix flake8 yapf isort * same padding in tensorflow and edit drop path rate * fix init of utils * Align mmdet utils with mmcls * Align mmdet.models.utils with mmcls * Use mmcls efficientnet backbone * Update * Update * Update metafile Co-authored-by: David de la Iglesia Castro Co-authored-by: David de la Iglesia Castro Co-authored-by: jiangyitong Co-authored-by: jiangyitong --- configs/efficientnet/README.md | 30 ++ configs/efficientnet/metafile.yml | 19 + ...retinanet_effb3_fpn_crop896_8x4_1x_coco.py | 93 ++++ mmdet/models/backbones/__init__.py | 4 +- mmdet/models/backbones/efficientnet.py | 417 ++++++++++++++++++ mmdet/models/utils/inverted_residual.py | 8 +- model-index.yml | 1 + .../test_backbones/test_efficientnet.py | 25 ++ 8 files changed, 595 insertions(+), 2 deletions(-) create mode 100644 configs/efficientnet/README.md create mode 100644 configs/efficientnet/metafile.yml create mode 100644 configs/efficientnet/retinanet_effb3_fpn_crop896_8x4_1x_coco.py create mode 100644 mmdet/models/backbones/efficientnet.py create mode 100644 tests/test_models/test_backbones/test_efficientnet.py diff --git a/configs/efficientnet/README.md b/configs/efficientnet/README.md new file mode 100644 index 00000000000..49f773350ee --- /dev/null +++ b/configs/efficientnet/README.md @@ -0,0 +1,30 @@ +# EfficientNet + +> [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946v5) + + + +## Introduction + +Convolutional Neural Networks (ConvNets) are commonly developed at a fixed resource budget, and then scaled up for better accuracy if more resources are available. In this paper, we systematically study model scaling and identify that carefully balancing network depth, width, and resolution can lead to better performance. Based on this observation, we propose a new scaling method that uniformly scales all dimensions of depth/width/resolution using a simple yet highly effective compound coefficient. We demonstrate the effectiveness of this method on scaling up MobileNets and ResNet. + +To go even further, we use neural architecture search to design a new baseline network and scale it up to obtain a family of models, called EfficientNets, which achieve much better accuracy and efficiency than previous ConvNets. In particular, our EfficientNet-B7 achieves state-of-the-art 84.3% top-1 accuracy on ImageNet, while being 8.4x smaller and 6.1x faster on inference than the best existing ConvNet. Our EfficientNets also transfer well and achieve state-of-the-art accuracy on CIFAR-100 (91.7%), Flowers (98.8%), and 3 other transfer learning datasets, with an order of magnitude fewer parameters. 
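As an editorial aside (not part of the upstream README): the compound scaling described above is what `model_scaling` in this patch implements. Per-stage channel counts are multiplied by an architecture-specific width factor and rounded to a multiple of 8, and per-stage block counts are multiplied by a depth factor and rounded up. A minimal sketch of that rounding rule; `scale_stage` is a hypothetical helper name introduced only for illustration:

```python
import math


def make_divisible(value, divisor=8):
    # Round `value` to the nearest multiple of `divisor`,
    # never dropping below 90% of the original value.
    new_value = max(divisor, int(value + divisor / 2) // divisor * divisor)
    if new_value < 0.9 * value:
        new_value += divisor
    return new_value


def scale_stage(out_channels, num_blocks, width_factor, depth_factor):
    """Apply EfficientNet-style compound scaling to one stage."""
    return (make_divisible(out_channels * width_factor),
            int(math.ceil(depth_factor * num_blocks)))


# 'b3' uses width_factor=1.2 and depth_factor=1.4 in this patch:
# a 40-channel, 2-block stage becomes a 48-channel, 3-block stage.
print(scale_stage(40, 2, 1.2, 1.4))  # (48, 3)
```

Under this rule the b3 FPN input channels used by the RetinaNet config in this series, `[48, 136, 384]`, follow directly from the base stage widths 40, 112 and 320.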
+ +## Results and Models + +### RetinaNet + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :-------------: | :-----: | :-----: | :------: | :------------: | :----: | :------: | :--------: | +|Efficientnet-b3 | pytorch | 1x | - | - | 40.5 |[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/efficientnet/retinanet_effb3_fpn_crop896_8x4_1x_coco.py) | [model]() | [log]() | + +## Citation + +```latex +@article{tan2019efficientnet, + title={Efficientnet: Rethinking model scaling for convolutional neural networks}, + author={Tan, Mingxing and Le, Quoc V}, + journal={arXiv preprint arXiv:1905.11946}, + year={2019} +} +``` diff --git a/configs/efficientnet/metafile.yml b/configs/efficientnet/metafile.yml new file mode 100644 index 00000000000..5b05a6c11d5 --- /dev/null +++ b/configs/efficientnet/metafile.yml @@ -0,0 +1,19 @@ +Models: + - Name: retinanet_effb3_fpn_crop896_8x4_1x_coco + In Collection: RetinaNet + Config: configs/efficientnet/retinanet_effb3_fpn_crop896_8x4_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.5 + Weights: url + Paper: + URL: https://arxiv.org/abs/1905.11946v5 + Title: 'EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks' + README: configs/efficientnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.23.0/mmdet/models/backbones/efficientnet.py#L159 + Version: v2.23.0 diff --git a/configs/efficientnet/retinanet_effb3_fpn_crop896_8x4_1x_coco.py b/configs/efficientnet/retinanet_effb3_fpn_crop896_8x4_1x_coco.py new file mode 100644 index 00000000000..465b3dc1f58 --- /dev/null +++ b/configs/efficientnet/retinanet_effb3_fpn_crop896_8x4_1x_coco.py @@ -0,0 +1,93 @@ +_base_ = [ + '../_base_/models/retinanet_r50_fpn.py', + '../_base_/datasets/coco_detection.py', '../_base_/default_runtime.py' +] + +cudnn_benchmark = True +norm_cfg = dict(type='BN', requires_grad=True) +checkpoint = 'https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b3_3rdparty_8xb32-aa_in1k_20220119-5b4887a0.pth' # noqa +model = dict( + backbone=dict( + _delete_=True, + type='EfficientNet', + arch='b3', + drop_path_rate=0.2, + out_indices=(3, 4, 5), + frozen_stages=0, + norm_cfg=dict( + type='SyncBN', requires_grad=True, eps=1e-3, momentum=0.01), + norm_eval=False, + init_cfg=dict( + type='Pretrained', prefix='backbone', checkpoint=checkpoint)), + neck=dict( + in_channels=[48, 136, 384], + start_level=0, + out_channels=256, + relu_before_extra_convs=True, + no_norm_on_lateral=True, + norm_cfg=norm_cfg), + bbox_head=dict(type='RetinaSepBNHead', num_ins=5, norm_cfg=norm_cfg), + # training and testing settings + train_cfg=dict(assigner=dict(neg_iou_thr=0.5))) + +# dataset settings +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +img_size = (896, 896) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Resize', + img_scale=img_size, + ratio_range=(0.8, 1.2), + keep_ratio=True), + dict(type='RandomCrop', crop_size=img_size), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size=img_size), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=img_size, + flip=False, + transforms=[ + 
dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size=img_size), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=4, + workers_per_gpu=4, + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) +# optimizer +optimizer_config = dict(grad_clip=None) +optimizer = dict( + type='SGD', + lr=0.04, + momentum=0.9, + weight_decay=0.0001, + paramwise_cfg=dict(norm_decay_mult=0, bypass_duplicate=True)) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1000, + warmup_ratio=0.1, + step=[8, 11]) +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=12) + +# NOTE: This variable is for automatically scaling LR, +# USER SHOULD NOT CHANGE THIS VALUE. +default_batch_size = 32 # (8 GPUs) x (4 samples per GPU) diff --git a/mmdet/models/backbones/__init__.py b/mmdet/models/backbones/__init__.py index 8a4e9cd1b2d..91b50d254a8 100644 --- a/mmdet/models/backbones/__init__.py +++ b/mmdet/models/backbones/__init__.py @@ -3,6 +3,7 @@ from .darknet import Darknet from .detectors_resnet import DetectoRS_ResNet from .detectors_resnext import DetectoRS_ResNeXt +from .efficientnet import EfficientNet from .hourglass import HourglassNet from .hrnet import HRNet from .mobilenet_v2 import MobileNetV2 @@ -20,5 +21,6 @@ 'RegNet', 'ResNet', 'ResNetV1d', 'ResNeXt', 'SSDVGG', 'HRNet', 'MobileNetV2', 'Res2Net', 'HourglassNet', 'DetectoRS_ResNet', 'DetectoRS_ResNeXt', 'Darknet', 'ResNeSt', 'TridentResNet', 'CSPDarknet', - 'SwinTransformer', 'PyramidVisionTransformer', 'PyramidVisionTransformerV2' + 'SwinTransformer', 'PyramidVisionTransformer', + 'PyramidVisionTransformerV2', 'EfficientNet' ] diff --git a/mmdet/models/backbones/efficientnet.py b/mmdet/models/backbones/efficientnet.py new file mode 100644 index 00000000000..7ee359567d9 --- /dev/null +++ b/mmdet/models/backbones/efficientnet.py @@ -0,0 +1,417 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import math +from functools import partial + +import torch +import torch.nn as nn +import torch.utils.checkpoint as cp +from mmcv.cnn.bricks import ConvModule, DropPath +from mmcv.runner import BaseModule, Sequential + +from ..builder import BACKBONES +from ..utils import InvertedResidual, SELayer, make_divisible + + +class EdgeResidual(BaseModule): + """Edge Residual Block. + + Args: + in_channels (int): The input channels of this module. + out_channels (int): The output channels of this module. + mid_channels (int): The input channels of the second convolution. + kernel_size (int): The kernel size of the first convolution. + Defaults to 3. + stride (int): The stride of the first convolution. Defaults to 1. + se_cfg (dict, optional): Config dict for se layer. Defaults to None, + which means no se layer. + with_residual (bool): Use residual connection. Defaults to True. + conv_cfg (dict, optional): Config dict for convolution layer. + Defaults to None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Defaults to ``dict(type='BN')``. + act_cfg (dict): Config dict for activation layer. + Defaults to ``dict(type='ReLU')``. + drop_path_rate (float): stochastic depth rate. Defaults to 0. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Defaults to False. 
+ init_cfg (dict | list[dict], optional): Initialization config dict. + """ + + def __init__(self, + in_channels, + out_channels, + mid_channels, + kernel_size=3, + stride=1, + se_cfg=None, + with_residual=True, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + drop_path_rate=0., + with_cp=False, + init_cfg=None, + **kwargs): + super(EdgeResidual, self).__init__(init_cfg=init_cfg) + assert stride in [1, 2] + self.with_cp = with_cp + self.drop_path = DropPath( + drop_path_rate) if drop_path_rate > 0 else nn.Identity() + self.with_se = se_cfg is not None + self.with_residual = ( + stride == 1 and in_channels == out_channels and with_residual) + + if self.with_se: + assert isinstance(se_cfg, dict) + + self.conv1 = ConvModule( + in_channels=in_channels, + out_channels=mid_channels, + kernel_size=kernel_size, + stride=1, + padding=kernel_size // 2, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + if self.with_se: + self.se = SELayer(**se_cfg) + + self.conv2 = ConvModule( + in_channels=mid_channels, + out_channels=out_channels, + kernel_size=1, + stride=stride, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None) + + def forward(self, x): + + def _inner_forward(x): + out = x + out = self.conv1(out) + + if self.with_se: + out = self.se(out) + + out = self.conv2(out) + + if self.with_residual: + return x + self.drop_path(out) + else: + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + return out + + +def model_scaling(layer_setting, arch_setting): + """Scaling operation to the layer's parameters according to the + arch_setting.""" + # scale width + new_layer_setting = copy.deepcopy(layer_setting) + for layer_cfg in new_layer_setting: + for block_cfg in layer_cfg: + block_cfg[1] = make_divisible(block_cfg[1] * arch_setting[0], 8) + + # scale depth + split_layer_setting = [new_layer_setting[0]] + for layer_cfg in new_layer_setting[1:-1]: + tmp_index = [0] + for i in range(len(layer_cfg) - 1): + if layer_cfg[i + 1][1] != layer_cfg[i][1]: + tmp_index.append(i + 1) + tmp_index.append(len(layer_cfg)) + for i in range(len(tmp_index) - 1): + split_layer_setting.append(layer_cfg[tmp_index[i]:tmp_index[i + + 1]]) + split_layer_setting.append(new_layer_setting[-1]) + + num_of_layers = [len(layer_cfg) for layer_cfg in split_layer_setting[1:-1]] + new_layers = [ + int(math.ceil(arch_setting[1] * num)) for num in num_of_layers + ] + + merge_layer_setting = [split_layer_setting[0]] + for i, layer_cfg in enumerate(split_layer_setting[1:-1]): + if new_layers[i] <= num_of_layers[i]: + tmp_layer_cfg = layer_cfg[:new_layers[i]] + else: + tmp_layer_cfg = copy.deepcopy(layer_cfg) + [layer_cfg[-1]] * ( + new_layers[i] - num_of_layers[i]) + if tmp_layer_cfg[0][3] == 1 and i != 0: + merge_layer_setting[-1] += tmp_layer_cfg.copy() + else: + merge_layer_setting.append(tmp_layer_cfg.copy()) + merge_layer_setting.append(split_layer_setting[-1]) + + return merge_layer_setting + + +@BACKBONES.register_module() +class EfficientNet(BaseModule): + """EfficientNet backbone. + + Args: + arch (str): Architecture of efficientnet. Defaults to b0. + out_indices (Sequence[int]): Output from which stages. + Defaults to (6, ). + frozen_stages (int): Stages to be frozen (all param fixed). + Defaults to 0, which means not freezing any parameters. + conv_cfg (dict): Config dict for convolution layer. + Defaults to None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. 
+ Defaults to dict(type='BN'). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='Swish'). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Defaults to False. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Defaults to False. + """ + + # Parameters to build layers. + # 'b' represents the architecture of normal EfficientNet family includes + # 'b0', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7', 'b8'. + # 'e' represents the architecture of EfficientNet-EdgeTPU including 'es', + # 'em', 'el'. + # 6 parameters are needed to construct a layer, From left to right: + # - kernel_size: The kernel size of the block + # - out_channel: The number of out_channels of the block + # - se_ratio: The sequeeze ratio of SELayer. + # - stride: The stride of the block + # - expand_ratio: The expand_ratio of the mid_channels + # - block_type: -1: Not a block, 0: InvertedResidual, 1: EdgeResidual + layer_settings = { + 'b': [[[3, 32, 0, 2, 0, -1]], + [[3, 16, 4, 1, 1, 0]], + [[3, 24, 4, 2, 6, 0], + [3, 24, 4, 1, 6, 0]], + [[5, 40, 4, 2, 6, 0], + [5, 40, 4, 1, 6, 0]], + [[3, 80, 4, 2, 6, 0], + [3, 80, 4, 1, 6, 0], + [3, 80, 4, 1, 6, 0], + [5, 112, 4, 1, 6, 0], + [5, 112, 4, 1, 6, 0], + [5, 112, 4, 1, 6, 0]], + [[5, 192, 4, 2, 6, 0], + [5, 192, 4, 1, 6, 0], + [5, 192, 4, 1, 6, 0], + [5, 192, 4, 1, 6, 0], + [3, 320, 4, 1, 6, 0]], + [[1, 1280, 0, 1, 0, -1]] + ], + 'e': [[[3, 32, 0, 2, 0, -1]], + [[3, 24, 0, 1, 3, 1]], + [[3, 32, 0, 2, 8, 1], + [3, 32, 0, 1, 8, 1]], + [[3, 48, 0, 2, 8, 1], + [3, 48, 0, 1, 8, 1], + [3, 48, 0, 1, 8, 1], + [3, 48, 0, 1, 8, 1]], + [[5, 96, 0, 2, 8, 0], + [5, 96, 0, 1, 8, 0], + [5, 96, 0, 1, 8, 0], + [5, 96, 0, 1, 8, 0], + [5, 96, 0, 1, 8, 0], + [5, 144, 0, 1, 8, 0], + [5, 144, 0, 1, 8, 0], + [5, 144, 0, 1, 8, 0], + [5, 144, 0, 1, 8, 0]], + [[5, 192, 0, 2, 8, 0], + [5, 192, 0, 1, 8, 0]], + [[1, 1280, 0, 1, 0, -1]] + ] + } # yapf: disable + + # Parameters to build different kinds of architecture. + # From left to right: scaling factor for width, scaling factor for depth, + # resolution. + arch_settings = { + 'b0': (1.0, 1.0, 224), + 'b1': (1.0, 1.1, 240), + 'b2': (1.1, 1.2, 260), + 'b3': (1.2, 1.4, 300), + 'b4': (1.4, 1.8, 380), + 'b5': (1.6, 2.2, 456), + 'b6': (1.8, 2.6, 528), + 'b7': (2.0, 3.1, 600), + 'b8': (2.2, 3.6, 672), + 'es': (1.0, 1.0, 224), + 'em': (1.0, 1.1, 240), + 'el': (1.2, 1.4, 300) + } + + def __init__(self, + arch='b0', + drop_path_rate=0., + out_indices=(6, ), + frozen_stages=0, + conv_cfg=dict(type='Conv2dAdaptivePadding'), + norm_cfg=dict(type='BN', eps=1e-3), + act_cfg=dict(type='Swish'), + norm_eval=False, + with_cp=False, + init_cfg=[ + dict(type='Kaiming', layer='Conv2d'), + dict( + type='Constant', + layer=['_BatchNorm', 'GroupNorm'], + val=1) + ]): + super(EfficientNet, self).__init__(init_cfg) + assert arch in self.arch_settings, \ + f'"{arch}" is not one of the arch_settings ' \ + f'({", ".join(self.arch_settings.keys())})' + self.arch_setting = self.arch_settings[arch] + self.layer_setting = self.layer_settings[arch[:1]] + for index in out_indices: + if index not in range(0, len(self.layer_setting)): + raise ValueError('the item in out_indices must in ' + f'range(0, {len(self.layer_setting)}). 
' + f'But received {index}') + + if frozen_stages not in range(len(self.layer_setting) + 1): + raise ValueError('frozen_stages must be in range(0, ' + f'{len(self.layer_setting) + 1}). ' + f'But received {frozen_stages}') + self.drop_path_rate = drop_path_rate + self.out_indices = out_indices + self.frozen_stages = frozen_stages + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.norm_eval = norm_eval + self.with_cp = with_cp + + self.layer_setting = model_scaling(self.layer_setting, + self.arch_setting) + block_cfg_0 = self.layer_setting[0][0] + block_cfg_last = self.layer_setting[-1][0] + self.in_channels = make_divisible(block_cfg_0[1], 8) + self.out_channels = block_cfg_last[1] + self.layers = nn.ModuleList() + self.layers.append( + ConvModule( + in_channels=3, + out_channels=self.in_channels, + kernel_size=block_cfg_0[0], + stride=block_cfg_0[3], + padding=block_cfg_0[0] // 2, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + self.make_layer() + # Avoid building unused layers in mmdetection. + if len(self.layers) < max(self.out_indices) + 1: + self.layers.append( + ConvModule( + in_channels=self.in_channels, + out_channels=self.out_channels, + kernel_size=block_cfg_last[0], + stride=block_cfg_last[3], + padding=block_cfg_last[0] // 2, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + + def make_layer(self): + # Without the first and the final conv block. + layer_setting = self.layer_setting[1:-1] + + total_num_blocks = sum([len(x) for x in layer_setting]) + block_idx = 0 + dpr = [ + x.item() + for x in torch.linspace(0, self.drop_path_rate, total_num_blocks) + ] # stochastic depth decay rule + + for i, layer_cfg in enumerate(layer_setting): + # Avoid building unused layers in mmdetection. + if i > max(self.out_indices) - 1: + break + layer = [] + for i, block_cfg in enumerate(layer_cfg): + (kernel_size, out_channels, se_ratio, stride, expand_ratio, + block_type) = block_cfg + + mid_channels = int(self.in_channels * expand_ratio) + out_channels = make_divisible(out_channels, 8) + if se_ratio <= 0: + se_cfg = None + else: + # In mmdetection, the `divisor` is deleted to align + # the logic of SELayer with mmcls. + se_cfg = dict( + channels=mid_channels, + ratio=expand_ratio * se_ratio, + act_cfg=(self.act_cfg, dict(type='Sigmoid'))) + if block_type == 1: # edge tpu + if i > 0 and expand_ratio == 3: + with_residual = False + expand_ratio = 4 + else: + with_residual = True + mid_channels = int(self.in_channels * expand_ratio) + if se_cfg is not None: + # In mmdetection, the `divisor` is deleted to align + # the logic of SELayer with mmcls. + se_cfg = dict( + channels=mid_channels, + ratio=se_ratio * expand_ratio, + act_cfg=(self.act_cfg, dict(type='Sigmoid'))) + block = partial(EdgeResidual, with_residual=with_residual) + else: + block = InvertedResidual + layer.append( + block( + in_channels=self.in_channels, + out_channels=out_channels, + mid_channels=mid_channels, + kernel_size=kernel_size, + stride=stride, + se_cfg=se_cfg, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + drop_path_rate=dpr[block_idx], + with_cp=self.with_cp, + # In mmdetection, `with_expand_conv` is set to align + # the logic of InvertedResidual with mmcls. 
+ with_expand_conv=(mid_channels != self.in_channels))) + self.in_channels = out_channels + block_idx += 1 + self.layers.append(Sequential(*layer)) + + def forward(self, x): + outs = [] + for i, layer in enumerate(self.layers): + x = layer(x) + if i in self.out_indices: + outs.append(x) + + return tuple(outs) + + def _freeze_stages(self): + for i in range(self.frozen_stages): + m = self.layers[i] + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def train(self, mode=True): + super(EfficientNet, self).train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, nn.BatchNorm2d): + m.eval() diff --git a/mmdet/models/utils/inverted_residual.py b/mmdet/models/utils/inverted_residual.py index bc9ce68955d..1f241ae3e43 100644 --- a/mmdet/models/utils/inverted_residual.py +++ b/mmdet/models/utils/inverted_residual.py @@ -1,6 +1,8 @@ # Copyright (c) OpenMMLab. All rights reserved. +import torch.nn as nn import torch.utils.checkpoint as cp from mmcv.cnn import ConvModule +from mmcv.cnn.bricks import DropPath from mmcv.runner import BaseModule from .se_layer import SELayer @@ -27,6 +29,7 @@ class InvertedResidual(BaseModule): Default: dict(type='BN'). act_cfg (dict): Config dict for activation layer. Default: dict(type='ReLU'). + drop_path_rate (float): stochastic depth rate. Defaults to 0. with_cp (bool): Use checkpoint or not. Using checkpoint will save some memory while slowing down the training speed. Default: False. init_cfg (dict or list[dict], optional): Initialization config dict. @@ -47,6 +50,7 @@ def __init__(self, conv_cfg=None, norm_cfg=dict(type='BN'), act_cfg=dict(type='ReLU'), + drop_path_rate=0., with_cp=False, init_cfg=None): super(InvertedResidual, self).__init__(init_cfg) @@ -54,6 +58,8 @@ def __init__(self, assert stride in [1, 2], f'stride must in [1, 2]. ' \ f'But received {stride}.' 
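To make the `drop_path_rate` plumbing concrete, here is a minimal, self-contained sketch of the stochastic-depth pattern that both `EdgeResidual` and `InvertedResidual` now share (it only assumes `mmcv`'s `DropPath`; the shapes and drop probability are made up):

```python
import torch
import torch.nn as nn
from mmcv.cnn.bricks import DropPath

# With drop_prob > 0, DropPath zeroes the residual branch per sample during
# training (rescaling survivors by 1 / (1 - drop_prob)); in eval mode it is
# an identity, so the block reduces to the plain residual sum.
drop_path = DropPath(drop_prob=0.2)
branch = nn.Identity()  # stand-in for the conv branch of a residual block

x = torch.randn(4, 8, 16, 16)
drop_path.train()
out_train = x + drop_path(branch(x))  # some samples keep only the shortcut
drop_path.eval()
out_eval = x + drop_path(branch(x))   # exactly x + branch(x)
```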
self.with_cp = with_cp + self.drop_path = DropPath( + drop_path_rate) if drop_path_rate > 0 else nn.Identity() self.with_se = se_cfg is not None self.with_expand_conv = with_expand_conv @@ -112,7 +118,7 @@ def _inner_forward(x): out = self.linear_conv(out) if self.with_res_shortcut: - return x + out + return x + self.drop_path(out) else: return out diff --git a/model-index.yml b/model-index.yml index e05ab8d2964..cecf50a8568 100644 --- a/model-index.yml +++ b/model-index.yml @@ -15,6 +15,7 @@ Import: - configs/double_heads/metafile.yml - configs/dyhead/metafile.yml - configs/dynamic_rcnn/metafile.yml + - configs/efficientnet/metafile.yml - configs/empirical_attention/metafile.yml - configs/faster_rcnn/metafile.yml - configs/fcos/metafile.yml diff --git a/tests/test_models/test_backbones/test_efficientnet.py b/tests/test_models/test_backbones/test_efficientnet.py new file mode 100644 index 00000000000..aa217704819 --- /dev/null +++ b/tests/test_models/test_backbones/test_efficientnet.py @@ -0,0 +1,25 @@ +import pytest +import torch + +from mmdet.models.backbones import EfficientNet + + +def test_efficientnet_backbone(): + """Test EfficientNet backbone.""" + with pytest.raises(AssertionError): + # EfficientNet arch should be a key in EfficientNet.arch_settings + EfficientNet(arch='c3') + + model = EfficientNet(arch='b0', out_indices=(0, 1, 2, 3, 4, 5, 6)) + model.train() + + imgs = torch.randn(2, 3, 32, 32) + feat = model(imgs) + assert len(feat) == 7 + assert feat[0].shape == torch.Size([2, 32, 16, 16]) + assert feat[1].shape == torch.Size([2, 16, 16, 16]) + assert feat[2].shape == torch.Size([2, 24, 8, 8]) + assert feat[3].shape == torch.Size([2, 40, 4, 4]) + assert feat[4].shape == torch.Size([2, 112, 2, 2]) + assert feat[5].shape == torch.Size([2, 320, 1, 1]) + assert feat[6].shape == torch.Size([2, 1280, 1, 1]) From 3b2e9655631a2edd28bb94c640bd6a74c0bfad55 Mon Sep 17 00:00:00 2001 From: RangiLyu Date: Fri, 25 Mar 2022 23:01:32 +0800 Subject: [PATCH 32/42] [Fix] Fix reduction=mean in CELoss. (#7449) * [Fix] Fix ignore in CELoss. * add ut * fix and add comments * add avg_non_ignore option * bce avg * fix lint --- .../dense_heads/free_anchor_retina_head.py | 3 +- mmdet/models/losses/cross_entropy_loss.py | 61 ++++++++++++++++--- mmdet/models/losses/utils.py | 6 +- tests/test_models/test_loss.py | 31 +++++++--- 4 files changed, 81 insertions(+), 20 deletions(-) diff --git a/mmdet/models/dense_heads/free_anchor_retina_head.py b/mmdet/models/dense_heads/free_anchor_retina_head.py index fa4238974da..3acd25ecba4 100644 --- a/mmdet/models/dense_heads/free_anchor_retina_head.py +++ b/mmdet/models/dense_heads/free_anchor_retina_head.py @@ -79,7 +79,8 @@ def loss(self, featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] assert len(featmap_sizes) == self.prior_generator.num_levels device = cls_scores[0].device - anchor_list, _ = self.get_anchors(featmap_sizes, img_metas, device=device) + anchor_list, _ = self.get_anchors( + featmap_sizes, img_metas, device=device) anchors = [torch.cat(anchor) for anchor in anchor_list] # concatenate each level diff --git a/mmdet/models/losses/cross_entropy_loss.py b/mmdet/models/losses/cross_entropy_loss.py index 5777aebd290..97f12e50375 100644 --- a/mmdet/models/losses/cross_entropy_loss.py +++ b/mmdet/models/losses/cross_entropy_loss.py @@ -1,4 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. 
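Before the loss hunks below, the averaging mismatch that the new `avg_non_ignore` option addresses is easy to reproduce with plain PyTorch (a standalone sketch; the 255 ignore label and tensor sizes are arbitrary):

```python
import torch
import torch.nn.functional as F

pred = torch.randn(4, 3)
label = torch.tensor([0, 2, 255, 255])  # two valid labels, two ignored

per_elem = F.cross_entropy(pred, label, ignore_index=255, reduction='none')
mean_over_all = per_elem.sum() / label.numel()  # averages over 4 elements
mean_over_valid = F.cross_entropy(pred, label, ignore_index=255)  # over 2

# The two means differ by the fraction of ignored labels; setting
# avg_non_ignore=True makes the mmdet loss follow the second convention.
print(mean_over_all, mean_over_valid)
```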
+import warnings
+
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -13,7 +15,8 @@ def cross_entropy(pred,
                   reduction='mean',
                   avg_factor=None,
                   class_weight=None,
-                  ignore_index=-100):
+                  ignore_index=-100,
+                  avg_non_ignore=False):
     """Calculate the CrossEntropy loss.
 
     Args:
@@ -27,6 +30,8 @@ def cross_entropy(pred,
         class_weight (list[float], optional): The weight for each class.
         ignore_index (int | None): The label index to be ignored.
             If None, it will be set to default value. Default: -100.
+        avg_non_ignore (bool): Whether the loss is only averaged over
+            non-ignored targets. Default: False.
 
     Returns:
         torch.Tensor: The calculated loss
@@ -41,6 +46,12 @@ def cross_entropy(pred,
         reduction='none',
         ignore_index=ignore_index)
 
+    # average loss over non-ignored elements
+    # PyTorch's official cross_entropy averages loss over non-ignored elements
+    # refer to https://github.com/pytorch/pytorch/blob/56b43f4fec1f76953f15a627694d4bba34588969/torch/nn/functional.py#L2660 # noqa
+    if (avg_factor is None) and avg_non_ignore and reduction == 'mean':
+        avg_factor = label.numel() - (label == ignore_index).sum().item()
+
     # apply weights and do the reduction
     if weight is not None:
         weight = weight.float()
@@ -68,7 +79,7 @@ def _expand_onehot_labels(labels, label_weights, label_channels, ignore_index):
     bin_label_weights = label_weights.view(-1, 1).repeat(1, label_channels)
     bin_label_weights *= valid_mask
 
-    return bin_labels, bin_label_weights
+    return bin_labels, bin_label_weights, valid_mask
 
 
 def binary_cross_entropy(pred,
@@ -77,7 +88,8 @@ def binary_cross_entropy(pred,
                          reduction='mean',
                          avg_factor=None,
                          class_weight=None,
-                         ignore_index=-100):
+                         ignore_index=-100,
+                         avg_non_ignore=False):
     """Calculate the binary CrossEntropy loss.
 
     Args:
@@ -95,19 +107,32 @@ def binary_cross_entropy(pred,
         class_weight (list[float], optional): The weight for each class.
         ignore_index (int | None): The label index to be ignored.
             If None, it will be set to default value. Default: -100.
+        avg_non_ignore (bool): Whether the loss is only averaged over
+            non-ignored targets. Default: False.
 
     Returns:
         torch.Tensor: The calculated loss.
     """
     # The default value of ignore_index is the same as F.cross_entropy
     ignore_index = -100 if ignore_index is None else ignore_index
+
     if pred.dim() != label.dim():
-        label, weight = _expand_onehot_labels(label, weight, pred.size(-1),
-                                              ignore_index)
+        label, weight, valid_mask = _expand_onehot_labels(
+            label, weight, pred.size(-1), ignore_index)
+    else:
+        # should mask out the ignored elements
+        valid_mask = ((label >= 0) & (label != ignore_index)).float()
+        if weight is not None:
+            weight *= valid_mask
+        else:
+            weight = valid_mask
+
+    # average loss over non-ignored elements
+    if (avg_factor is None) and avg_non_ignore and reduction == 'mean':
+        avg_factor = valid_mask.sum().item()
 
     # weighted element-wise losses
-    if weight is not None:
-        weight = weight.float()
+    weight = weight.float()
     loss = F.binary_cross_entropy_with_logits(
         pred, label.float(), pos_weight=class_weight, reduction='none')
     # do the reduction for the weighted loss
@@ -123,7 +148,8 @@ def mask_cross_entropy(pred,
                        reduction='mean',
                        avg_factor=None,
                        class_weight=None,
-                       ignore_index=None):
+                       ignore_index=None,
+                       **kwargs):
     """Calculate the CrossEntropy loss for masks.
 
     Args:
@@ -177,7 +203,8 @@ def __init__(self,
                  reduction='mean',
                  class_weight=None,
                  ignore_index=None,
-                 loss_weight=1.0):
+                 loss_weight=1.0,
+                 avg_non_ignore=False):
         """CrossEntropyLoss.
 
         Args:
@@ -192,6 +219,8 @@ def __init__(self,
             ignore_index (int | None): The label index to be ignored.
                 Defaults to None.
             loss_weight (float, optional): Weight of the loss. Defaults to 1.0.
+            avg_non_ignore (bool): Whether the loss is only averaged over
+                non-ignored targets. Default: False.
         """
         super(CrossEntropyLoss, self).__init__()
         assert (use_sigmoid is False) or (use_mask is False)
@@ -201,6 +230,14 @@ def __init__(self,
         self.loss_weight = loss_weight
         self.class_weight = class_weight
         self.ignore_index = ignore_index
+        self.avg_non_ignore = avg_non_ignore
+        if ((ignore_index is not None) and not self.avg_non_ignore
+                and self.reduction == 'mean'):
+            warnings.warn(
+                'Default ``avg_non_ignore`` is False. If you would like to '
+                'ignore certain labels and average the loss over '
+                'non-ignored labels, which is consistent with PyTorch '
+                'official cross_entropy, set ``avg_non_ignore=True``.')
 
         if self.use_sigmoid:
             self.cls_criterion = binary_cross_entropy
@@ -209,6 +246,11 @@ def __init__(self,
         else:
             self.cls_criterion = cross_entropy
 
+    def extra_repr(self):
+        """Extra repr."""
+        s = f'avg_non_ignore={self.avg_non_ignore}'
+        return s
+
     def forward(self,
                 cls_score,
                 label,
@@ -251,5 +293,6 @@ def forward(self,
                 reduction=reduction,
                 avg_factor=avg_factor,
                 ignore_index=ignore_index,
+                avg_non_ignore=self.avg_non_ignore,
                 **kwargs)
         return loss_cls
diff --git a/mmdet/models/losses/utils.py b/mmdet/models/losses/utils.py
index a7ae7e215bc..778237ebfd5 100644
--- a/mmdet/models/losses/utils.py
+++ b/mmdet/models/losses/utils.py
@@ -2,6 +2,7 @@
 import functools
 
 import mmcv
+import torch
 import torch.nn.functional as F
 
 
@@ -48,7 +49,10 @@ def weight_reduce_loss(loss, weight=None, reduction='mean', avg_factor=None):
     else:
         # if reduction is mean, then average the loss by avg_factor
         if reduction == 'mean':
-            loss = loss.sum() / avg_factor
+            # Avoid causing ZeroDivisionError when avg_factor is 0.0,
+            # i.e., all labels of an image belong to ignore index.
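In isolation, the divide-by-zero guard added in this hunk behaves as follows (a standalone sketch; the all-ignored batch is contrived):

```python
import torch

# If every label of an image is ignored, avg_factor is 0 and a bare
# loss.sum() / avg_factor would blow up; adding float32 eps keeps the
# mean reduction finite, and an all-weighted-out loss stays exactly 0.
eps = torch.finfo(torch.float32).eps
loss_sum = torch.tensor(0.)  # sum of per-element losses, all masked out
avg_factor = 0.0             # no non-ignored labels contributed
print(loss_sum / (avg_factor + eps))  # tensor(0.), not nan/inf
```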
+ eps = torch.finfo(torch.float32).eps + loss = loss.sum() / (avg_factor + eps) # if reduction is 'none', then do nothing, otherwise raise an error elif reduction != 'none': raise ValueError('avg_factor can not be used with reduction="sum"') diff --git a/tests/test_models/test_loss.py b/tests/test_models/test_loss.py index 380bc3263f7..280f3f6ddec 100644 --- a/tests/test_models/test_loss.py +++ b/tests/test_models/test_loss.py @@ -135,10 +135,15 @@ def test_GHMR_loss(loss_class, input_shape): @pytest.mark.parametrize('use_sigmoid', [True, False]) -def test_loss_with_ignore_index(use_sigmoid): +@pytest.mark.parametrize('reduction', ['sum', 'mean', None]) +@pytest.mark.parametrize('avg_non_ignore', [True, False]) +def test_loss_with_ignore_index(use_sigmoid, reduction, avg_non_ignore): # Test cross_entropy loss loss_class = CrossEntropyLoss( - use_sigmoid=use_sigmoid, use_mask=False, ignore_index=255) + use_sigmoid=use_sigmoid, + use_mask=False, + ignore_index=255, + avg_non_ignore=avg_non_ignore) pred = torch.rand((10, 5)) target = torch.randint(0, 5, (10, )) @@ -146,24 +151,32 @@ def test_loss_with_ignore_index(use_sigmoid): target[ignored_indices] = 255 # Test loss forward with default ignore - loss_with_ignore = loss_class(pred, target, reduction_override='sum') + loss_with_ignore = loss_class(pred, target, reduction_override=reduction) assert isinstance(loss_with_ignore, torch.Tensor) # Test loss forward with forward ignore - target[ignored_indices] = 250 + target[ignored_indices] = 255 loss_with_forward_ignore = loss_class( - pred, target, ignore_index=250, reduction_override='sum') + pred, target, ignore_index=255, reduction_override=reduction) assert isinstance(loss_with_forward_ignore, torch.Tensor) # Verify correctness - not_ignored_indices = (target != 250) - pred = pred[not_ignored_indices] - target = target[not_ignored_indices] - loss = loss_class(pred, target, reduction_override='sum') + if avg_non_ignore: + # manually remove the ignored elements + not_ignored_indices = (target != 255) + pred = pred[not_ignored_indices] + target = target[not_ignored_indices] + loss = loss_class(pred, target, reduction_override=reduction) assert torch.allclose(loss, loss_with_ignore) assert torch.allclose(loss, loss_with_forward_ignore) + # test ignore all target + pred = torch.rand((10, 5)) + target = torch.ones((10, ), dtype=torch.long) * 255 + loss = loss_class(pred, target, reduction_override=reduction) + assert loss == 0 + @pytest.mark.parametrize('naive_dice', [True, False]) def test_dice_loss(naive_dice): From 1c8cdef7af1f121275b0676afaf35c0ab6dbbaa5 Mon Sep 17 00:00:00 2001 From: BigDong Date: Fri, 25 Mar 2022 23:03:17 +0800 Subject: [PATCH 33/42] [Enhance] update pascal voc result (#7503) * [Enhance] update pascal voc result * minor fix --- configs/pascal_voc/README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/configs/pascal_voc/README.md b/configs/pascal_voc/README.md index 514ac5049e5..4e8991a7c2a 100644 --- a/configs/pascal_voc/README.md +++ b/configs/pascal_voc/README.md @@ -19,7 +19,9 @@ This paper describes the dataset and evaluation procedure. 
We review the state-o
 
 | Architecture | Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download |
 |:------------:|:---------:|:-------:|:-------:|:--------:|:--------------:|:------:|:------:|:--------:|
 | Faster R-CNN | R-50 | pytorch | 1x | 2.6 | - | 79.5 |[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/pascal_voc/faster_rcnn_r50_fpn_1x_voc0712.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/pascal_voc/faster_rcnn_r50_fpn_1x_voc0712/faster_rcnn_r50_fpn_1x_voc0712_20200624-c9895d40.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/pascal_voc/faster_rcnn_r50_fpn_1x_voc0712/20200623_015208.log.json) |
-| Retinanet | R-50 | pytorch | 1x | 2.1 | - | 77.3 |[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/pascal_voc/retinanet_r50_fpn_1x_voc0712.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/pascal_voc/retinanet_r50_fpn_1x_voc0712/retinanet_r50_fpn_1x_voc0712_20200617-47cbdd0e.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/pascal_voc/retinanet_r50_fpn_1x_voc0712/retinanet_r50_fpn_1x_voc0712_20200616_014642.log.json) |
+| Retinanet | R-50 | pytorch | 1x | 2.1 | - | 79.2 |[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/pascal_voc/retinanet_r50_fpn_1x_voc0712.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/pascal_voc/retinanet_r50_fpn_1x_voc0712/retinanet_r50_fpn_1x_voc0712_20220320_222034-b30e6097.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/pascal_voc/retinanet_r50_fpn_1x_voc0712/retinanet_r50_fpn_1x_voc0712_20220320_222034.log.json) |
+| SSD300 | VGG16 | - | 120e | - | - | 76.5 |[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/pascal_voc/ssd300_voc0712.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/pascal_voc/ssd300_voc0712/ssd300_voc0712_20220320_194658-17edda1b.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/pascal_voc/ssd300_voc0712/ssd300_voc0712_20220320_194658.log.json) |
+| SSD512 | VGG16 | - | 120e | - | - | 79.5 |[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/pascal_voc/ssd512_voc0712.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/pascal_voc/ssd512_voc0712/ssd512_voc0712_20220320_194717-03cefefe.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/pascal_voc/ssd512_voc0712/ssd512_voc0712_20220320_194717.log.json) |
 
 ## Citation
 

From fc8fb168c584b25827c8e91389455c62c0540e10 Mon Sep 17 00:00:00 2001
From: CCODING
Date: Fri, 25 Mar 2022 23:17:32 +0800
Subject: [PATCH 34/42] [Feature] Support to set data root through commands
 (#7386)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Fix #6915: Support to set data root through commands

* Support #6915: separate function in tools/utils.py, support test.py and browse_dataset.py

* update #6915: refactor the code ref @hhaAndroid advice

* support #6915: fix format problem

* support corresponding scripts and update doc @hhaAndroid

* update misc.py as @ZwwWayne and @hhaAndroid

* Update mmdet/utils/misc.py

Co-authored-by: Haian Huang(深度眸) <1286304229@qq.com>

* fix mmdet/utils/misc.py format problem

Co-authored-by: Haian Huang(深度眸) <1286304229@qq.com>
---
 docs/en/3_exist_data_new_model.md        |  6 ++++
 docs/zh_cn/3_exist_data_new_model.md     |  7 +++++
 mmdet/utils/__init__.py                  |  5 ++--
 mmdet/utils/misc.py                      | 38 ++++++++++++++++++++
 tools/analysis_tools/analyze_results.py  |  5 ++++
tools/analysis_tools/benchmark.py | 5 ++++ tools/analysis_tools/confusion_matrix.py | 5 ++++ tools/analysis_tools/eval_metric.py | 5 ++++ tools/analysis_tools/optimize_anchors.py | 5 +++- tools/misc/browse_dataset.py | 5 ++++ tools/misc/print_config.py | 6 ++++ tools/test.py | 6 +++- tools/train.py | 7 ++++- 13 files changed, 100 insertions(+), 5 deletions(-) diff --git a/docs/en/3_exist_data_new_model.md b/docs/en/3_exist_data_new_model.md index c69afd20386..7201ffdecb6 100644 --- a/docs/en/3_exist_data_new_model.md +++ b/docs/en/3_exist_data_new_model.md @@ -41,6 +41,12 @@ mmdetection ``` +Or you can set your dataset root through +```bash +export MMDET_DATASETS=$data_root +``` +We will replace dataset root with `$MMDET_DATASETS`, so you don't have to modify the corresponding path in config files. + The cityscapes annotations have to be converted into the coco format using `tools/dataset_converters/cityscapes.py`: ```shell diff --git a/docs/zh_cn/3_exist_data_new_model.md b/docs/zh_cn/3_exist_data_new_model.md index 5ac09c01afb..a9c19ca9428 100644 --- a/docs/zh_cn/3_exist_data_new_model.md +++ b/docs/zh_cn/3_exist_data_new_model.md @@ -40,6 +40,13 @@ mmdetection │ │ ├── VOC2012 ``` +你也可以通过如下方式设定数据集根路径 +```bash +export MMDET_DATASETS=$data_root +``` +我们将会使用环境便变量 `$MMDET_DATASETS` 作为数据集的根目录,因此你无需再修改相应配置文件的路径信息。 + + 你需要使用脚本 `tools/dataset_converters/cityscapes.py` 将 cityscapes 标注转化为 coco 标注格式。 ```shell diff --git a/mmdet/utils/__init__.py b/mmdet/utils/__init__.py index 3873ec09c67..a6635d3c0f2 100644 --- a/mmdet/utils/__init__.py +++ b/mmdet/utils/__init__.py @@ -1,10 +1,11 @@ # Copyright (c) OpenMMLab. All rights reserved. from .collect_env import collect_env from .logger import get_caller_name, get_root_logger, log_img_scale -from .misc import find_latest_checkpoint +from .misc import find_latest_checkpoint, update_data_root from .setup_env import setup_multi_processes __all__ = [ 'get_root_logger', 'collect_env', 'find_latest_checkpoint', - 'setup_multi_processes', 'get_caller_name', 'log_img_scale' + 'update_data_root', 'setup_multi_processes', 'get_caller_name', + 'log_img_scale' ] diff --git a/mmdet/utils/misc.py b/mmdet/utils/misc.py index f5c425300e4..4113672acfb 100644 --- a/mmdet/utils/misc.py +++ b/mmdet/utils/misc.py @@ -1,8 +1,12 @@ # Copyright (c) OpenMMLab. All rights reserved. import glob +import os import os.path as osp import warnings +import mmcv +from mmcv.utils import print_log + def find_latest_checkpoint(path, suffix='pth'): """Find the latest checkpoint from the working directory. @@ -36,3 +40,37 @@ def find_latest_checkpoint(path, suffix='pth'): latest = count latest_path = checkpoint return latest_path + + +def update_data_root(cfg, logger=None): + """Update data root according to env MMDET_DATASETS. + + If set env MMDET_DATASETS, update cfg.data_root according to + MMDET_DATASETS. Otherwise, using cfg.data_root as default. + + Args: + cfg (mmcv.Config): The model config need to modify + logger (logging.Logger | str | None): the way to print msg + """ + assert isinstance(cfg, mmcv.Config), \ + f'cfg got wrong type: {type(cfg)}, expected mmcv.Config' + + if 'MMDET_DATASETS' in os.environ: + dst_root = os.environ['MMDET_DATASETS'] + print_log(f'MMDET_DATASETS has been set to be {dst_root}.' 
+ f'Using {dst_root} as data root.') + else: + return + + assert isinstance(cfg, mmcv.Config), \ + f'cfg got wrong type: {type(cfg)}, expected mmcv.Config' + + def update(cfg, src_str, dst_str): + for k, v in cfg.items(): + if isinstance(v, mmcv.ConfigDict): + update(cfg[k], src_str, dst_str) + if isinstance(v, str) and src_str in v: + cfg[k] = v.replace(src_str, dst_str) + + update(cfg.data, cfg.data_root, dst_root) + cfg.data_root = dst_root diff --git a/tools/analysis_tools/analyze_results.py b/tools/analysis_tools/analyze_results.py index cb79587a65c..15db07e41c7 100644 --- a/tools/analysis_tools/analyze_results.py +++ b/tools/analysis_tools/analyze_results.py @@ -9,6 +9,7 @@ from mmdet.core.evaluation import eval_map from mmdet.core.visualization import imshow_gt_det_bboxes from mmdet.datasets import build_dataset, get_loading_pipeline +from mmdet.utils import update_data_root def bbox_map_eval(det_result, annotation): @@ -186,6 +187,10 @@ def main(): mmcv.check_file_exist(args.prediction_path) cfg = Config.fromfile(args.config) + + # update data root according to MMDET_DATASETS + update_data_root(cfg) + if args.cfg_options is not None: cfg.merge_from_dict(args.cfg_options) cfg.data.test.test_mode = True diff --git a/tools/analysis_tools/benchmark.py b/tools/analysis_tools/benchmark.py index 91f34c74063..2be2d14d7b4 100644 --- a/tools/analysis_tools/benchmark.py +++ b/tools/analysis_tools/benchmark.py @@ -13,6 +13,7 @@ from mmdet.datasets import (build_dataloader, build_dataset, replace_ImageToTensor) from mmdet.models import build_detector +from mmdet.utils import update_data_root def parse_args(): @@ -170,6 +171,10 @@ def main(): args = parse_args() cfg = Config.fromfile(args.config) + + # update data root according to MMDET_DATASETS + update_data_root(cfg) + if args.cfg_options is not None: cfg.merge_from_dict(args.cfg_options) diff --git a/tools/analysis_tools/confusion_matrix.py b/tools/analysis_tools/confusion_matrix.py index a2531ba81f9..224b93b314a 100644 --- a/tools/analysis_tools/confusion_matrix.py +++ b/tools/analysis_tools/confusion_matrix.py @@ -10,6 +10,7 @@ from mmdet.core.evaluation.bbox_overlaps import bbox_overlaps from mmdet.datasets import build_dataset +from mmdet.utils import update_data_root def parse_args(): @@ -230,6 +231,10 @@ def main(): args = parse_args() cfg = Config.fromfile(args.config) + + # update data root according to MMDET_DATASETS + update_data_root(cfg) + if args.cfg_options is not None: cfg.merge_from_dict(args.cfg_options) diff --git a/tools/analysis_tools/eval_metric.py b/tools/analysis_tools/eval_metric.py index 1fcdc1c1fa9..a074c9e1850 100644 --- a/tools/analysis_tools/eval_metric.py +++ b/tools/analysis_tools/eval_metric.py @@ -5,6 +5,7 @@ from mmcv import Config, DictAction from mmdet.datasets import build_dataset +from mmdet.utils import update_data_root def parse_args(): @@ -48,6 +49,10 @@ def main(): args = parse_args() cfg = Config.fromfile(args.config) + + # update data root according to MMDET_DATASETS + update_data_root(cfg) + assert args.eval or args.format_only, ( 'Please specify at least one operation (eval/format the results) with ' 'the argument "--eval", "--format-only"') diff --git a/tools/analysis_tools/optimize_anchors.py b/tools/analysis_tools/optimize_anchors.py index d0da0cbc61d..acf72acb26c 100644 --- a/tools/analysis_tools/optimize_anchors.py +++ b/tools/analysis_tools/optimize_anchors.py @@ -29,7 +29,7 @@ from mmdet.core import bbox_cxcywh_to_xyxy, bbox_overlaps, bbox_xyxy_to_cxcywh from mmdet.datasets import 
build_dataset
-from mmdet.utils import get_root_logger
+from mmdet.utils import get_root_logger, update_data_root
 
 
 def parse_args():
@@ -325,6 +325,9 @@ def main():
     cfg = args.config
     cfg = Config.fromfile(cfg)
 
+    # update data root according to MMDET_DATASETS
+    update_data_root(cfg)
+
     input_shape = args.input_shape
     assert len(input_shape) == 2
 
diff --git a/tools/misc/browse_dataset.py b/tools/misc/browse_dataset.py
index 3e70c8b8741..14db64ee050 100644
--- a/tools/misc/browse_dataset.py
+++ b/tools/misc/browse_dataset.py
@@ -11,6 +11,7 @@ from mmdet.core.utils import mask2ndarray
 from mmdet.core.visualization import imshow_det_bboxes
 from mmdet.datasets.builder import build_dataset
+from mmdet.utils import update_data_root
 
 
 def parse_args():
@@ -55,6 +56,10 @@ def skip_pipeline_steps(config):
     ]
 
     cfg = Config.fromfile(config_path)
+
+    # update data root according to MMDET_DATASETS
+    update_data_root(cfg)
+
     if cfg_options is not None:
         cfg.merge_from_dict(cfg_options)
     train_data_cfg = cfg.data.train
diff --git a/tools/misc/print_config.py b/tools/misc/print_config.py
index 1b2cb30c24c..7bb20fa60de 100644
--- a/tools/misc/print_config.py
+++ b/tools/misc/print_config.py
@@ -4,6 +4,8 @@
 
 from mmcv import Config, DictAction
 
+from mmdet.utils import update_data_root
+
 
 def parse_args():
     parser = argparse.ArgumentParser(description='Print the whole config')
@@ -42,6 +44,10 @@ def main():
     args = parse_args()
 
     cfg = Config.fromfile(args.config)
+
+    # update data root according to MMDET_DATASETS
+    update_data_root(cfg)
+
     if args.cfg_options is not None:
         cfg.merge_from_dict(args.cfg_options)
     print(f'Config:\n{cfg.pretty_text}')
diff --git a/tools/test.py b/tools/test.py
index dfbc425869e..baa149d8418 100644
--- a/tools/test.py
+++ b/tools/test.py
@@ -17,7 +17,7 @@ from mmdet.datasets import (build_dataloader, build_dataset,
                             replace_ImageToTensor)
 from mmdet.models import build_detector
-from mmdet.utils import setup_multi_processes
+from mmdet.utils import setup_multi_processes, update_data_root
 
 
 def parse_args():
@@ -133,6 +133,10 @@ def main():
         raise ValueError('The output file must be a pkl file.')
 
     cfg = Config.fromfile(args.config)
+
+    # update data root according to MMDET_DATASETS
+    update_data_root(cfg)
+
     if args.cfg_options is not None:
         cfg.merge_from_dict(args.cfg_options)
 
diff --git a/tools/train.py b/tools/train.py
index 2ccc1c88f84..5f184608f1c 100644
--- a/tools/train.py
+++ b/tools/train.py
@@ -17,7 +17,8 @@ from mmdet.apis import init_random_seed, set_random_seed, train_detector
 from mmdet.datasets import build_dataset
 from mmdet.models import build_detector
-from mmdet.utils import collect_env, get_root_logger, setup_multi_processes
+from mmdet.utils import (collect_env, get_root_logger, setup_multi_processes,
+                         update_data_root)
 
 
 def parse_args():
@@ -103,6 +104,10 @@ def main():
     args = parse_args()
 
     cfg = Config.fromfile(args.config)
+
+    # update data root according to MMDET_DATASETS
+    update_data_root(cfg)
+
     if args.cfg_options is not None:
         cfg.merge_from_dict(args.cfg_options)
 

From 14f0e9585c15c28f0c31dcc3ea352449bbe5eb96 Mon Sep 17 00:00:00 2001
From: Cedric Luo
Date: Fri, 25 Mar 2022 23:58:46 +0800
Subject: [PATCH 35/42] [Feature] Add Mask2Former to mmdet (#6938)

update doc

update doc format

deepcopy pixel_decoder cfg

move mask_pseudo_sampler cfg to config file

move part of postprocess from head to detector

fix bug in postprocessing

move class setting from head to config file

remove if else

move mask2bbox to mask/util

update docstring

update docstring in result2json

fix bug

update class_weight

add maskformer_fusion_head

add maskformer fusion head

update

add cfg for filter_low_score

update maskformer

update class_weight

update config

update unit test

rename param

update comments in config

rename variable, rm arg, update unit tests

update mask2bbox

add unit test for mask2bbox

replace unsqueeze(1) and squeeze(1)

add unit test for maskformer_fusion_head

update docstrings

update docstring

delete \

remove modification to ce loss

update docstring

update docstring

update docstring of ce loss

update unit test

update docstring

update docstring

update docstring

rename

rename

add msdeformattn pixel decoder

maskformer refactor

add strides in config

remove redundant code

remove redundant code

update unit test

update config

update
---
 .../mask2former_r50_lsj_8x2_50e_coco.py       | 253 +++++++++++
 ...ormer_swin-t-p4-w7-224_lsj_8x2_50e_coco.py |  62 +++
 mmdet/core/bbox/match_costs/__init__.py       |   6 +-
 mmdet/core/bbox/match_costs/match_cost.py     |  65 +++
 mmdet/datasets/coco.py                        |   3 -
 mmdet/datasets/coco_panoptic.py               |  17 +-
 mmdet/models/dense_heads/__init__.py          |   4 +-
 mmdet/models/dense_heads/mask2former_head.py  | 430 ++++++++++++++++++
 mmdet/models/detectors/__init__.py            |   3 +-
 mmdet/models/detectors/mask2former.py         |  27 ++
 .../test_dense_heads/test_mask2former_head.py | 216 +++++++++
 tests/test_models/test_forward.py             | 111 +++++
 tests/test_utils/test_assigner.py             |  24 +
 13 files changed, 1212 insertions(+), 9 deletions(-)
 create mode 100644 configs/mask2former/mask2former_r50_lsj_8x2_50e_coco.py
 create mode 100644 configs/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco.py
 create mode 100644 mmdet/models/dense_heads/mask2former_head.py
 create mode 100644 mmdet/models/detectors/mask2former.py
 create mode 100644 tests/test_models/test_dense_heads/test_mask2former_head.py

diff --git a/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco.py b/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco.py
new file mode 100644
index 00000000000..54d138fce3f
--- /dev/null
+++ b/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco.py
@@ -0,0 +1,253 @@
+_base_ = [
+    '../_base_/datasets/coco_panoptic.py', '../_base_/default_runtime.py'
+]
+num_things_classes = 80
+num_stuff_classes = 53
+num_classes = num_things_classes + num_stuff_classes
+model = dict(
+    type='Mask2Former',
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=-1,
+        norm_cfg=dict(type='BN', requires_grad=False),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    panoptic_head=dict(
+        type='Mask2FormerHead',
+        in_channels=[256, 512, 1024, 2048],  # pass to pixel_decoder inside
+        strides=[4, 8, 16, 32],
+        feat_channels=256,
+        out_channels=256,
+        num_things_classes=num_things_classes,
+        num_stuff_classes=num_stuff_classes,
+        num_queries=100,
+        num_transformer_feat_level=3,
+        pixel_decoder=dict(
+            type='MSDeformAttnPixelDecoder',
+            num_outs=3,
+            norm_cfg=dict(type='GN', num_groups=32),
+            act_cfg=dict(type='ReLU'),
+            encoder=dict(
+                type='DetrTransformerEncoder',
+                num_layers=6,
+                transformerlayers=dict(
+                    type='BaseTransformerLayer',
+                    attn_cfgs=dict(
+                        type='MultiScaleDeformableAttention',
+                        embed_dims=256,
+                        num_heads=8,
+                        num_levels=3,
+                        num_points=4,
+                        im2col_step=64,
+                        dropout=0.0,
+                        batch_first=False,
+                        norm_cfg=None,
+                        init_cfg=None),
+                    ffn_cfgs=dict(
+                        type='FFN',
+                        embed_dims=256,
+                        feedforward_channels=1024,
+                        num_fcs=2,
+                        ffn_drop=0.0,
+                        act_cfg=dict(type='ReLU', inplace=True)),
+                    operation_order=('self_attn', 'norm',
'ffn', 'norm')), + init_cfg=None), + positional_encoding=dict( + type='SinePositionalEncoding', num_feats=128, normalize=True), + init_cfg=None), + enforce_decoder_input_project=False, + positional_encoding=dict( + type='SinePositionalEncoding', num_feats=128, normalize=True), + transformer_decoder=dict( + type='DetrTransformerDecoder', + return_intermediate=True, + num_layers=9, + transformerlayers=dict( + type='DetrTransformerDecoderLayer', + attn_cfgs=dict( + type='MultiheadAttention', + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=False), + ffn_cfgs=dict( + embed_dims=256, + feedforward_channels=2048, + num_fcs=2, + act_cfg=dict(type='ReLU', inplace=True), + ffn_drop=0.0, + dropout_layer=None, + add_identity=True), + feedforward_channels=2048, + operation_order=('cross_attn', 'norm', 'self_attn', 'norm', + 'ffn', 'norm')), + init_cfg=None), + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=2.0, + reduction='mean', + class_weight=[1.0] * num_classes + [0.1]), + loss_mask=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + reduction='mean', + loss_weight=5.0), + loss_dice=dict( + type='DiceLoss', + use_sigmoid=True, + activate=True, + reduction='mean', + naive_dice=True, + eps=1.0, + loss_weight=5.0)), + panoptic_fusion_head=dict( + type='MaskFormerFusionHead', + num_things_classes=num_things_classes, + num_stuff_classes=num_stuff_classes, + loss_panoptic=None, + init_cfg=None), + train_cfg=dict( + num_points=12544, + oversample_ratio=3.0, + importance_sample_ratio=0.75, + assigner=dict( + type='MaskHungarianAssigner', + cls_cost=dict(type='ClassificationCost', weight=2.0), + mask_cost=dict( + type='CrossEntropyLossCost', weight=5.0, use_sigmoid=True), + dice_cost=dict( + type='DiceCost', weight=5.0, pred_act=True, eps=1.0)), + sampler=dict(type='MaskPseudoSampler')), + test_cfg=dict( + panoptic_on=True, + # For now, the dataset does not support + # evaluating semantic segmentation metric. + semantic_on=False, + instance_on=True, + # max_per_image is for instance segmentation. + max_per_image=100, + iou_thr=0.8, + # In Mask2Former's panoptic postprocessing, + # it will filter mask area where score is less than 0.5 . 
+ filter_low_score=True), + init_cfg=None) + +# dataset settings +image_size = (1024, 1024) +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile', to_float32=True), + dict( + type='LoadPanopticAnnotations', + with_bbox=True, + with_mask=True, + with_seg=True), + dict(type='RandomFlip', flip_ratio=0.5), + # large scale jittering + dict( + type='Resize', + img_scale=image_size, + ratio_range=(0.1, 2.0), + multiscale_mode='range', + keep_ratio=True), + dict( + type='RandomCrop', + crop_size=image_size, + crop_type='absolute', + recompute_bbox=True, + allow_negative_crop=True), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size=image_size), + dict(type='DefaultFormatBundle', img_to_float=True), + dict( + type='Collect', + keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data_root = 'data/coco/' +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict(pipeline=train_pipeline), + val=dict( + pipeline=test_pipeline, + ins_ann_file=data_root + 'annotations/instances_val2017.json', + ), + test=dict( + pipeline=test_pipeline, + ins_ann_file=data_root + 'annotations/instances_val2017.json', + )) + +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +# optimizer +optimizer = dict( + type='AdamW', + lr=0.0001, + weight_decay=0.05, + eps=1e-8, + betas=(0.9, 0.999), + paramwise_cfg=dict( + custom_keys={ + 'backbone': dict(lr_mult=0.1, decay_mult=1.0), + 'query_embed': embed_multi, + 'query_feat': embed_multi, + 'level_embed': embed_multi, + }, + norm_decay_mult=0.0)) +optimizer_config = dict(grad_clip=dict(max_norm=0.01, norm_type=2)) + +# learning policy +lr_config = dict( + policy='step', + gamma=0.1, + by_epoch=False, + step=[327778, 355092], + warmup='linear', + warmup_by_epoch=False, + warmup_ratio=1.0, # no warmup + warmup_iters=10) + +max_iters = 368750 +runner = dict(type='IterBasedRunner', max_iters=max_iters) + +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook', by_epoch=False), + dict(type='TensorboardLoggerHook', by_epoch=False) + ]) +interval = 200000 +workflow = [('train', interval)] +checkpoint_config = dict( + by_epoch=False, interval=interval, save_last=True, max_keep_ckpts=3) + +# Before 365001th iteration, we do evaluation every 200000 iterations. +# After 365000th iteration, we do evaluation every 368750 iterations, +# which means do evaluation at the end of training. +# In all, we do evaluation at the 200000th iteration and the +# last iteratoin. 
+dynamic_intervals = [(max_iters // interval * interval + 1, max_iters)] +evaluation = dict( + interval=interval, dynamic_intervals=dynamic_intervals, metric='PQ') diff --git a/configs/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco.py b/configs/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco.py new file mode 100644 index 00000000000..70e3103e482 --- /dev/null +++ b/configs/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco.py @@ -0,0 +1,62 @@ +_base_ = ['./mask2former_r50_lsj_8x2_50e_coco.py'] +pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth' # noqa + +depths = [2, 2, 6, 2] +model = dict( + type='Mask2Former', + backbone=dict( + _delete_=True, + type='SwinTransformer', + embed_dims=96, + depths=depths, + num_heads=[3, 6, 12, 24], + window_size=7, + mlp_ratio=4, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.3, + patch_norm=True, + out_indices=(0, 1, 2, 3), + with_cp=False, + convert_weights=True, + frozen_stages=-1, + init_cfg=dict(type='Pretrained', checkpoint=pretrained)), + panoptic_head=dict( + type='Mask2FormerHead', in_channels=[96, 192, 384, 768]), + init_cfg=None) + +# set all layers in backbone to lr_mult=0.1 +# set all norm layers, position_embeding, +# query_embeding, level_embeding to decay_multi=0.0 +backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0) +backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0) +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +custom_keys = { + 'backbone': dict(lr_mult=0.1, decay_mult=1.0), + 'backbone.patch_embed.norm': backbone_norm_multi, + 'backbone.norm': backbone_norm_multi, + 'absolute_pos_embed': backbone_embed_multi, + 'relative_position_bias_table': backbone_embed_multi, + 'query_embed': embed_multi, + 'query_feat': embed_multi, + 'level_embed': embed_multi +} +custom_keys.update({ + f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi + for stage_id, num_blocks in enumerate(depths) + for block_id in range(num_blocks) +}) +custom_keys.update({ + f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi + for stage_id in range(len(depths) - 1) +}) +# optimizer +optimizer = dict( + type='AdamW', + lr=0.0001, + weight_decay=0.05, + eps=1e-8, + betas=(0.9, 0.999), + paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0)) diff --git a/mmdet/core/bbox/match_costs/__init__.py b/mmdet/core/bbox/match_costs/__init__.py index 81ee588571e..1b636795082 100644 --- a/mmdet/core/bbox/match_costs/__init__.py +++ b/mmdet/core/bbox/match_costs/__init__.py @@ -1,9 +1,9 @@ # Copyright (c) OpenMMLab. All rights reserved. from .builder import build_match_cost -from .match_cost import (BBoxL1Cost, ClassificationCost, DiceCost, - FocalLossCost, IoUCost) +from .match_cost import (BBoxL1Cost, ClassificationCost, CrossEntropyLossCost, + DiceCost, FocalLossCost, IoUCost) __all__ = [ 'build_match_cost', 'ClassificationCost', 'BBoxL1Cost', 'IoUCost', - 'FocalLossCost', 'DiceCost' + 'FocalLossCost', 'DiceCost', 'CrossEntropyLossCost' ] diff --git a/mmdet/core/bbox/match_costs/match_cost.py b/mmdet/core/bbox/match_costs/match_cost.py index 3c0a164b3c8..7ac0ad0f6df 100644 --- a/mmdet/core/bbox/match_costs/match_cost.py +++ b/mmdet/core/bbox/match_costs/match_cost.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. 
import torch +import torch.nn.functional as F from mmdet.core.bbox.iou_calculators import bbox_overlaps from mmdet.core.bbox.transforms import bbox_cxcywh_to_xyxy, bbox_xyxy_to_cxcywh @@ -281,3 +282,67 @@ def __call__(self, mask_preds, gt_masks): mask_preds = mask_preds.sigmoid() dice_cost = self.binary_mask_dice_loss(mask_preds, gt_masks) return dice_cost * self.weight + + +@MATCH_COST.register_module() +class CrossEntropyLossCost: + """CrossEntropyLossCost. + + Args: + weight (int | float, optional): loss weight. Defaults to 1. + use_sigmoid (bool, optional): Whether the prediction uses sigmoid + of softmax. Defaults to True. + Examples: + >>> from mmdet.core.bbox.match_costs import CrossEntropyLossCost + >>> import torch + >>> bce = CrossEntropyLossCost(use_sigmoid=True) + >>> cls_pred = torch.tensor([[7.6, 1.2], [-1.3, 10]]) + >>> gt_labels = torch.tensor([[1, 1], [1, 0]]) + >>> print(bce(cls_pred, gt_labels)) + """ + + def __init__(self, weight=1., use_sigmoid=True): + assert use_sigmoid, 'use_sigmoid = False is not supported yet.' + self.weight = weight + self.use_sigmoid = use_sigmoid + + def _binary_cross_entropy(self, cls_pred, gt_labels): + """ + Args: + cls_pred (Tensor): The prediction with shape (num_query, 1, *) or + (num_query, *). + gt_labels (Tensor): The learning label of prediction with + shape (num_gt, *). + + Returns: + Tensor: Cross entropy cost matrix in shape (num_query, num_gt). + """ + cls_pred = cls_pred.flatten(1).float() + gt_labels = gt_labels.flatten(1).float() + n = cls_pred.shape[1] + pos = F.binary_cross_entropy_with_logits( + cls_pred, torch.ones_like(cls_pred), reduction='none') + neg = F.binary_cross_entropy_with_logits( + cls_pred, torch.zeros_like(cls_pred), reduction='none') + cls_cost = torch.einsum('nc,mc->nm', pos, gt_labels) + \ + torch.einsum('nc,mc->nm', neg, 1 - gt_labels) + cls_cost = cls_cost / n + + return cls_cost + + def __call__(self, cls_pred, gt_labels): + """ + Args: + cls_pred (Tensor): Predicted classification logits. + gt_labels (Tensor): Labels. + + Returns: + Tensor: Cross entropy cost matrix with weight in + shape (num_query, num_gt). + """ + if self.use_sigmoid: + cls_cost = self._binary_cross_entropy(cls_pred, gt_labels) + else: + raise NotImplementedError + + return cls_cost * self.weight diff --git a/mmdet/datasets/coco.py b/mmdet/datasets/coco.py index 46e3a6cbdd6..bcdd4df3981 100644 --- a/mmdet/datasets/coco.py +++ b/mmdet/datasets/coco.py @@ -405,9 +405,6 @@ def evaluate_det_segm(self, 'bbox', 'segm', 'proposal', 'proposal_fast'. logger (logging.Logger | str | None): Logger used for printing related information during evaluation. Default: None. - jsonfile_prefix (str | None): The prefix of json files. It includes - the file path and the prefix of filename, e.g., "a/b/prefix". - If not specified, a temp file will be created. Default: None. classwise (bool): Whether to evaluating the AP for each class. proposal_nums (Sequence[int]): Proposal number used for evaluating recalls, such as recall@100, recall@1000. diff --git a/mmdet/datasets/coco_panoptic.py b/mmdet/datasets/coco_panoptic.py index 7afc077cc03..53ef5947d1e 100644 --- a/mmdet/datasets/coco_panoptic.py +++ b/mmdet/datasets/coco_panoptic.py @@ -457,8 +457,20 @@ def results2json(self, results, outfile_prefix): different data types. This method will automatically recognize the type, and dump them to json files. + .. code-block:: none + + [ + { + 'pan_results': np.array, # shape (h, w) + # ins_results which includes bboxes and RLE encoded masks + # is optional. 
+ 'ins_results': (list[np.array], list[list[str]]) + }, + ... + ] + Args: - results (dict): Testing results of the dataset. + results (list[dict]): Testing results of the dataset. outfile_prefix (str): The filename prefix of the json files. If the prefix is "somepath/xxx", the json files will be named "somepath/xxx.panoptic.json", "somepath/xxx.bbox.json", @@ -597,6 +609,7 @@ def evaluate(self, if 'PQ' in metrics: eval_pan_results = self.evaluate_pan_json( result_files, outfile_prefix, logger, classwise, nproc=nproc) + eval_results.update(eval_pan_results) metrics.remove('PQ') @@ -611,11 +624,13 @@ def evaluate(self, 'shuold not be None' coco_gt = COCO(self.ins_ann_file) + panoptic_cat_ids = self.cat_ids self.cat_ids = coco_gt.get_cat_ids(cat_names=self.THING_CLASSES) eval_ins_results = self.evaluate_det_segm(results, result_files, coco_gt, metrics, logger, classwise, **kwargs) + self.cat_ids = panoptic_cat_ids eval_results.update(eval_ins_results) if tmp_dir is not None: diff --git a/mmdet/models/dense_heads/__init__.py b/mmdet/models/dense_heads/__init__.py index e931e608028..375197a6987 100644 --- a/mmdet/models/dense_heads/__init__.py +++ b/mmdet/models/dense_heads/__init__.py @@ -20,6 +20,7 @@ from .guided_anchor_head import FeatureAdaption, GuidedAnchorHead from .lad_head import LADHead from .ld_head import LDHead +from .mask2former_head import Mask2FormerHead from .maskformer_head import MaskFormerHead from .nasfcos_head import NASFCOSHead from .paa_head import PAAHead @@ -50,5 +51,6 @@ 'CascadeRPNHead', 'EmbeddingRPNHead', 'LDHead', 'CascadeRPNHead', 'AutoAssignHead', 'DETRHead', 'YOLOFHead', 'DeformableDETRHead', 'SOLOHead', 'DecoupledSOLOHead', 'CenterNetHead', 'YOLOXHead', - 'DecoupledSOLOLightHead', 'LADHead', 'TOODHead', 'MaskFormerHead' + 'DecoupledSOLOLightHead', 'LADHead', 'TOODHead', 'MaskFormerHead', + 'Mask2FormerHead' ] diff --git a/mmdet/models/dense_heads/mask2former_head.py b/mmdet/models/dense_heads/mask2former_head.py new file mode 100644 index 00000000000..78e4d49bbd8 --- /dev/null +++ b/mmdet/models/dense_heads/mask2former_head.py @@ -0,0 +1,430 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import Conv2d, build_plugin_layer, caffe2_xavier_init +from mmcv.cnn.bricks.transformer import (build_positional_encoding, + build_transformer_layer_sequence) +from mmcv.ops import point_sample +from mmcv.runner import ModuleList + +from mmdet.core import build_assigner, build_sampler, reduce_mean +from mmdet.models.utils import get_uncertain_point_coords_with_randomness +from ..builder import HEADS, build_loss +from .anchor_free_head import AnchorFreeHead +from .maskformer_head import MaskFormerHead + + +@HEADS.register_module() +class Mask2FormerHead(MaskFormerHead): + """Implements the Mask2Former head. + + See `Masked-attention Mask Transformer for Universal Image + Segmentation `_ for details. + + Args: + in_channels (list[int]): Number of channels in the input feature map. + feat_channels (int): Number of channels for features. + out_channels (int): Number of channels for output. + num_things_classes (int): Number of things. + num_stuff_classes (int): Number of stuff. + num_queries (int): Number of query in Transformer decoder. + pixel_decoder (:obj:`mmcv.ConfigDict` | dict): Config for pixel + decoder. Defaults to None. 
+ enforce_decoder_input_project (bool, optional): Whether to add + a layer to change the embed_dim of tranformer encoder in + pixel decoder to the embed_dim of transformer decoder. + Defaults to False. + transformer_decoder (:obj:`mmcv.ConfigDict` | dict): Config for + transformer decoder. Defaults to None. + positional_encoding (:obj:`mmcv.ConfigDict` | dict): Config for + transformer decoder position encoding. Defaults to None. + loss_cls (:obj:`mmcv.ConfigDict` | dict): Config of the classification + loss. Defaults to None. + loss_mask (:obj:`mmcv.ConfigDict` | dict): Config of the mask loss. + Defaults to None. + loss_dice (:obj:`mmcv.ConfigDict` | dict): Config of the dice loss. + Defaults to None. + train_cfg (:obj:`mmcv.ConfigDict` | dict): Training config of + Mask2Former head. + test_cfg (:obj:`mmcv.ConfigDict` | dict): Testing config of + Mask2Former head. + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + in_channels, + feat_channels, + out_channels, + num_things_classes=80, + num_stuff_classes=53, + num_queries=100, + num_transformer_feat_level=3, + pixel_decoder=None, + enforce_decoder_input_project=False, + transformer_decoder=None, + positional_encoding=None, + loss_cls=None, + loss_mask=None, + loss_dice=None, + train_cfg=None, + test_cfg=None, + init_cfg=None, + **kwargs): + super(AnchorFreeHead, self).__init__(init_cfg) + self.num_things_classes = num_things_classes + self.num_stuff_classes = num_stuff_classes + self.num_classes = self.num_things_classes + self.num_stuff_classes + self.num_queries = num_queries + self.num_transformer_feat_level = num_transformer_feat_level + self.num_heads = transformer_decoder.transformerlayers.\ + attn_cfgs.num_heads + self.num_transformer_decoder_layers = transformer_decoder.num_layers + assert pixel_decoder.encoder.transformerlayers.\ + attn_cfgs.num_levels == num_transformer_feat_level + pixel_decoder_ = copy.deepcopy(pixel_decoder) + pixel_decoder_.update( + in_channels=in_channels, + feat_channels=feat_channels, + out_channels=out_channels) + self.pixel_decoder = build_plugin_layer(pixel_decoder_)[1] + self.transformer_decoder = build_transformer_layer_sequence( + transformer_decoder) + self.decoder_embed_dims = self.transformer_decoder.embed_dims + + self.decoder_input_projs = ModuleList() + # from low resolution to high resolution + for _ in range(num_transformer_feat_level): + if (self.decoder_embed_dims != feat_channels + or enforce_decoder_input_project): + self.decoder_input_projs.append( + Conv2d( + feat_channels, self.decoder_embed_dims, kernel_size=1)) + else: + self.decoder_input_projs.append(nn.Identity()) + self.decoder_positional_encoding = build_positional_encoding( + positional_encoding) + self.query_embed = nn.Embedding(self.num_queries, feat_channels) + self.query_feat = nn.Embedding(self.num_queries, feat_channels) + # from low resolution to high resolution + self.level_embed = nn.Embedding(self.num_transformer_feat_level, + feat_channels) + + self.cls_embed = nn.Linear(feat_channels, self.num_classes + 1) + self.mask_embed = nn.Sequential( + nn.Linear(feat_channels, feat_channels), nn.ReLU(inplace=True), + nn.Linear(feat_channels, feat_channels), nn.ReLU(inplace=True), + nn.Linear(feat_channels, out_channels)) + + self.test_cfg = test_cfg + self.train_cfg = train_cfg + if train_cfg: + self.assigner = build_assigner(self.train_cfg.assigner) + self.sampler = build_sampler(self.train_cfg.sampler, context=self) + self.num_points = 
self.train_cfg.get('num_points', 12544) + self.oversample_ratio = self.train_cfg.get('oversample_ratio', 3.0) + self.importance_sample_ratio = self.train_cfg.get( + 'importance_sample_ratio', 0.75) + + self.class_weight = loss_cls.class_weight + self.loss_cls = build_loss(loss_cls) + self.loss_mask = build_loss(loss_mask) + self.loss_dice = build_loss(loss_dice) + + def init_weights(self): + for m in self.decoder_input_projs: + if isinstance(m, Conv2d): + caffe2_xavier_init(m, bias=0) + + self.pixel_decoder.init_weights() + + for p in self.transformer_decoder.parameters(): + if p.dim() > 1: + nn.init.xavier_normal_(p) + + def _get_target_single(self, cls_score, mask_pred, gt_labels, gt_masks, + img_metas): + """Compute classification and mask targets for one image. + + Args: + cls_score (Tensor): Mask score logits from a single decoder layer + for one image. Shape (num_queries, cls_out_channels). + mask_pred (Tensor): Mask logits for a single decoder layer for one + image. Shape (num_queries, h, w). + gt_labels (Tensor): Ground truth class indices for one image with + shape (num_gts, ). + gt_masks (Tensor): Ground truth mask for each image, each with + shape (num_gts, h, w). + img_metas (dict): Image informtation. + + Returns: + tuple[Tensor]: A tuple containing the following for one image. + + - labels (Tensor): Labels of each image. \ + shape (num_queries, ). + - label_weights (Tensor): Label weights of each image. \ + shape (num_queries, ). + - mask_targets (Tensor): Mask targets of each image. \ + shape (num_queries, h, w). + - mask_weights (Tensor): Mask weights of each image. \ + shape (num_queries, ). + - pos_inds (Tensor): Sampled positive indices for each \ + image. + - neg_inds (Tensor): Sampled negative indices for each \ + image. + """ + # sample points + num_queries = cls_score.shape[0] + num_gts = gt_labels.shape[0] + + point_coords = torch.rand((1, self.num_points, 2), + device=cls_score.device) + # shape (num_queries, num_points) + mask_points_pred = point_sample( + mask_pred.unsqueeze(1), point_coords.repeat(num_queries, 1, + 1)).squeeze(1) + # shape (num_gts, num_points) + gt_points_masks = point_sample( + gt_masks.unsqueeze(1).float(), point_coords.repeat(num_gts, 1, + 1)).squeeze(1) + + # assign and sample + assign_result = self.assigner.assign(cls_score, mask_points_pred, + gt_labels, gt_points_masks, + img_metas) + sampling_result = self.sampler.sample(assign_result, mask_pred, + gt_masks) + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + + # label target + labels = gt_labels.new_full((self.num_queries, ), + self.num_classes, + dtype=torch.long) + labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds] + label_weights = gt_labels.new_ones((self.num_queries, )) + + # mask target + mask_targets = gt_masks[sampling_result.pos_assigned_gt_inds] + mask_weights = mask_pred.new_zeros((self.num_queries, )) + mask_weights[pos_inds] = 1.0 + + return (labels, label_weights, mask_targets, mask_weights, pos_inds, + neg_inds) + + def loss_single(self, cls_scores, mask_preds, gt_labels_list, + gt_masks_list, img_metas): + """Loss function for outputs from a single decoder layer. + + Args: + cls_scores (Tensor): Mask score logits from a single decoder layer + for all images. Shape (batch_size, num_queries, + cls_out_channels). Note `cls_out_channels` should includes + background. + mask_preds (Tensor): Mask logits for a pixel decoder for all + images. Shape (batch_size, num_queries, h, w). 
+            gt_labels_list (list[Tensor]): Ground truth class indices for each
+                image, each with shape (num_gts, ).
+            gt_masks_list (list[Tensor]): Ground truth mask for each image,
+                each with shape (num_gts, h, w).
+            img_metas (list[dict]): List of image meta information.
+
+        Returns:
+            tuple[Tensor]: Loss components for outputs from a single \
+                decoder layer.
+        """
+        num_imgs = cls_scores.size(0)
+        cls_scores_list = [cls_scores[i] for i in range(num_imgs)]
+        mask_preds_list = [mask_preds[i] for i in range(num_imgs)]
+        (labels_list, label_weights_list, mask_targets_list, mask_weights_list,
+         num_total_pos,
+         num_total_neg) = self.get_targets(cls_scores_list, mask_preds_list,
+                                           gt_labels_list, gt_masks_list,
+                                           img_metas)
+        # shape (batch_size, num_queries)
+        labels = torch.stack(labels_list, dim=0)
+        # shape (batch_size, num_queries)
+        label_weights = torch.stack(label_weights_list, dim=0)
+        # shape (num_total_gts, h, w)
+        mask_targets = torch.cat(mask_targets_list, dim=0)
+        # shape (batch_size, num_queries)
+        mask_weights = torch.stack(mask_weights_list, dim=0)
+
+        # classification loss
+        # shape (batch_size * num_queries, )
+        cls_scores = cls_scores.flatten(0, 1)
+        labels = labels.flatten(0, 1)
+        label_weights = label_weights.flatten(0, 1)
+
+        class_weight = cls_scores.new_tensor(self.class_weight)
+        loss_cls = self.loss_cls(
+            cls_scores,
+            labels,
+            label_weights,
+            avg_factor=class_weight[labels].sum())
+
+        num_total_masks = reduce_mean(cls_scores.new_tensor([num_total_pos]))
+        num_total_masks = max(num_total_masks, 1)
+
+        # extract positive ones
+        # shape (batch_size, num_queries, h, w) -> (num_total_gts, h, w)
+        mask_preds = mask_preds[mask_weights > 0]
+
+        if mask_targets.shape[0] == 0:
+            # zero match
+            loss_dice = mask_preds.sum()
+            loss_mask = mask_preds.sum()
+            return loss_cls, loss_mask, loss_dice
+
+        with torch.no_grad():
+            points_coords = get_uncertain_point_coords_with_randomness(
+                mask_preds.unsqueeze(1), None, self.num_points,
+                self.oversample_ratio, self.importance_sample_ratio)
+            # shape (num_total_gts, h, w) -> (num_total_gts, num_points)
+            mask_point_targets = point_sample(
+                mask_targets.unsqueeze(1).float(), points_coords).squeeze(1)
+        # shape (num_queries, h, w) -> (num_queries, num_points)
+        mask_point_preds = point_sample(
+            mask_preds.unsqueeze(1), points_coords).squeeze(1)
+
+        # dice loss
+        loss_dice = self.loss_dice(
+            mask_point_preds, mask_point_targets, avg_factor=num_total_masks)
+
+        # mask loss
+        # shape (num_queries, num_points) -> (num_queries * num_points, )
+        mask_point_preds = mask_point_preds.reshape(-1)
+        # shape (num_total_gts, num_points) -> (num_total_gts * num_points, )
+        mask_point_targets = mask_point_targets.reshape(-1)
+        loss_mask = self.loss_mask(
+            mask_point_preds,
+            mask_point_targets,
+            avg_factor=num_total_masks * self.num_points)
+
+        return loss_cls, loss_mask, loss_dice
+
+    def forward_head(self, decoder_out, mask_feature, attn_mask_target_size):
+        """Forward for head part which is called after every decoder layer.
+
+        Args:
+            decoder_out (Tensor): in shape (num_queries, batch_size, c).
+            mask_feature (Tensor): in shape (batch_size, c, h, w).
+            attn_mask_target_size (tuple[int, int]): target attention
+                mask size.
+
+        Returns:
+            tuple: A tuple containing three elements.
+
+            - cls_pred (Tensor): Classification scores in shape \
+                (batch_size, num_queries, cls_out_channels). \
+                Note `cls_out_channels` should include background.
+            - mask_pred (Tensor): Mask scores in shape \
+                (batch_size, num_queries, h, w).
+            - attn_mask (Tensor): Attention mask in shape \
+                (batch_size * num_heads, num_queries, h*w).
+        """
+        decoder_out = self.transformer_decoder.post_norm(decoder_out)
+        decoder_out = decoder_out.transpose(0, 1)
+        # shape (batch_size, num_queries, c)
+        cls_pred = self.cls_embed(decoder_out)
+        # shape (batch_size, num_queries, c)
+        mask_embed = self.mask_embed(decoder_out)
+        # shape (batch_size, num_queries, h, w)
+        mask_pred = torch.einsum('bqc,bchw->bqhw', mask_embed, mask_feature)
+        attn_mask = F.interpolate(
+            mask_pred,
+            attn_mask_target_size,
+            mode='bilinear',
+            align_corners=False)
+        # shape (batch_size, num_queries, h, w) ->
+        #   (batch_size * num_heads, num_queries, h*w)
+        attn_mask = attn_mask.flatten(2).unsqueeze(1).repeat(
+            (1, self.num_heads, 1, 1)).flatten(0, 1)
+        attn_mask = attn_mask.sigmoid() < 0.5
+        attn_mask = attn_mask.detach()
+
+        return cls_pred, mask_pred, attn_mask
+
+    def forward(self, feats, img_metas):
+        """Forward function.
+
+        Args:
+            feats (list[Tensor]): Multi-scale features from the
+                upstream network, each is a 4D-tensor.
+            img_metas (list[dict]): List of image information.
+
+        Returns:
+            tuple: A tuple containing two elements.
+
+            - cls_pred_list (list[Tensor]): Classification logits \
+                for each decoder layer. Each is a 3D-tensor with shape \
+                (batch_size, num_queries, cls_out_channels). \
+                Note `cls_out_channels` should include background.
+            - mask_pred_list (list[Tensor]): Mask logits for each \
+                decoder layer. Each with shape (batch_size, num_queries, \
+                h, w).
+        """
+        batch_size = len(img_metas)
+        mask_features, multi_scale_memorys = self.pixel_decoder(feats)
+        # multi_scale_memorys (from low resolution to high resolution)
+        decoder_inputs = []
+        decoder_positional_encodings = []
+        for i in range(self.num_transformer_feat_level):
+            decoder_input = self.decoder_input_projs[i](multi_scale_memorys[i])
+            # shape (batch_size, c, h, w) -> (h*w, batch_size, c)
+            decoder_input = decoder_input.flatten(2).permute(2, 0, 1)
+            level_embed = self.level_embed.weight[i].view(1, 1, -1)
+            decoder_input = decoder_input + level_embed
+            # shape (batch_size, h, w)
+            mask = decoder_input.new_zeros(
+                (batch_size, ) + multi_scale_memorys[i].shape[-2:],
+                dtype=torch.bool)
+            decoder_positional_encoding = self.decoder_positional_encoding(
+                mask)
+            decoder_positional_encoding = decoder_positional_encoding.flatten(
+                2).permute(2, 0, 1)
+            decoder_inputs.append(decoder_input)
+            decoder_positional_encodings.append(decoder_positional_encoding)
+        # shape (num_queries, c) -> (num_queries, batch_size, c)
+        query_feat = self.query_feat.weight.unsqueeze(1).repeat(
+            (1, batch_size, 1))
+        query_embed = self.query_embed.weight.unsqueeze(1).repeat(
+            (1, batch_size, 1))
+
+        cls_pred_list = []
+        mask_pred_list = []
+        cls_pred, mask_pred, attn_mask = self.forward_head(
+            query_feat, mask_features, multi_scale_memorys[0].shape[-2:])
+        cls_pred_list.append(cls_pred)
+        mask_pred_list.append(mask_pred)
+
+        for i in range(self.num_transformer_decoder_layers):
+            level_idx = i % self.num_transformer_feat_level
+            # if a mask is all True (all background), then set it all False.
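+            # A fully-True row would leave no key for its query to attend
+            # to, and the attention softmax over an all-masked row produces
+            # NaNs, so such queries are allowed to attend everywhere instead.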
+ attn_mask[torch.where( + attn_mask.sum(-1) == attn_mask.shape[-1])] = False + + # cross_attn + self_attn + layer = self.transformer_decoder.layers[i] + attn_masks = [attn_mask, None] + query_feat = layer( + query=query_feat, + key=decoder_inputs[level_idx], + value=decoder_inputs[level_idx], + query_pos=query_embed, + key_pos=decoder_positional_encodings[level_idx], + attn_masks=attn_masks, + query_key_padding_mask=None, + # here we do not apply masking on padded region + key_padding_mask=None) + cls_pred, mask_pred, attn_mask = self.forward_head( + query_feat, mask_features, multi_scale_memorys[ + (i + 1) % self.num_transformer_feat_level].shape[-2:]) + + cls_pred_list.append(cls_pred) + mask_pred_list.append(mask_pred) + + return cls_pred_list, mask_pred_list diff --git a/mmdet/models/detectors/__init__.py b/mmdet/models/detectors/__init__.py index 9f05a282c18..5f2b3088de4 100644 --- a/mmdet/models/detectors/__init__.py +++ b/mmdet/models/detectors/__init__.py @@ -17,6 +17,7 @@ from .htc import HybridTaskCascade from .kd_one_stage import KnowledgeDistillationSingleStageDetector from .lad import LAD +from .mask2former import Mask2Former from .mask_rcnn import MaskRCNN from .mask_scoring_rcnn import MaskScoringRCNN from .maskformer import MaskFormer @@ -51,5 +52,5 @@ 'VFNet', 'DETR', 'TridentFasterRCNN', 'SparseRCNN', 'SCNet', 'SOLO', 'DeformableDETR', 'AutoAssign', 'YOLOF', 'CenterNet', 'YOLOX', 'TwoStagePanopticSegmentor', 'PanopticFPN', 'QueryInst', 'LAD', 'TOOD', - 'MaskFormer' + 'MaskFormer', 'Mask2Former' ] diff --git a/mmdet/models/detectors/mask2former.py b/mmdet/models/detectors/mask2former.py new file mode 100644 index 00000000000..b9ad2ed25d3 --- /dev/null +++ b/mmdet/models/detectors/mask2former.py @@ -0,0 +1,27 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
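+# Mask2Former reuses the MaskFormer detector wiring as-is: the pieces that
+# are new in this method (masked attention, the multi-scale deformable pixel
+# decoder, point-sampled losses) all live in Mask2FormerHead, so the
+# detector only needs to subclass MaskFormer and forward its arguments.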
+from ..builder import DETECTORS
+from .maskformer import MaskFormer
+
+
+@DETECTORS.register_module()
+class Mask2Former(MaskFormer):
+    r"""Implementation of `Masked-attention Mask
+    Transformer for Universal Image Segmentation
+    <https://arxiv.org/abs/2112.01527>`_."""
+
+    def __init__(self,
+                 backbone,
+                 neck=None,
+                 panoptic_head=None,
+                 panoptic_fusion_head=None,
+                 train_cfg=None,
+                 test_cfg=None,
+                 init_cfg=None):
+        super().__init__(
+            backbone,
+            neck=neck,
+            panoptic_head=panoptic_head,
+            panoptic_fusion_head=panoptic_fusion_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            init_cfg=init_cfg)
diff --git a/tests/test_models/test_dense_heads/test_mask2former_head.py b/tests/test_models/test_dense_heads/test_mask2former_head.py
new file mode 100644
index 00000000000..66d144301b2
--- /dev/null
+++ b/tests/test_models/test_dense_heads/test_mask2former_head.py
@@ -0,0 +1,216 @@
+import numpy as np
+import torch
+from mmcv import ConfigDict
+
+from mmdet.core.mask import BitmapMasks
+from mmdet.models.dense_heads import Mask2FormerHead
+
+
+def test_mask2former_head_loss():
+    """Tests head loss when truth is empty and non-empty."""
+    base_channels = 64
+    img_metas = [{
+        'batch_input_shape': (128, 160),
+        'img_shape': (126, 160, 3),
+        'ori_shape': (63, 80, 3)
+    }, {
+        'batch_input_shape': (128, 160),
+        'img_shape': (120, 160, 3),
+        'ori_shape': (60, 80, 3)
+    }]
+    feats = [
+        torch.rand((2, 64 * 2**i, 4 * 2**(3 - i), 5 * 2**(3 - i)))
+        for i in range(4)
+    ]
+    num_things_classes = 80
+    num_stuff_classes = 53
+    num_classes = num_things_classes + num_stuff_classes
+    config = ConfigDict(
+        dict(
+            type='Mask2FormerHead',
+            in_channels=[base_channels * 2**i for i in range(4)],
+            feat_channels=base_channels,
+            out_channels=base_channels,
+            num_things_classes=num_things_classes,
+            num_stuff_classes=num_stuff_classes,
+            num_queries=100,
+            num_transformer_feat_level=3,
+            pixel_decoder=dict(
+                type='MSDeformAttnPixelDecoder',
+                num_outs=3,
+                norm_cfg=dict(type='GN', num_groups=32),
+                act_cfg=dict(type='ReLU'),
+                encoder=dict(
+                    type='DetrTransformerEncoder',
+                    num_layers=6,
+                    transformerlayers=dict(
+                        type='BaseTransformerLayer',
+                        attn_cfgs=dict(
+                            type='MultiScaleDeformableAttention',
+                            embed_dims=base_channels,
+                            num_heads=8,
+                            num_levels=3,
+                            num_points=4,
+                            im2col_step=64,
+                            dropout=0.0,
+                            batch_first=False,
+                            norm_cfg=None,
+                            init_cfg=None),
+                        ffn_cfgs=dict(
+                            type='FFN',
+                            embed_dims=base_channels,
+                            feedforward_channels=base_channels * 4,
+                            num_fcs=2,
+                            ffn_drop=0.0,
+                            act_cfg=dict(type='ReLU', inplace=True)),
+                        feedforward_channels=base_channels * 4,
+                        ffn_dropout=0.0,
+                        operation_order=('self_attn', 'norm', 'ffn', 'norm')),
+                    init_cfg=None),
+                positional_encoding=dict(
+                    type='SinePositionalEncoding',
+                    num_feats=base_channels // 2,
+                    normalize=True),
+                init_cfg=None),
+            enforce_decoder_input_project=False,
+            positional_encoding=dict(
+                type='SinePositionalEncoding',
+                num_feats=base_channels // 2,
+                normalize=True),
+            transformer_decoder=dict(
+                type='DetrTransformerDecoder',
+                return_intermediate=True,
+                num_layers=9,
+                transformerlayers=dict(
+                    type='DetrTransformerDecoderLayer',
+                    attn_cfgs=dict(
+                        type='MultiheadAttention',
+                        embed_dims=base_channels,
+                        num_heads=8,
+                        attn_drop=0.0,
+                        proj_drop=0.0,
+                        dropout_layer=None,
+                        batch_first=False),
+                    ffn_cfgs=dict(
+                        embed_dims=base_channels,
+                        feedforward_channels=base_channels * 8,
+                        num_fcs=2,
+                        act_cfg=dict(type='ReLU', inplace=True),
+                        ffn_drop=0.0,
+                        dropout_layer=None,
+                        add_identity=True),
+                    # the following parameter is not used;
+                    # it is only kept to satisfy the current API
+                    feedforward_channels=base_channels * 8,
+                    operation_order=('cross_attn', 'norm', 'self_attn', 'norm',
+                                     'ffn', 'norm')),
+                init_cfg=None),
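+            # Note: num_layers=9 together with num_transformer_feat_level=3
+            # means the decoder walks over the three feature levels three
+            # times, from low to high resolution (the head picks
+            # level_idx = i % num_transformer_feat_level for each layer).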
+            loss_cls=dict(
+                type='CrossEntropyLoss',
+                use_sigmoid=False,
+                loss_weight=2.0,
+                reduction='mean',
+                class_weight=[1.0] * num_classes + [0.1]),
+            loss_mask=dict(
+                type='CrossEntropyLoss',
+                use_sigmoid=True,
+                reduction='mean',
+                loss_weight=5.0),
+            loss_dice=dict(
+                type='DiceLoss',
+                use_sigmoid=True,
+                activate=True,
+                reduction='mean',
+                naive_dice=True,
+                eps=1.0,
+                loss_weight=5.0),
+            train_cfg=dict(
+                num_points=256,
+                oversample_ratio=3.0,
+                importance_sample_ratio=0.75,
+                assigner=dict(
+                    type='MaskHungarianAssigner',
+                    cls_cost=dict(type='ClassificationCost', weight=2.0),
+                    mask_cost=dict(
+                        type='CrossEntropyLossCost',
+                        weight=5.0,
+                        use_sigmoid=True),
+                    dice_cost=dict(
+                        type='DiceCost', weight=5.0, pred_act=True, eps=1.0)),
+                sampler=dict(type='MaskPseudoSampler')),
+            test_cfg=dict(
+                panoptic_on=True,
+                semantic_on=False,
+                instance_on=True,
+                max_dets_per_image=100,
+                object_mask_thr=0.8,
+                iou_thr=0.8)))
+    self = Mask2FormerHead(**config)
+    self.init_weights()
+    all_cls_scores, all_mask_preds = self.forward(feats, img_metas)
+    # Test that empty ground truth encourages the network to predict background
+    gt_labels_list = [torch.LongTensor([]), torch.LongTensor([])]
+    gt_masks_list = [
+        torch.zeros((0, 128, 160)).long(),
+        torch.zeros((0, 128, 160)).long()
+    ]
+
+    empty_gt_losses = self.loss(all_cls_scores, all_mask_preds, gt_labels_list,
+                                gt_masks_list, img_metas)
+    # When there is no truth, the cls loss should be nonzero but there should
+    # be no mask or dice loss.
+    for key, loss in empty_gt_losses.items():
+        if 'cls' in key:
+            assert loss.item() > 0, 'cls loss should be non-zero'
+        elif 'mask' in key:
+            assert loss.item(
+            ) == 0, 'there should be no mask loss when there are no true masks'
+        elif 'dice' in key:
+            assert loss.item(
+            ) == 0, 'there should be no dice loss when there are no true masks'
+
+    # When truth is non-empty, the cls, mask and dice losses should all be
+    # nonzero for random inputs.
+    gt_labels_list = [
+        torch.tensor([10, 100]).long(),
+        torch.tensor([100, 10]).long()
+    ]
+    mask1 = torch.zeros((2, 128, 160)).long()
+    mask1[0, :50] = 1
+    mask1[1, 50:] = 1
+    mask2 = torch.zeros((2, 128, 160)).long()
+    mask2[0, :, :50] = 1
+    mask2[1, :, 50:] = 1
+    gt_masks_list = [mask1, mask2]
+    two_gt_losses = self.loss(all_cls_scores, all_mask_preds, gt_labels_list,
+                              gt_masks_list, img_metas)
+    for loss in two_gt_losses.values():
+        assert loss.item() > 0, 'all losses should be non-zero'
+
+    # test forward_train
+    gt_bboxes = None
+    gt_labels = [
+        torch.tensor([10]).long(),
+        torch.tensor([10]).long(),
+    ]
+    thing_mask1 = np.zeros((1, 128, 160), dtype=np.int32)
+    thing_mask1[0, :50] = 1
+    thing_mask2 = np.zeros((1, 128, 160), dtype=np.int32)
+    thing_mask2[0, :, 50:] = 1
+    gt_masks = [
+        BitmapMasks(thing_mask1, 128, 160),
+        BitmapMasks(thing_mask2, 128, 160),
+    ]
+    stuff_mask1 = torch.zeros((1, 128, 160)).long()
+    stuff_mask1[0, :50] = 10
+    stuff_mask1[0, 50:] = 100
+    stuff_mask2 = torch.zeros((1, 128, 160)).long()
+    stuff_mask2[0, :, 50:] = 10
+    stuff_mask2[0, :, :50] = 100
+    gt_semantic_seg = [stuff_mask1, stuff_mask2]
+
+    self.forward_train(feats, img_metas, gt_bboxes, gt_labels, gt_masks,
+                       gt_semantic_seg)
+
+    # test inference mode
+    self.simple_test(feats, img_metas)
diff --git a/tests/test_models/test_forward.py b/tests/test_models/test_forward.py
index 6b28ba61514..3e5f80ba80f 100644
--- a/tests/test_models/test_forward.py
+++
b/tests/test_models/test_forward.py @@ -811,3 +811,114 @@ def test_maskformer_forward(): rescale=True, return_loss=False) batch_results.append(result) + + +def test_mask2former_forward(): + model_cfg = _get_detector_cfg( + 'mask2former/mask2former_r50_lsj_8x2_50e_coco.py') + base_channels = 32 + model_cfg.backbone.depth = 18 + model_cfg.backbone.init_cfg = None + model_cfg.backbone.base_channels = base_channels + model_cfg.panoptic_head.in_channels = [ + base_channels * 2**i for i in range(4) + ] + model_cfg.panoptic_head.feat_channels = base_channels + model_cfg.panoptic_head.out_channels = base_channels + model_cfg.panoptic_head.pixel_decoder.encoder.\ + transformerlayers.attn_cfgs.embed_dims = base_channels + model_cfg.panoptic_head.pixel_decoder.encoder.\ + transformerlayers.ffn_cfgs.embed_dims = base_channels + model_cfg.panoptic_head.pixel_decoder.encoder.\ + transformerlayers.ffn_cfgs.feedforward_channels = base_channels * 4 + model_cfg.panoptic_head.pixel_decoder.\ + positional_encoding.num_feats = base_channels // 2 + model_cfg.panoptic_head.positional_encoding.\ + num_feats = base_channels // 2 + model_cfg.panoptic_head.transformer_decoder.\ + transformerlayers.attn_cfgs.embed_dims = base_channels + model_cfg.panoptic_head.transformer_decoder.\ + transformerlayers.ffn_cfgs.embed_dims = base_channels + model_cfg.panoptic_head.transformer_decoder.\ + transformerlayers.ffn_cfgs.feedforward_channels = base_channels * 8 + model_cfg.panoptic_head.transformer_decoder.\ + transformerlayers.feedforward_channels = base_channels * 8 + + from mmdet.core import BitmapMasks + from mmdet.models import build_detector + detector = build_detector(model_cfg) + + # Test forward train with non-empty truth batch + detector.train() + img_metas = [ + { + 'batch_input_shape': (128, 160), + 'img_shape': (126, 160, 3), + 'ori_shape': (63, 80, 3), + 'pad_shape': (128, 160, 3) + }, + ] + img = torch.rand((1, 3, 128, 160)) + gt_bboxes = None + gt_labels = [ + torch.tensor([10]).long(), + ] + thing_mask1 = np.zeros((1, 128, 160), dtype=np.int32) + thing_mask1[0, :50] = 1 + gt_masks = [ + BitmapMasks(thing_mask1, 128, 160), + ] + stuff_mask1 = torch.zeros((1, 128, 160)).long() + stuff_mask1[0, :50] = 10 + stuff_mask1[0, 50:] = 100 + gt_semantic_seg = [ + stuff_mask1, + ] + losses = detector.forward( + img=img, + img_metas=img_metas, + gt_bboxes=gt_bboxes, + gt_labels=gt_labels, + gt_masks=gt_masks, + gt_semantic_seg=gt_semantic_seg, + return_loss=True) + assert isinstance(losses, dict) + loss, _ = detector._parse_losses(losses) + assert float(loss.item()) > 0 + + # Test forward train with an empty truth batch + gt_bboxes = [ + torch.empty((0, 4)).float(), + ] + gt_labels = [ + torch.empty((0, )).long(), + ] + mask = np.zeros((0, 128, 160), dtype=np.uint8) + gt_masks = [ + BitmapMasks(mask, 128, 160), + ] + gt_semantic_seg = [ + torch.randint(0, 133, (0, 128, 160)), + ] + losses = detector.forward( + img, + img_metas, + gt_bboxes=gt_bboxes, + gt_labels=gt_labels, + gt_masks=gt_masks, + gt_semantic_seg=gt_semantic_seg, + return_loss=True) + assert isinstance(losses, dict) + loss, _ = detector._parse_losses(losses) + assert float(loss.item()) > 0 + + # Test forward test + detector.eval() + with torch.no_grad(): + img_list = [g[None, :] for g in img] + batch_results = [] + for one_img, one_meta in zip(img_list, img_metas): + result = detector.forward([one_img], [[one_meta]], + rescale=True, + return_loss=False) + batch_results.append(result) diff --git a/tests/test_utils/test_assigner.py 
b/tests/test_utils/test_assigner.py
index 7728510b166..c40584a50fe 100644
--- a/tests/test_utils/test_assigner.py
+++ b/tests/test_utils/test_assigner.py
@@ -606,3 +606,27 @@ def test_mask_hungarian_match_assigner():
     assert torch.all(assign_result.gt_inds > -1)
     assert (assign_result.gt_inds > 0).sum() == gt_labels.size(0)
     assert (assign_result.labels > -1).sum() == gt_labels.size(0)
+
+    # test with mask bce mode
+    assigner_cfg = dict(
+        cls_cost=dict(type='ClassificationCost', weight=0.0),
+        mask_cost=dict(
+            type='CrossEntropyLossCost', weight=1.0, use_sigmoid=True),
+        dice_cost=dict(type='DiceCost', weight=0.0, pred_act=True, eps=1.0))
+    self = MaskHungarianAssigner(**assigner_cfg)
+    assign_result = self.assign(cls_pred, mask_pred, gt_labels, gt_masks,
+                                img_meta)
+    assert torch.all(assign_result.gt_inds > -1)
+    assert (assign_result.gt_inds > 0).sum() == gt_labels.size(0)
+    assert (assign_result.labels > -1).sum() == gt_labels.size(0)
+
+    # test with mask ce mode
+    assigner_cfg = dict(
+        cls_cost=dict(type='ClassificationCost', weight=0.0),
+        mask_cost=dict(
+            type='CrossEntropyLossCost', weight=1.0, use_sigmoid=False),
+        dice_cost=dict(type='DiceCost', weight=0.0, pred_act=True, eps=1.0))
+    self = MaskHungarianAssigner(**assigner_cfg)
+    with pytest.raises(NotImplementedError):
+        assign_result = self.assign(cls_pred, mask_pred, gt_labels, gt_masks,
+                                    img_meta)

From 5091eab7d34b2172c4d379695d6adeea896a609f Mon Sep 17 00:00:00 2001
From: Cedric Luo
Date: Sun, 27 Mar 2022 01:00:00 +0800
Subject: [PATCH 36/42] [Fix] Update unit test of CrossEntropyCost (#7537)

---
 tests/test_utils/test_assigner.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/tests/test_utils/test_assigner.py b/tests/test_utils/test_assigner.py
index c40584a50fe..0124c3b3fd7 100644
--- a/tests/test_utils/test_assigner.py
+++ b/tests/test_utils/test_assigner.py
@@ -620,13 +620,11 @@ def test_mask_hungarian_match_assigner():
     assert (assign_result.gt_inds > 0).sum() == gt_labels.size(0)
     assert (assign_result.labels > -1).sum() == gt_labels.size(0)

-    # test with mask ce mode
+    # test with ce mode of CrossEntropyLossCost which is not supported yet
     assigner_cfg = dict(
         cls_cost=dict(type='ClassificationCost', weight=0.0),
         mask_cost=dict(
             type='CrossEntropyLossCost', weight=1.0, use_sigmoid=False),
         dice_cost=dict(type='DiceCost', weight=0.0, pred_act=True, eps=1.0))
-    self = MaskHungarianAssigner(**assigner_cfg)
-    with pytest.raises(NotImplementedError):
-        assign_result = self.assign(cls_pred, mask_pred, gt_labels, gt_masks,
-                                    img_meta)
+    with pytest.raises(AssertionError):
+        self = MaskHungarianAssigner(**assigner_cfg)

From 664e7befe58e70ae6ffefb56d17a473fd102bb30 Mon Sep 17 00:00:00 2001
From: Cedric Luo
Date: Sun, 27 Mar 2022 01:01:21 +0800
Subject: [PATCH 37/42] [Fix] Memory leak in panoptic segmentation evaluation (#7538)

---
 .../mask2former/mask2former_r50_lsj_8x2_50e_coco.py | 12 ++++++------
 mmdet/datasets/api_wrappers/panoptic_evaluation.py  |  7 +++++++
 2 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco.py b/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco.py
index 54d138fce3f..2c23625e139 100644
--- a/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco.py
+++ b/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco.py
@@ -238,16 +238,16 @@
         dict(type='TextLoggerHook', by_epoch=False),
         dict(type='TensorboardLoggerHook', by_epoch=False)
     ])
-interval = 200000
+interval = 5000
 workflow = [('train', interval)]
 checkpoint_config = dict(
     by_epoch=False, interval=interval, save_last=True, max_keep_ckpts=3)

-# Before 365001th iteration, we do evaluation every 200000 iterations.
+# Before the 365001st iteration, we do evaluation every 5000 iterations.
 # After 365000th iteration, we do evaluation every 368750 iterations,
-# which means that we do evaluation at the end of training.
-# In all, we do evaluation at the 200000th iteration and the
-# last iteratoin.
+# which means that we do evaluation at the end of training.
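+# (The dynamic switch point below comes from
+# max_iters // interval * interval + 1 = 368750 // 5000 * 5000 + 1 = 365001.)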
 dynamic_intervals = [(max_iters // interval * interval + 1, max_iters)]
 evaluation = dict(
-    interval=interval, dynamic_intervals=dynamic_intervals, metric='PQ')
+    interval=interval,
+    dynamic_intervals=dynamic_intervals,
+    metric=['PQ', 'bbox', 'segm'])
diff --git a/mmdet/datasets/api_wrappers/panoptic_evaluation.py b/mmdet/datasets/api_wrappers/panoptic_evaluation.py
index 49850e5d52b..b29d5007993 100644
--- a/mmdet/datasets/api_wrappers/panoptic_evaluation.py
+++ b/mmdet/datasets/api_wrappers/panoptic_evaluation.py
@@ -211,7 +211,14 @@ def pq_compute_multi_core(matched_annotations_list,
                  (proc_id, annotation_set, gt_folder, pred_folder, categories,
                   file_client))
         processes.append(p)
+
+    # Close the process pool, otherwise it will lead to
+    # memory leaks.
+    workers.close()
+    workers.join()
+
     pq_stat = PQStat()
     for p in processes:
         pq_stat += p.get()
+
     return pq_stat

From f86310028d6ace5049ada105d46066be7a6a9c67 Mon Sep 17 00:00:00 2001
From: Haian Huang(深度眸) <1286304229@qq.com>
Date: Mon, 28 Mar 2022 19:17:40 +0800
Subject: [PATCH 38/42] fix broadcast shape bug in yolov3 (#7551)

---
 mmdet/models/losses/cross_entropy_loss.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/mmdet/models/losses/cross_entropy_loss.py b/mmdet/models/losses/cross_entropy_loss.py
index 97f12e50375..41411fc5456 100644
--- a/mmdet/models/losses/cross_entropy_loss.py
+++ b/mmdet/models/losses/cross_entropy_loss.py
@@ -123,7 +123,10 @@ def binary_cross_entropy(pred,
         # should mask out the ignored elements
         valid_mask = ((label >= 0) & (label != ignore_index)).float()
         if weight is not None:
-            weight *= valid_mask
+            # In-place multiplication raises a mismatched broadcast shape
+            # error if the weight and valid_mask dimensions are inconsistent,
+            # e.g. (B, N, 1) and (B, N, C).
+            weight = weight * valid_mask
         else:
             weight = valid_mask

From 7d1ce22e3328ba89c11b6cdaafff6c96d9da3f4f Mon Sep 17 00:00:00 2001
From: Haian Huang(深度眸) <1286304229@qq.com>
Date: Mon, 28 Mar 2022 19:18:02 +0800
Subject: [PATCH 39/42] Fix `pointrend` missing `get_uncertainty` function bug
 (#7550)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* [Fix] Adjust the order of get_classes and FileClient. (#7276)
* delete -sv (#7277)

Co-authored-by: Wenwei Zhang <40779233+ZwwWayne@users.noreply.github.com>

* [Docs] Add Chinese version of finetune (#7178)

* [Fix] Fix wrong img name in onnx2tensorrt.py (#7157)

* [Docs] fix albumentations installed way (#7143)

* Update finetune.md

Translate the finetune.md doc to Chinese

* Update finetune.md

* Update finetune.md

* Update finetune.md

* fix lint

* fix lint

* fix pr

Co-authored-by: Jamie
Co-authored-by: BigDong

* set unmap_results=True in ssd_head (#7328)

* Update YOLOX log for non square input (#7235)

* [Enhance] add cpu_num in cocopanoptic for pq computing (#7315)

* add cpu_num in cocopanoptic for pq computing

* cpu_num -> nproc

* move nproc to evaluate

* [Enhancement] Allow to set channel_order in LoadImageFromFile (#7258)

* allow to set channel_order when loading images

* fix lint

* fix unit test

* fix lint

* [Fix] Force the inputs of `get_bboxes` in yolox_head to float32. (#7324)

* Fix softnms bug

* Add force_fp32 in corner_head and centripetal_head

* [Fix] Fix typo in FPN neck (#7347)

* update readme and pretrained related (#7301)

* [Docs] Add Chinese version of onnx2tensorrt.md (#7219)

* Fix bug of docs

* translate onnx2tensorrt.md

* fix

* fix end-of-file-fixer

* fix some bugs

* Fix link redirects

* Fix link redirects

* Fix link redirects - test 1

* Fix link redirects - test 2

* Fix link redirects - test 2

* Fix link redirects - test 3

* Fix link redirects - test 5

* Fix

Co-authored-by: jbwang1997

* Update useful_tools.md (#7180)

* [Enhancement]: Update colab tutorials (#7310)

* update colab tutorials

* update

* fix

* fix wrong CUDA explanation

* resolve comments

* resolve comments

* fix typo

Co-authored-by: Cedric Luo
Co-authored-by: tripleMu <92794867+q3394101@users.noreply.github.com>
Co-authored-by: jbwang1997
Co-authored-by: kira <39787375+yangrisheng@users.noreply.github.com>
Co-authored-by: Wenwei Zhang <40779233+ZwwWayne@users.noreply.github.com>

* Fix pointrend missing get_uncertainty function bug

Co-authored-by: Wencheng Wu <41542251+274869388@users.noreply.github.com>
Co-authored-by: Yue Zhou <592267829@qq.com>
Co-authored-by: Wenwei Zhang <40779233+ZwwWayne@users.noreply.github.com>
Co-authored-by: MingJian.L <45811724+matrixgame2018@users.noreply.github.com>
Co-authored-by: Jamie
Co-authored-by: BigDong
Co-authored-by: Cedric Luo <26483343+chhluo@users.noreply.github.com>
Co-authored-by: Yosuke Shinya <42844407+shinya7y@users.noreply.github.com>
Co-authored-by: Cedric Luo
Co-authored-by: Jingwei Zhang
Co-authored-by: jbwang1997
Co-authored-by: Xiangxu-0103
Co-authored-by: tripleMu <92794867+q3394101@users.noreply.github.com>
Co-authored-by: kira <39787375+yangrisheng@users.noreply.github.com>
---
 mmdet/models/roi_heads/mask_heads/mask_point_head.py | 5 +++--
 mmdet/models/utils/__init__.py                       | 5 +++--
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/mmdet/models/roi_heads/mask_heads/mask_point_head.py b/mmdet/models/roi_heads/mask_heads/mask_point_head.py
index c022f1fdbc7..c77c46d2c6f 100644
--- a/mmdet/models/roi_heads/mask_heads/mask_point_head.py
+++ b/mmdet/models/roi_heads/mask_heads/mask_point_head.py
@@ -8,7 +8,8 @@
 from mmcv.runner import BaseModule

 from mmdet.models.builder import HEADS, build_loss
-from mmdet.models.utils import get_uncertain_point_coords_with_randomness
+from mmdet.models.utils import (get_uncertain_point_coords_with_randomness,
+                                get_uncertainty)


 @HEADS.register_module()
@@ -230,7 +231,7 @@ def get_roi_rel_points_test(self, mask_pred, pred_label, cfg):
             most uncertain points from the [mask_height, mask_width] grid .
""" num_points = cfg.subdivision_num_points - uncertainty_map = self._get_uncertainty(mask_pred, pred_label) + uncertainty_map = get_uncertainty(mask_pred, pred_label) num_rois, _, mask_height, mask_width = uncertainty_map.shape # During ONNX exporting, the type of each elements of 'shape' is diff --git a/mmdet/models/utils/__init__.py b/mmdet/models/utils/__init__.py index 6d9c4057a39..e74ba89e8c2 100644 --- a/mmdet/models/utils/__init__.py +++ b/mmdet/models/utils/__init__.py @@ -10,7 +10,8 @@ from .misc import interpolate_as, sigmoid_geometric_mean from .normed_predictor import NormedConv2d, NormedLinear from .panoptic_gt_processing import preprocess_panoptic_gt -from .point_sample import get_uncertain_point_coords_with_randomness +from .point_sample import (get_uncertain_point_coords_with_randomness, + get_uncertainty) from .positional_encoding import (LearnedPositionalEncoding, SinePositionalEncoding) from .res_layer import ResLayer, SimplifiedBasicBlock @@ -29,5 +30,5 @@ 'adaptive_avg_pool2d', 'AdaptiveAvgPool2d', 'PatchEmbed', 'nchw_to_nlc', 'nlc_to_nchw', 'pvt_convert', 'sigmoid_geometric_mean', 'preprocess_panoptic_gt', 'DyReLU', - 'get_uncertain_point_coords_with_randomness' + 'get_uncertain_point_coords_with_randomness', 'get_uncertainty' ] From cfeb39a9ad72b3d39dcb1e9ddd26de59b16457e1 Mon Sep 17 00:00:00 2001 From: Kevin Ye <1752391457@qq.com> Date: Mon, 28 Mar 2022 19:21:44 +0800 Subject: [PATCH 40/42] fix docstring in bbox_nms.py (#7547) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Create init_cfg.md * Update docs/zh_cn/tutorials/init_cfg.md Co-authored-by: Haian Huang(深度眸) <1286304229@qq.com> * update init_cfg.md * update init_cfg.md * update init_cfg.md * update init_cfg.md * fix docstring * fix docstring Co-authored-by: Haian Huang(深度眸) <1286304229@qq.com> --- mmdet/core/post_processing/bbox_nms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mmdet/core/post_processing/bbox_nms.py b/mmdet/core/post_processing/bbox_nms.py index f16a1e77306..4fcf57bb501 100644 --- a/mmdet/core/post_processing/bbox_nms.py +++ b/mmdet/core/post_processing/bbox_nms.py @@ -20,7 +20,7 @@ def multiclass_nms(multi_bboxes, contains scores of the background class, but this will be ignored. score_thr (float): bbox threshold, bboxes with scores lower than it will not be considered. - nms_thr (float): NMS IoU threshold + nms_cfg (dict): a dict that contains the arguments of nms operations max_num (int, optional): if there are more than max_num bboxes after NMS, only top max_num will be kept. Default to -1. 
         score_factors (Tensor, optional): The factors multiplied to scores

From a3d3dd1a988025c6a99116118bdde3a6dada4a3b Mon Sep 17 00:00:00 2001
From: Cedric Luo
Date: Mon, 28 Mar 2022 21:24:02 +0800
Subject: [PATCH 41/42] [Doc] Add doc for COCO panoptic segmentation test
 results submission (#7430)

* [Doc] Add doc for coco panoptic segmentation test results submission

* add example

* add script

* update comments

* update

* update

* update

* update

* update
---
 docs/en/tutorials/test_results_submission.md | 112 +++++++++++++++++++
 tools/misc/gen_coco_panoptic_test_info.py    |  34 ++++++
 2 files changed, 146 insertions(+)
 create mode 100644 docs/en/tutorials/test_results_submission.md
 create mode 100644 tools/misc/gen_coco_panoptic_test_info.py

diff --git a/docs/en/tutorials/test_results_submission.md b/docs/en/tutorials/test_results_submission.md
new file mode 100644
index 00000000000..7f7d5310e31
--- /dev/null
+++ b/docs/en/tutorials/test_results_submission.md
@@ -0,0 +1,112 @@
+# Tutorial 11: Test Results Submission
+
+## Panoptic segmentation test results submission
+
+The following sections introduce how to produce the prediction results of panoptic segmentation models on the COCO test-dev set and submit the predictions to the [COCO evaluation server](https://competitions.codalab.org/competitions/19507).
+
+### Prerequisites
+
+- Download [COCO test dataset images](http://images.cocodataset.org/zips/test2017.zip), [testing image info](http://images.cocodataset.org/annotations/image_info_test2017.zip), and [panoptic train/val annotations](http://images.cocodataset.org/annotations/panoptic_annotations_trainval2017.zip), then unzip them, put 'test2017' into `data/coco/`, and put the json files and annotation files into `data/coco/annotations/`.

+```shell
+# suppose data/coco/ does not exist
+mkdir -pv data/coco/
+
+# download test2017
+wget -P data/coco/ http://images.cocodataset.org/zips/test2017.zip
+wget -P data/coco/ http://images.cocodataset.org/annotations/image_info_test2017.zip
+wget -P data/coco/ http://images.cocodataset.org/annotations/panoptic_annotations_trainval2017.zip
+
+# unzip them
+unzip data/coco/test2017.zip -d data/coco/
+unzip data/coco/image_info_test2017.zip -d data/coco/
+unzip data/coco/panoptic_annotations_trainval2017.zip -d data/coco/
+
+# remove zip files (optional)
+rm -rf data/coco/test2017.zip data/coco/image_info_test2017.zip data/coco/panoptic_annotations_trainval2017.zip
+```
+
+- Run the following code to update the category information in the testing image info. Since the attribute `isthing` is missing in the category information of 'image_info_test-dev2017.json', we need to update it with the category information in 'panoptic_val2017.json'.
+
+```shell
+python tools/misc/gen_coco_panoptic_test_info.py data/coco/annotations
+```
+
+After completing the above preparations, your directory structure of `data` should be like this:
+
+```text
+data
+`-- coco
+    |-- annotations
+    |   |-- image_info_test-dev2017.json
+    |   |-- image_info_test2017.json
+    |   |-- panoptic_image_info_test-dev2017.json
+    |   |-- panoptic_train2017.json
+    |   |-- panoptic_train2017.zip
+    |   |-- panoptic_val2017.json
+    |   `-- panoptic_val2017.zip
+    `-- test2017
+```
+
+### Inference on COCO test-dev
+
+The commands to perform inference on test2017 are as follows:
+
+```shell
+# test with single gpu
+CUDA_VISIBLE_DEVICES=0 python tools/test.py \
+    ${CONFIG_FILE} \
+    ${CHECKPOINT_FILE} \
+    --format-only \
+    --cfg-options data.test.ann_file=data/coco/annotations/panoptic_image_info_test-dev2017.json data.test.img_prefix=data/coco/test2017 \
+    --eval-options jsonfile_prefix=${WORK_DIR}/results
+
+# test with four gpus
+CUDA_VISIBLE_DEVICES=0,1,3,4 bash tools/dist_test.sh \
+    ${CONFIG_FILE} \
+    ${CHECKPOINT_FILE} \
+    4 \ # four gpus
+    --format-only \
+    --cfg-options data.test.ann_file=data/coco/annotations/panoptic_image_info_test-dev2017.json data.test.img_prefix=data/coco/test2017 \
+    --eval-options jsonfile_prefix=${WORK_DIR}/results
+
+# test with slurm
+GPUS=8 tools/slurm_test.sh \
+    ${Partition} \
+    ${JOB_NAME} \
+    ${CONFIG_FILE} \
+    ${CHECKPOINT_FILE} \
+    --format-only \
+    --cfg-options data.test.ann_file=data/coco/annotations/panoptic_image_info_test-dev2017.json data.test.img_prefix=data/coco/test2017 \
+    --eval-options jsonfile_prefix=${WORK_DIR}/results
+```
+
+Example
+
+Suppose we perform inference on `test2017` using a pretrained MaskFormer with a ResNet-50 backbone.
+
+```shell
+# test with single gpu
+CUDA_VISIBLE_DEVICES=0 python tools/test.py \
+    configs/maskformer/maskformer_r50_mstrain_16x1_75e_coco.py \
+    checkpoints/maskformer_r50_mstrain_16x1_75e_coco_20220221_141956-bc2699cb.pth \
+    --format-only \
+    --cfg-options data.test.ann_file=data/coco/annotations/panoptic_image_info_test-dev2017.json data.test.img_prefix=data/coco/test2017 \
+    --eval-options jsonfile_prefix=work_dirs/maskformer/results
+```
+
+### Rename files and zip results
+
+After inference, the panoptic segmentation results (a json file and a directory where the masks are stored) will be in `WORK_DIR`. We should rename them according to the naming convention described on [COCO's Website](https://cocodataset.org/#upload). Finally, we need to compress the json and the directory where the masks are stored into a zip file, and rename the zip file according to the naming convention. Note that the zip file should **directly** contain the above two files.
+
+The commands to rename files and zip results:
+
+```shell
+# In WORK_DIR, we have panoptic segmentation results: 'panoptic' and 'results.panoptic.json'.
+cd ${WORK_DIR}
+
+# replace '[algorithm_name]' with the name of the algorithm you used.
+mv ./panoptic ./panoptic_test-dev2017_[algorithm_name]_results
+mv ./results.panoptic.json ./panoptic_test-dev2017_[algorithm_name]_results.json
+zip panoptic_test-dev2017_[algorithm_name]_results.zip -ur panoptic_test-dev2017_[algorithm_name]_results panoptic_test-dev2017_[algorithm_name]_results.json
+```
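+For example, assuming the algorithm name is `maskformer` (a placeholder name;
+substitute whatever name you actually used), the commands would become:
+
+```shell
+mv ./panoptic ./panoptic_test-dev2017_maskformer_results
+mv ./results.panoptic.json ./panoptic_test-dev2017_maskformer_results.json
+zip panoptic_test-dev2017_maskformer_results.zip -ur panoptic_test-dev2017_maskformer_results panoptic_test-dev2017_maskformer_results.json
+```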
diff --git a/tools/misc/gen_coco_panoptic_test_info.py b/tools/misc/gen_coco_panoptic_test_info.py
new file mode 100644
index 00000000000..5ad315dcbf6
--- /dev/null
+++ b/tools/misc/gen_coco_panoptic_test_info.py
@@ -0,0 +1,34 @@
+import argparse
+import os.path as osp
+
+import mmcv
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description='Generate COCO test image information '
+        'for COCO panoptic segmentation.')
+    parser.add_argument('data_root', help='Path to COCO annotation directory.')
+    args = parser.parse_args()
+
+    return args
+
+
+def main():
+    args = parse_args()
+    data_root = args.data_root
+    val_info = mmcv.load(osp.join(data_root, 'panoptic_val2017.json'))
+    test_old_info = mmcv.load(
+        osp.join(data_root, 'image_info_test-dev2017.json'))
+
+    # replace categories from image_info_test-dev2017.json
+    # with categories from panoptic_val2017.json which
+    # has attribute `isthing`.
+    test_info = test_old_info
+    test_info.update({'categories': val_info['categories']})
+    mmcv.dump(test_info,
+              osp.join(data_root, 'panoptic_image_info_test-dev2017.json'))
+
+
+if __name__ == '__main__':
+    main()

From bab144cd53f9f486f988b633d6a0e7b973634c81 Mon Sep 17 00:00:00 2001
From: Czm369 <40661020+Czm369@users.noreply.github.com>
Date: Tue, 29 Mar 2022 14:17:51 +0800
Subject: [PATCH 42/42] Bump versions to v2.23.0 (#7555)

* Bump versions to v2.23.0

* update log

* replace two Efficientnet

* update mask2former log

* add maskformer and mask2former in README_zh-CN.md

* add efficientnet in README.md and README_zh-CN.md

* update changelog

* simplify highlights

* update changelog.

* update changelog again

* add some highlights
---
 README.md                 | 10 ++++---
 README_zh-CN.md           | 13 +++++----
 docker/serve/Dockerfile   |  2 +-
 docs/en/changelog.md      | 56 +++++++++++++++++++++++++++++++++++++++
 docs/en/get_started.md    |  1 +
 docs/en/model_zoo.md      |  8 ++++++
 docs/zh_cn/get_started.md |  1 +
 mmdet/version.py          |  2 +-
 8 files changed, 82 insertions(+), 11 deletions(-)

diff --git a/README.md b/README.md
index d739ac5504e..667f79261cb 100644
--- a/README.md
+++ b/README.md
@@ -74,11 +74,11 @@ This project is released under the [Apache 2.0 license](LICENSE).

 ## Changelog

-**2.22.0** was released in 24/2/2022:
+**2.23.0** was released in 28/3/2022:

-- Support [MaskFormer](configs/maskformer), [DyHead](configs/dyhead), [OpenImages Dataset](configs/openimages) and [TIMM backbone](configs/timm_example)
-- Support visualization for Panoptic Segmentation
-- Release a good recipe of using ResNet in object detectors pre-trained by [ResNet Strikes Back](https://arxiv.org/abs/2110.00476), which consistently brings about 3~4 mAP improvements over RetinaNet, Faster/Mask/Cascade Mask R-CNN
+- Support [Mask2Former](configs/mask2former) and [EfficientNet](configs/efficientnet)
+- Support setting data root through environment variable `MMDET_DATASETS`; users don't have to modify the corresponding path in config files anymore.
+- Find a good recipe for fine-tuning a high-precision ResNet backbone pre-trained by Torchvision.

 Please refer to [changelog.md](docs/en/changelog.md) for details and release history.

@@ -164,6 +164,7 @@ Results and models are available in the [model zoo](docs/en/model_zoo.md).
@@ -228,6 +229,7 @@ Results and models are available in the [model zoo](docs/en/model_zoo.md).
  • Swin (CVPR'2021)
  • PVTv2 (ArXiv'2021)
  • ResNet strikes back (ArXiv'2021)
+  • EfficientNet (ArXiv'2021)
diff --git a/README_zh-CN.md b/README_zh-CN.md
index 4cb5d384648..3ec0c000137 100644
--- a/README_zh-CN.md
+++ b/README_zh-CN.md
@@ -73,13 +73,13 @@ MMDetection 是一个基于 PyTorch 的目标检测开源工具箱。它是 [Ope

 ## 更新日志

-最新的 **2.22.0** 版本已经在 2022.02.24 发布:
+最新的 **2.23.0** 版本已经在 2022.03.28 发布:

-- 支持 [MaskFormer](configs/maskformer),[DyHead](configs/dyhead),[OpenImages Dataset](configs/openimages) 和 [TIMM backbone](configs/timm_example)
-- 支持全景分割可视化
-- 发布了一个在目标检测任务中使用 ResNet 的好方法,它是由 [ResNet Strikes Back](https://arxiv.org/abs/2110.00476) 预训练的,并且能稳定的在 RetinaNet, Faster/Mask/Cascade Mask R-CNN 上带来约 3-4 mAP 的提升
+- 支持 [Mask2Former](configs/mask2former) 和 [Efficientnet](configs/efficientnet)
+- 支持通过环境变量 `MMDET_DATASETS` 设置数据根目录,因此无需修改配置文件中对应的路径。
+- 发现一个很好的方法来微调由 Torchvision 预训练的高精度 ResNet 主干。

-如果想了解更多版本更新细节和历史信息,请阅读[更新日志](docs/changelog.md)。
+如果想了解更多版本更新细节和历史信息,请阅读[更新日志](docs/en/changelog.md)。

 如果想了解 MMDetection 不同版本之间的兼容性, 请参考[兼容性说明文档](docs/zh_cn/compatibility.md)。

@@ -162,6 +162,8 @@ MMDetection 是一个基于 PyTorch 的目标检测开源工具箱。它是 [Ope
  • Swin (CVPR'2021)
  • PVTv2 (ArXiv'2021)
  • ResNet strikes back (ArXiv'2021)
+  • EfficientNet (ArXiv'2021)
diff --git a/docker/serve/Dockerfile b/docker/serve/Dockerfile
index ffa81c82ceb..608ea64565c 100644
--- a/docker/serve/Dockerfile
+++ b/docker/serve/Dockerfile
@@ -4,7 +4,7 @@ ARG CUDNN="7"
 FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel

 ARG MMCV="1.3.17"
-ARG MMDET="2.22.0"
+ARG MMDET="2.23.0"

 ENV PYTHONUNBUFFERED TRUE

diff --git a/docs/en/changelog.md b/docs/en/changelog.md
index 7a3b18fcfa2..7b3585aa2f3 100644
--- a/docs/en/changelog.md
+++ b/docs/en/changelog.md
@@ -1,5 +1,61 @@
 ## Changelog

+### v2.23.0 (28/3/2022)
+
+#### Highlights
+
+- Support Mask2Former: [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527)
+- Support EfficientNet: [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946)
+- Support setting data root through environment variable `MMDET_DATASETS`; users don't have to modify the corresponding path in config files anymore.
+- Find a good recipe for fine-tuning a high-precision ResNet backbone pre-trained by Torchvision.
+
+#### New Features
+
+- Support [Mask2Former](configs/mask2former) (#6938, #7466, #7471)
+- Support [EfficientNet](configs/efficientnet) (#7514)
+- Support setting data root through environment variable `MMDET_DATASETS`; users don't have to modify the corresponding path in config files anymore. (#7386)
+- Support setting different seeds for different ranks (#7432)
+- Update the `dist_train.sh` so that the script can be used to launch multi-node training on machines without slurm (#7415)
+- Find a good recipe for fine-tuning a high-precision ResNet backbone pre-trained by Torchvision (#7489)
+
+#### Bug Fixes
+
+- Fix bug in VOC unit test which removes the data directory (#7270)
+- Adjust the order of get_classes and FileClient (#7276)
+- Force the inputs of `get_bboxes` in yolox_head to float32 (#7324)
+- Fix misplaced arguments in LoadPanopticAnnotations (#7388)
+- Fix reduction=mean in CELoss (#7449)
+- Update unit test of CrossEntropyCost (#7537)
+- Fix memory leak in panoptic segmentation evaluation (#7538)
+- Fix the broadcast shape bug in YOLOv3 (#7551)
+
+#### Improvements
+
+- Add Chinese version of onnx2tensorrt.md (#7219)
+- Update colab tutorials (#7310)
+- Update information about Localization Distillation (#7350)
+- Add Chinese version of `finetune.md` (#7178)
+- Update YOLOX log for non square input (#7235)
+- Add `nproc` in `coco_panoptic.py` for panoptic quality computing (#7315)
+- Allow to set channel_order in LoadImageFromFile (#7258)
+- Take point sample related functions out of mask_point_head (#7353)
+- Add instance evaluation for coco_panoptic (#7313)
+- Enhance the robustness of analyze_logs.py (#7407)
+- Supplementary notes of sync_random_seed (#7440)
+- Update docstring of cross entropy loss (#7472)
+- Update pascal voc result (#7503)
+- We created How-to documentation to record answers to common 'How to xxx' questions. In this version, we added
+  - How to use Mosaic augmentation (#7507)
+  - How to use backbone in mmcls (#7438)
+  - How to produce and submit the prediction results of panoptic segmentation models on COCO test-dev set (#7430)
+
+#### Contributors
+
+A total of 27 developers contributed to this release.
+Thanks @ZwwWayne, @haofanwang, @shinya7y, @chhluo, @yangrisheng, @triple-Mu, @jbwang1997, @HikariTJU, @imflash217, @274869388, @zytx121, @matrixgame2018, @jamiechoi1995, @BIGWangYuDong, @JingweiZhang12, @Xiangxu-0103, @hhaAndroid, @jshilong, @osbm, @ceroytres, @bunge-bedstraw-herb, @Youth-Got, @daavoo, @jiangyitong, @RangiLyu, @CCODING04, @yarkable + + + ### v2.22.0 (24/2/2022) #### Highlights diff --git a/docs/en/get_started.md b/docs/en/get_started.md index 1bd65243948..33cd926d84c 100644 --- a/docs/en/get_started.md +++ b/docs/en/get_started.md @@ -12,6 +12,7 @@ Compatible MMDetection and MMCV versions are shown as below. Please install the | MMDetection version | MMCV version | |:-------------------:|:-------------------------:| | master | mmcv-full>=1.3.17, <1.5.0 | +| 2.23.0 | mmcv-full>=1.3.17, <1.5.0 | | 2.22.0 | mmcv-full>=1.3.17, <1.5.0 | | 2.21.0 | mmcv-full>=1.3.17, <1.5.0 | | 2.20.0 | mmcv-full>=1.3.17, <1.5.0 | diff --git a/docs/en/model_zoo.md b/docs/en/model_zoo.md index 15bb4894ae0..b01762d0701 100644 --- a/docs/en/model_zoo.md +++ b/docs/en/model_zoo.md @@ -254,6 +254,14 @@ Please refer to [MaskFormer](https://github.com/open-mmlab/mmdetection/blob/mast Please refer to [DyHead](https://github.com/open-mmlab/mmdetection/blob/master/configs/dyhead) for details. +### Mask2Former + +Please refer to [Mask2Former](https://github.com/open-mmlab/mmdetection/blob/master/configs/mask2former) for details. + +### Efficientnet + +Please refer to [Efficientnet](https://github.com/open-mmlab/mmdetection/blob/master/configs/efficientnet) for details. + ### Other datasets We also benchmark some methods on [PASCAL VOC](https://github.com/open-mmlab/mmdetection/blob/master/configs/pascal_voc), [Cityscapes](https://github.com/open-mmlab/mmdetection/blob/master/configs/cityscapes), [OpenImages](https://github.com/open-mmlab/mmdetection/blob/master/configs/openimages) and [WIDER FACE](https://github.com/open-mmlab/mmdetection/blob/master/configs/wider_face). diff --git a/docs/zh_cn/get_started.md b/docs/zh_cn/get_started.md index b6a2c60f396..087d8b7a48c 100644 --- a/docs/zh_cn/get_started.md +++ b/docs/zh_cn/get_started.md @@ -12,6 +12,7 @@ MMDetection 和 MMCV 版本兼容性如下所示,需要安装正确的 MMCV | MMDetection 版本 | MMCV 版本 | |:--------------:|:-------------------------:| | master | mmcv-full>=1.3.17, <1.5.0 | +| 2.23.0 | mmcv-full>=1.3.17, <1.5.0 | | 2.22.0 | mmcv-full>=1.3.17, <1.5.0 | | 2.21.0 | mmcv-full>=1.3.17, <1.5.0 | | 2.20.0 | mmcv-full>=1.3.17, <1.5.0 | diff --git a/mmdet/version.py b/mmdet/version.py index 7618b043734..257768bb79c 100644 --- a/mmdet/version.py +++ b/mmdet/version.py @@ -1,6 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. -__version__ = '2.22.0' +__version__ = '2.23.0' short_version = __version__