From 86e2f125668811e9751fd6b49842cae995fc644b Mon Sep 17 00:00:00 2001 From: Won-Kyu Park Date: Wed, 3 May 2023 23:03:42 +0900 Subject: [PATCH 01/11] add configs from original repo https://huggingface.co/dustysys/ddetailer/blob/main/mmdet --- config/mmdet_anime-face_yolov3.py | 47 ++++ config/mmdet_dd-person_mask2former.py | 335 ++++++++++++++++++++++++++ 2 files changed, 382 insertions(+) create mode 100644 config/mmdet_anime-face_yolov3.py create mode 100644 config/mmdet_dd-person_mask2former.py diff --git a/config/mmdet_anime-face_yolov3.py b/config/mmdet_anime-face_yolov3.py new file mode 100644 index 0000000..c644633 --- /dev/null +++ b/config/mmdet_anime-face_yolov3.py @@ -0,0 +1,47 @@ +model = dict(type='YOLOV3', + backbone=dict(type='Darknet', depth=53, out_indices=(3, 4, 5)), + neck=dict(type='YOLOV3Neck', + num_scales=3, + in_channels=[1024, 512, 256], + out_channels=[512, 256, 128]), + bbox_head=dict(type='YOLOV3Head', + num_classes=1, + in_channels=[512, 256, 128], + out_channels=[1024, 512, 256], + anchor_generator=dict(type='YOLOAnchorGenerator', + base_sizes=[[(116, 90), + (156, 198), + (373, 326)], + [(30, 61), + (62, 45), + (59, 119)], + [(10, 13), + (16, 30), + (33, 23)]], + strides=[32, 16, 8]), + bbox_coder=dict(type='YOLOBBoxCoder'), + featmap_strides=[32, 16, 8]), + test_cfg=dict(nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + conf_thr=0.005, + nms=dict(type='nms', iou_threshold=0.45), + max_per_img=100)) +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='MultiScaleFlipAug', + img_scale=(608, 608), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', + mean=[0, 0, 0], + std=[255.0, 255.0, 255.0], + to_rgb=True), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) + ]) +] +data = dict(test=dict(pipeline=test_pipeline)) diff --git a/config/mmdet_dd-person_mask2former.py b/config/mmdet_dd-person_mask2former.py new file mode 100644 index 0000000..2abc201 --- /dev/null +++ b/config/mmdet_dd-person_mask2former.py @@ -0,0 +1,335 @@ +dataset_type = 'CocoDataset' +data_root = 'data/dd-person_mask2former/' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile', to_float32=True), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict( + type='Resize', + img_scale=(1024, 1024), + ratio_range=(0.1, 2.0), + multiscale_mode='range', + keep_ratio=True), + dict( + type='RandomCrop', + crop_size=(1024, 1024), + crop_type='absolute', + recompute_bbox=True, + allow_negative_crop=True), + dict( + type='FilterAnnotations', min_gt_bbox_wh=(1e-05, 1e-05), by_mask=True), + dict( + type='Pad', + size=(1024, 1024), + pad_val=dict(img=(128, 128, 128), masks=0, seg=255)), + dict( + type='Normalize', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + to_rgb=True), + dict(type='DefaultFormatBundle', img_to_float=True), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']) +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict( + type='Pad', + size_divisor=32, + pad_val=dict(img=(128, 128, 128), masks=0, seg=255)), + dict( + type='Normalize', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 
57.375], + to_rgb=True), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) + ]) +] +data = dict( + samples_per_gpu=1, + workers_per_gpu=1, + train=dict( + type='CocoDataset', + ann_file='data/dd-person_mask2former/annotations/train.json', + img_prefix='data/dd-person_mask2former/train/', + pipeline=[ + dict(type='LoadImageFromFile', to_float32=True), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict( + type='Resize', + img_scale=(1024, 1024), + ratio_range=(0.1, 2.0), + multiscale_mode='range', + keep_ratio=True), + dict( + type='RandomCrop', + crop_size=(1024, 1024), + crop_type='absolute', + recompute_bbox=True, + allow_negative_crop=True), + dict( + type='FilterAnnotations', + min_gt_bbox_wh=(1e-05, 1e-05), + by_mask=True), + dict( + type='Pad', + size=(1024, 1024), + pad_val=dict(img=(128, 128, 128), masks=0, seg=255)), + dict( + type='Normalize', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + to_rgb=True), + dict(type='DefaultFormatBundle', img_to_float=True), + dict( + type='Collect', + keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']) + ]), + val=dict( + type='CocoDataset', + ann_file='data/dd-person_mask2former/annotations/val.json', + img_prefix='data/dd-person_mask2former/val/', + pipeline=[ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict( + type='Pad', + size_divisor=32, + pad_val=dict(img=(128, 128, 128), masks=0, seg=255)), + dict( + type='Normalize', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + to_rgb=True), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) + ]) + ]), + test=dict( + type='CocoDataset', + ann_file='data/dd-person_mask2former/annotations/val.json', + img_prefix='data/dd-person_mask2former/val/', + pipeline=[ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict( + type='Pad', + size_divisor=32, + pad_val=dict(img=(128, 128, 128), masks=0, seg=255)), + dict( + type='Normalize', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + to_rgb=True), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) + ]) + ])) +evaluation = dict( + interval=2000, + metric=['bbox', 'segm'], + dynamic_intervals=[(400001, 400000)]) +checkpoint_config = dict( + interval=2000, by_epoch=False, save_last=True, max_keep_ckpts=10) +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook', by_epoch=False), + dict(type='TensorboardLoggerHook', by_epoch=False) + ]) +custom_hooks = [dict(type='NumClassCheckHook')] +dist_params = dict(backend='nccl') +log_level = 'INFO' +load_from = 'checkpoints/mask2former_r50_lsj_8x2_50e_coco_20220506_191028-8e96e88b.pth' +resume_from = 'checkpoints/mask2former_r50_lsj_8x2_50e_coco_20220506_191028-8e96e88b.pth' +workflow = [('train', 2000)] +opencv_num_threads = 0 +mp_start_method = 'fork' +auto_scale_lr = dict(enable=False, base_batch_size=16) +num_things_classes = 1 +num_stuff_classes = 0 +num_classes = 1 +model = dict( + type='Mask2Former', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=-1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='pytorch', + 
init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + panoptic_head=dict( + type='Mask2FormerHead', + in_channels=[256, 512, 1024, 2048], + strides=[4, 8, 16, 32], + feat_channels=256, + out_channels=256, + num_things_classes=1, + num_stuff_classes=0, + num_queries=100, + num_transformer_feat_level=3, + pixel_decoder=dict( + type='MSDeformAttnPixelDecoder', + num_outs=3, + norm_cfg=dict(type='GN', num_groups=32), + act_cfg=dict(type='ReLU'), + encoder=dict( + type='DetrTransformerEncoder', + num_layers=6, + transformerlayers=dict( + type='BaseTransformerLayer', + attn_cfgs=dict( + type='MultiScaleDeformableAttention', + embed_dims=256, + num_heads=8, + num_levels=3, + num_points=4, + im2col_step=64, + dropout=0.0, + batch_first=False, + norm_cfg=None, + init_cfg=None), + ffn_cfgs=dict( + type='FFN', + embed_dims=256, + feedforward_channels=1024, + num_fcs=2, + ffn_drop=0.0, + act_cfg=dict(type='ReLU', inplace=True)), + operation_order=('self_attn', 'norm', 'ffn', 'norm')), + init_cfg=None), + positional_encoding=dict( + type='SinePositionalEncoding', num_feats=128, normalize=True), + init_cfg=None), + enforce_decoder_input_project=False, + positional_encoding=dict( + type='SinePositionalEncoding', num_feats=128, normalize=True), + transformer_decoder=dict( + type='DetrTransformerDecoder', + return_intermediate=True, + num_layers=9, + transformerlayers=dict( + type='DetrTransformerDecoderLayer', + attn_cfgs=dict( + type='MultiheadAttention', + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=False), + ffn_cfgs=dict( + embed_dims=256, + feedforward_channels=2048, + num_fcs=2, + act_cfg=dict(type='ReLU', inplace=True), + ffn_drop=0.0, + dropout_layer=None, + add_identity=True), + feedforward_channels=2048, + operation_order=('cross_attn', 'norm', 'self_attn', 'norm', + 'ffn', 'norm')), + init_cfg=None), + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=2.0, + reduction='mean', + class_weight=[1.0, 0.1]), + loss_mask=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + reduction='mean', + loss_weight=5.0), + loss_dice=dict( + type='DiceLoss', + use_sigmoid=True, + activate=True, + reduction='mean', + naive_dice=True, + eps=1.0, + loss_weight=5.0)), + panoptic_fusion_head=dict( + type='MaskFormerFusionHead', + num_things_classes=1, + num_stuff_classes=0, + loss_panoptic=None, + init_cfg=None), + train_cfg=dict( + num_points=12544, + oversample_ratio=3.0, + importance_sample_ratio=0.75, + assigner=dict( + type='MaskHungarianAssigner', + cls_cost=dict(type='ClassificationCost', weight=2.0), + mask_cost=dict( + type='CrossEntropyLossCost', weight=5.0, use_sigmoid=True), + dice_cost=dict( + type='DiceCost', weight=5.0, pred_act=True, eps=1.0)), + sampler=dict(type='MaskPseudoSampler')), + test_cfg=dict( + panoptic_on=False, + semantic_on=False, + instance_on=True, + max_per_image=100, + iou_thr=0.8, + filter_low_score=True), + init_cfg=None) +image_size = (1024, 1024) +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optimizer = dict( + type='AdamW', + lr=0.0001, + weight_decay=0.05, + eps=1e-08, + betas=(0.9, 0.999), + paramwise_cfg=dict( + custom_keys=dict( + backbone=dict(lr_mult=0.1, decay_mult=1.0), + query_embed=dict(lr_mult=1.0, decay_mult=0.0), + query_feat=dict(lr_mult=1.0, decay_mult=0.0), + level_embed=dict(lr_mult=1.0, decay_mult=0.0)), + norm_decay_mult=0.0)) +optimizer_config = dict(grad_clip=dict(max_norm=0.01, norm_type=2)) +lr_config = dict( + policy='step', + gamma=0.1, + 
by_epoch=False, + step=[327778, 355092], + warmup='linear', + warmup_by_epoch=False, + warmup_ratio=1.0, + warmup_iters=10) +max_iters = 400000 +runner = dict(type='IterBasedRunner', max_iters=400000) +interval = 2000 +dynamic_intervals = [(400001, 400000)] +pad_cfg = dict(img=(128, 128, 128), masks=0, seg=255) +work_dir = './work_dirs\dd-person_mask2former' +auto_resume = False +gpu_ids = [0] From 01764f01a9af564b7586e8bceb9d1eff2e8d0b42 Mon Sep 17 00:00:00 2001 From: Won-Kyu Park Date: Thu, 4 May 2023 00:30:47 +0900 Subject: [PATCH 02/11] support partial update mmdet configs --- scripts/ddetailer.py | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/scripts/ddetailer.py b/scripts/ddetailer.py index 7841d8e..9d2fe28 100644 --- a/scripts/ddetailer.py +++ b/scripts/ddetailer.py @@ -4,6 +4,7 @@ from PIL import Image import numpy as np import gradio as gr +import shutil from modules import processing, images from modules import scripts, script_callbacks, shared, devices, modelloader @@ -49,14 +50,28 @@ def startup(): run(f'"{python}" -m mim install mmcv-full', desc=f"Installing mmcv-full", errdesc=f"Couldn't install mmcv-full") run(f'"{python}" -m pip install mmdet', desc=f"Installing mmdet", errdesc=f"Couldn't install mmdet") + bbox_path = os.path.join(dd_models_path, "bbox") + segm_path = os.path.join(dd_models_path, "segm") if (len(list_models(dd_models_path)) == 0): print("No detection models found, downloading...") - bbox_path = os.path.join(dd_models_path, "bbox") - segm_path = os.path.join(dd_models_path, "segm") load_file_from_url("https://huggingface.co/dustysys/ddetailer/resolve/main/mmdet/bbox/mmdet_anime-face_yolov3.pth", bbox_path) - load_file_from_url("https://huggingface.co/dustysys/ddetailer/raw/main/mmdet/bbox/mmdet_anime-face_yolov3.py", bbox_path) load_file_from_url("https://huggingface.co/dustysys/ddetailer/resolve/main/mmdet/segm/mmdet_dd-person_mask2former.pth", segm_path) - load_file_from_url("https://huggingface.co/dustysys/ddetailer/raw/main/mmdet/segm/mmdet_dd-person_mask2former.py", segm_path) + + import torch + print("Check config files...") + config_dir = os.path.join(scripts.basedir(), "config") + configs = [ "mmdet_anime-face_yolov3.py", "mmdet_dd-person_mask2former.py" ] + + destdir = bbox_path + for confpy in configs: + conf = os.path.join(config_dir, confpy) + dest = os.path.join(destdir, confpy) + if not os.path.exists(dest): + print(f"Copy config file: {confpy}..") + shutil.copy(conf, destdir) + destdir = segm_path + + print("Done") startup() From 0b8faf9b76605a14971dbf5d3fa44970f7b16d12 Mon Sep 17 00:00:00 2001 From: Won-Kyu Park Date: Thu, 4 May 2023 02:33:46 +0900 Subject: [PATCH 03/11] support mmdet v3 based on Bing-su's work --- scripts/ddetailer.py | 116 +++++++++++++++++++++++++++++++++---------- 1 file changed, 90 insertions(+), 26 deletions(-) diff --git a/scripts/ddetailer.py b/scripts/ddetailer.py index 9d2fe28..1fdfa1a 100644 --- a/scripts/ddetailer.py +++ b/scripts/ddetailer.py @@ -372,6 +372,8 @@ def create_segmask_preview(results, image): labels = results[0] bboxes = results[1] segms = results[2] + if not mmcv_legacy: + scores = results[3] cv2_image = np.array(image) cv2_image = cv2_image[:, :, ::-1].copy() @@ -389,7 +391,10 @@ def create_segmask_preview(results, image): cv2_image = np.where(cv2_mask_rgb == 255, color_image, cv2_image) text_color = tuple([int(x) for x in ( color[0][0] - 100 )]) name = labels[i] - score = bboxes[i][4] + if mmcv_legacy: + score = bboxes[i][4] + else: + score = scores[i] 
score = str(score)[:4] text = name + ":" + score cv2.putText(cv2_image, text, (centroid_x - 30, centroid_y), cv2.FONT_HERSHEY_DUPLEX, 0.4, text_color, 1, cv2.LINE_AA) @@ -470,9 +475,16 @@ def create_segmasks(results): return segmasks import mmcv -from mmdet.core import get_classes -from mmdet.apis import (inference_detector, + +try: + from mmdet.core import get_classes + from mmdet.apis import (inference_detector, init_detector) + mmcv_legacy = True +except ImportError: + from mmdet.evaluation import get_classes + from mmdet.apis import inference_detector, init_detector + mmcv_legacy = False def get_device(): device_id = shared.cmd_opts.device_id @@ -494,27 +506,52 @@ def inference_mmdet_segm(image, modelname, conf_thres, label): model_checkpoint = modelpath(modelname) model_config = os.path.splitext(model_checkpoint)[0] + ".py" model_device = get_device() - model = init_detector(model_config, model_checkpoint, device=model_device) - mmdet_results = inference_detector(model, np.array(image)) - bbox_results, segm_results = mmdet_results + if mmcv_legacy: + model = init_detector(model_config, model_checkpoint, device=model_device) + mmdet_results = inference_detector(model, np.array(image)) + bbox_results, segm_results = mmdet_results + else: + model = init_detector(model_config, model_checkpoint, palette="random", device=model_device) + mmdet_results = inference_detector(model, np.array(image)).pred_instances + bboxes = mmdet_results.bboxes.numpy() + dataset = modeldataset(modelname) classes = get_classes(dataset) - labels = [ - np.full(bbox.shape[0], i, dtype=np.int32) - for i, bbox in enumerate(bbox_results) - ] - n,m = bbox_results[0].shape + if mmcv_legacy: + labels = [ + np.full(bbox.shape[0], i, dtype=np.int32) + for i, bbox in enumerate(bbox_results) + ] + n, m = bbox_results[0].shape + else: + n, m = bboxes.shape if (n == 0): - return [[],[],[]] - labels = np.concatenate(labels) - bboxes = np.vstack(bbox_results) - segms = mmcv.concat_list(segm_results) - filter_inds = np.where(bboxes[:,-1] > conf_thres)[0] - results = [[],[],[]] + if mmcv_legacy: + return [[],[],[]] + else: + return [[],[],[],[]] + + if mmcv_legacy: + labels = np.concatenate(labels) + bboxes = np.vstack(bbox_results) + segms = mmcv.concat_list(segm_results) + + filter_inds = np.where(bboxes[:,-1] > conf_thres)[0] + results = [[],[],[]] + else: + labels = mmdet_results.labels + segms = mmdet_results.masks.numpy() + scores = mmdet_results.scores.numpy() + + filter_inds = np.where(mmdet_results.scores > conf_thres)[0] + results = [[],[],[],[]] + for i in filter_inds: results[0].append(label + "-" + classes[labels[i]]) results[1].append(bboxes[i]) results[2].append(segms[i]) + if not mmcv_legacy: + results[3].append(scores[i]) return results @@ -522,29 +559,56 @@ def inference_mmdet_bbox(image, modelname, conf_thres, label): model_checkpoint = modelpath(modelname) model_config = os.path.splitext(model_checkpoint)[0] + ".py" model_device = get_device() - model = init_detector(model_config, model_checkpoint, device=model_device) - results = inference_detector(model, np.array(image)) + + if mmcv_legacy: + model = init_detector(model_config, model_checkpoint, device=model_device) + results = inference_detector(model, np.array(image)) + else: + model = init_detector(model_config, model_checkpoint, device=model_device, palette="random") + output = inference_detector(model, np.array(image)).pred_instances cv2_image = np.array(image) cv2_image = cv2_image[:, :, ::-1].copy() cv2_gray = cv2.cvtColor(cv2_image, 
cv2.COLOR_BGR2GRAY) segms = [] - for (x0, y0, x1, y1, conf) in results[0]: + bboxes = [] + if mmcv_legacy: + for (x0, y0, x1, y1, conf) in results[0]: + bboxes.append([x0, y0, x1, y1]) + else: + bboxes = output.bboxes + + for x0, y0, x1, y1 in bboxes: cv2_mask = np.zeros((cv2_gray.shape), np.uint8) cv2.rectangle(cv2_mask, (int(x0), int(y0)), (int(x1), int(y1)), 255, -1) cv2_mask_bool = cv2_mask.astype(bool) segms.append(cv2_mask_bool) - - n,m = results[0].shape + + if mmcv_legacy: + n,m = results[0].shape + else: + n,m = output.bboxes.shape if (n == 0): - return [[],[],[]] - bboxes = np.vstack(results[0]) - filter_inds = np.where(bboxes[:,-1] > conf_thres)[0] - results = [[],[],[]] + if mmcv_legacy: + return [[],[],[]] + else: + return [[],[],[],[]] + if mmcv_legacy: + bboxes = np.vstack(results[0]) + filter_inds = np.where(bboxes[:,-1] > conf_thres)[0] + results = [[],[],[]] + else: + bboxes = output.bboxes.numpy() + scores = output.scores.numpy() + filter_inds = np.where(scores > conf_thres)[0] + results = [[],[],[],[]] + for i in filter_inds: results[0].append(label) results[1].append(bboxes[i]) results[2].append(segms[i]) + if not mmcv_legacy: + results[3].append(scores[i]) return results From 4f9f3c14a715bbeb4f359621ab0355c7d42cb059 Mon Sep 17 00:00:00 2001 From: Won-Kyu Park Date: Thu, 4 May 2023 12:20:14 +0900 Subject: [PATCH 04/11] add config files for mmdetection v3 --- config/coco_panoptic.py | 98 +++++++ ...k2former_r50_8xb2-lsj-50e_coco-panoptic.py | 265 ++++++++++++++++++ config/mmdet_anime-face_yolov3-v3.py | 150 ++++++++++ config/mmdet_dd-person_mask2former-v3.py | 105 +++++++ 4 files changed, 618 insertions(+) create mode 100644 config/coco_panoptic.py create mode 100644 config/mask2former_r50_8xb2-lsj-50e_coco-panoptic.py create mode 100644 config/mmdet_anime-face_yolov3-v3.py create mode 100644 config/mmdet_dd-person_mask2former-v3.py diff --git a/config/coco_panoptic.py b/config/coco_panoptic.py new file mode 100644 index 0000000..ea68126 --- /dev/null +++ b/config/coco_panoptic.py @@ -0,0 +1,98 @@ +# dataset settings +dataset_type = "CocoPanopticDataset" +# data_root = 'data/coco/' + +# Example to use different file client +# Method 1: simply set the data root and let the file I/O module +# automatically infer from prefix (not support LMDB and Memcache yet) + +data_root = "s3://openmmlab/datasets/detection/coco/" + +# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6 +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/detection/', +# 'data/': 's3://openmmlab/datasets/detection/' +# })) +backend_args = None + +train_pipeline = [ + dict(type="LoadImageFromFile", backend_args=backend_args), + dict(type="LoadPanopticAnnotations", backend_args=backend_args), + dict(type="Resize", scale=(1333, 800), keep_ratio=True), + dict(type="RandomFlip", prob=0.5), + dict(type="PackDetInputs"), +] +test_pipeline = [ + dict(type="LoadImageFromFile", backend_args=backend_args), + dict(type="Resize", scale=(1333, 800), keep_ratio=True), + dict(type="LoadPanopticAnnotations", backend_args=backend_args), + dict( + type="PackDetInputs", + meta_keys=("img_id", "img_path", "ori_shape", "img_shape", "scale_factor"), + ), +] + +train_dataloader = dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + sampler=dict(type="DefaultSampler", shuffle=True), + batch_sampler=dict(type="AspectRatioBatchSampler"), + dataset=dict( + type=dataset_type, + data_root=data_root, + 
ann_file="annotations/panoptic_train2017.json", + data_prefix=dict(img="train2017/", seg="annotations/panoptic_train2017/"), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline, + backend_args=backend_args, + ), +) +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type="DefaultSampler", shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file="annotations/panoptic_val2017.json", + data_prefix=dict(img="val2017/", seg="annotations/panoptic_val2017/"), + test_mode=True, + pipeline=test_pipeline, + backend_args=backend_args, + ), +) +test_dataloader = val_dataloader + +val_evaluator = dict( + type="CocoPanopticMetric", + ann_file=data_root + "annotations/panoptic_val2017.json", + seg_prefix=data_root + "annotations/panoptic_val2017/", + backend_args=backend_args, +) +test_evaluator = val_evaluator + +# inference on test dataset and +# format the output results for submission. +# test_dataloader = dict( +# batch_size=1, +# num_workers=1, +# persistent_workers=True, +# drop_last=False, +# sampler=dict(type='DefaultSampler', shuffle=False), +# dataset=dict( +# type=dataset_type, +# data_root=data_root, +# ann_file='annotations/panoptic_image_info_test-dev2017.json', +# data_prefix=dict(img='test2017/'), +# test_mode=True, +# pipeline=test_pipeline)) +# test_evaluator = dict( +# type='CocoPanopticMetric', +# format_only=True, +# ann_file=data_root + 'annotations/panoptic_image_info_test-dev2017.json', +# outfile_prefix='./work_dirs/coco_panoptic/test') diff --git a/config/mask2former_r50_8xb2-lsj-50e_coco-panoptic.py b/config/mask2former_r50_8xb2-lsj-50e_coco-panoptic.py new file mode 100644 index 0000000..b67d9b0 --- /dev/null +++ b/config/mask2former_r50_8xb2-lsj-50e_coco-panoptic.py @@ -0,0 +1,265 @@ +_base_ = ["./coco_panoptic.py"] +image_size = (1024, 1024) +batch_augments = [ + dict( + type="BatchFixedSizePad", + size=image_size, + img_pad_value=0, + pad_mask=True, + mask_pad_value=0, + pad_seg=True, + seg_pad_value=255, + ) +] +data_preprocessor = dict( + type="DetDataPreprocessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=32, + pad_mask=True, + mask_pad_value=0, + pad_seg=True, + seg_pad_value=255, + batch_augments=batch_augments, +) + +num_things_classes = 1 +num_stuff_classes = 0 +num_classes = num_things_classes + num_stuff_classes +model = dict( + type="Mask2Former", + data_preprocessor=data_preprocessor, + backbone=dict( + type="ResNet", + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=-1, + norm_cfg=dict(type="BN", requires_grad=False), + norm_eval=True, + style="pytorch", + init_cfg=dict(type="Pretrained", checkpoint="torchvision://resnet50"), + ), + panoptic_head=dict( + type="Mask2FormerHead", + in_channels=[256, 512, 1024, 2048], # pass to pixel_decoder inside + strides=[4, 8, 16, 32], + feat_channels=256, + out_channels=256, + num_things_classes=num_things_classes, + num_stuff_classes=num_stuff_classes, + num_queries=100, + num_transformer_feat_level=3, + pixel_decoder=dict( + type="MSDeformAttnPixelDecoder", + num_outs=3, + norm_cfg=dict(type="GN", num_groups=32), + act_cfg=dict(type="ReLU"), + encoder=dict( # DeformableDetrTransformerEncoder + num_layers=6, + layer_cfg=dict( # DeformableDetrTransformerEncoderLayer + self_attn_cfg=dict( # MultiScaleDeformableAttention + embed_dims=256, + num_heads=8, + num_levels=3, + num_points=4, + dropout=0.0, + batch_first=True, + 
), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=1024, + num_fcs=2, + ffn_drop=0.0, + act_cfg=dict(type="ReLU", inplace=True), + ), + ), + ), + positional_encoding=dict(num_feats=128, normalize=True), + ), + enforce_decoder_input_project=False, + positional_encoding=dict(num_feats=128, normalize=True), + transformer_decoder=dict( # Mask2FormerTransformerDecoder + return_intermediate=True, + num_layers=9, + layer_cfg=dict( # Mask2FormerTransformerDecoderLayer + self_attn_cfg=dict( # MultiheadAttention + embed_dims=256, num_heads=8, dropout=0.0, batch_first=True + ), + cross_attn_cfg=dict( # MultiheadAttention + embed_dims=256, num_heads=8, dropout=0.0, batch_first=True + ), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=2048, + num_fcs=2, + ffn_drop=0.0, + act_cfg=dict(type="ReLU", inplace=True), + ), + ), + init_cfg=None, + ), + loss_cls=dict( + type="CrossEntropyLoss", + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], + ), + loss_mask=dict( + type="CrossEntropyLoss", use_sigmoid=True, reduction="mean", loss_weight=5.0 + ), + loss_dice=dict( + type="DiceLoss", + use_sigmoid=True, + activate=True, + reduction="mean", + naive_dice=True, + eps=1.0, + loss_weight=5.0, + ), + ), + panoptic_fusion_head=dict( + type="MaskFormerFusionHead", + num_things_classes=num_things_classes, + num_stuff_classes=num_stuff_classes, + loss_panoptic=None, + init_cfg=None, + ), + train_cfg=dict( + num_points=12544, + oversample_ratio=3.0, + importance_sample_ratio=0.75, + assigner=dict( + type="HungarianAssigner", + match_costs=[ + dict(type="ClassificationCost", weight=2.0), + dict(type="CrossEntropyLossCost", weight=5.0, use_sigmoid=True), + dict(type="DiceCost", weight=5.0, pred_act=True, eps=1.0), + ], + ), + sampler=dict(type="MaskPseudoSampler"), + ), + test_cfg=dict( + panoptic_on=True, + # For now, the dataset does not support + # evaluating semantic segmentation metric. + semantic_on=False, + instance_on=True, + # max_per_image is for instance segmentation. + max_per_image=100, + iou_thr=0.8, + # In Mask2Former's panoptic postprocessing, + # it will filter mask area where score is less than 0.5 . 
+ filter_low_score=True, + ), + init_cfg=None, +) + +# dataset settings +data_root = "data/coco/" +train_pipeline = [ + dict( + type="LoadImageFromFile", to_float32=True, backend_args={{_base_.backend_args}} + ), + dict( + type="LoadPanopticAnnotations", + with_bbox=True, + with_mask=True, + with_seg=True, + backend_args={{_base_.backend_args}}, + ), + dict(type="RandomFlip", prob=0.5), + # large scale jittering + dict( + type="RandomResize", scale=image_size, ratio_range=(0.1, 2.0), keep_ratio=True + ), + dict( + type="RandomCrop", + crop_size=image_size, + crop_type="absolute", + recompute_bbox=True, + allow_negative_crop=True, + ), + dict(type="PackDetInputs"), +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) + +val_evaluator = [ + dict( + type="CocoPanopticMetric", + ann_file=data_root + "annotations/panoptic_val2017.json", + seg_prefix=data_root + "annotations/panoptic_val2017/", + backend_args={{_base_.backend_args}}, + ), + dict( + type="CocoMetric", + ann_file=data_root + "annotations/instances_val2017.json", + metric=["bbox", "segm"], + backend_args={{_base_.backend_args}}, + ), +] +test_evaluator = val_evaluator + +# optimizer +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + type="OptimWrapper", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "backbone": dict(lr_mult=0.1, decay_mult=1.0), + "query_embed": embed_multi, + "query_feat": embed_multi, + "level_embed": embed_multi, + }, + norm_decay_mult=0.0, + ), + clip_grad=dict(max_norm=0.01, norm_type=2), +) + +# learning policy +max_iters = 368750 +param_scheduler = dict( + type="MultiStepLR", + begin=0, + end=max_iters, + by_epoch=False, + milestones=[327778, 355092], + gamma=0.1, +) + +# Before 365001th iteration, we do evaluation every 5000 iterations. +# After 365000th iteration, we do evaluation every 368750 iterations, +# which means that we do evaluation at the end of training. +interval = 5000 +dynamic_intervals = [(max_iters // interval * interval + 1, max_iters)] +train_cfg = dict( + type="IterBasedTrainLoop", + max_iters=max_iters, + val_interval=interval, + dynamic_intervals=dynamic_intervals, +) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") + +default_hooks = dict( + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + save_last=True, + max_keep_ckpts=3, + interval=interval, + ) +) +log_processor = dict(type="LogProcessor", window_size=50, by_epoch=False) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (2 samples per GPU). 
+auto_scale_lr = dict(enable=False, base_batch_size=16) diff --git a/config/mmdet_anime-face_yolov3-v3.py b/config/mmdet_anime-face_yolov3-v3.py new file mode 100644 index 0000000..6499e9a --- /dev/null +++ b/config/mmdet_anime-face_yolov3-v3.py @@ -0,0 +1,150 @@ +# _base_ = ["../_base_/schedules/schedule_1x.py", "../_base_/default_runtime.py"] +# model settings +data_preprocessor = dict(type='DetDataPreprocessor', + mean=[0, 0, 0], + std=[255.0, 255.0, 255.0], + bgr_to_rgb=True, + pad_size_divisor=32) + +model = dict(type='YOLOV3', + data_preprocessor=data_preprocessor, + backbone=dict(type='Darknet', depth=53, out_indices=(3, 4, 5), + init_cfg=dict(type='Pretrained', checkpoint='open-mmlab://darknet53')), + neck=dict(type='YOLOV3Neck', + num_scales=3, + in_channels=[1024, 512, 256], + out_channels=[512, 256, 128]), + bbox_head=dict(type='YOLOV3Head', + num_classes=1, + in_channels=[512, 256, 128], + out_channels=[1024, 512, 256], + anchor_generator=dict(type='YOLOAnchorGenerator', + base_sizes=[[(116, 90), + (156, 198), + (373, 326)], + [(30, 61), + (62, 45), + (59, 119)], + [(10, 13), + (16, 30), + (33, 23)]], + strides=[32, 16, 8]), + bbox_coder=dict(type='YOLOBBoxCoder'), + featmap_strides=[32, 16, 8], + loss_cls=dict(type='CrossEntropyLoss', + use_sigmoid=True, loss_weight=1.0, reduction='sum'), + loss_conf=dict(type='CrossEntropyLoss', + use_sigmoid=True, loss_weight=1.0, reduction='sum'), + loss_xy=dict(type='CrossEntropyLoss', + use_sigmoid=True, loss_weight=2.0, reduction='sum'), + loss_wh=dict(type='MSELoss', loss_weight=2.0, reduction='sum')), + + # training and testing settings + train_cfg=dict( + assigner=dict(type='GridAssigner', + pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0)), + test_cfg=dict(nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + conf_thr=0.005, + nms=dict(type='nms', iou_threshold=0.45), + max_per_img=100)) + +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' + +# Example to use different file client +# Method 1: simply set the data root and let the file I/O module +# automatically infer from prefix (not support LMDB and Memcache yet) + +# data_root = 's3://openmmlab/datasets/detection/coco/' + +# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6 +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/detection/', +# 'data/': 's3://openmmlab/datasets/detection/' +# })) +backend_args = None + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='Expand', + mean=data_preprocessor['mean'], + to_rgb=data_preprocessor['bgr_to_rgb'], + ratio_range=(1, 2)), + dict(type='MinIoURandomCrop', + min_ious=(0.4, 0.5, 0.6, 0.7, 0.8, 0.9), + min_crop_size=0.3), + dict(type='RandomResize', scale=[(320, 320), (608, 608)], keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='PackDetInputs'), +] + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='Resize', scale=(608, 608), keep_ratio=True), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'scale_factor')), +] + +train_dataloader=dict( + batch_size=8, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + batch_sampler=dict(type='AspectRatioBatchSampler'), + dataset=dict(type=dataset_type, + data_root=data_root, 
+ ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline, + backend_args=backend_args)) + +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict(type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=test_pipeline, + backend_args=backend_args)) + +test_dataloader = val_dataloader + +val_evaluator = dict(type='CocoMetric', + ann_file=data_root + 'annotations/instances_val2017.json', + metric='bbox', + backend_args=backend_args) +test_evaluator = val_evaluator + +train_cfg = dict(max_epochs=273, val_interval=7) + +# optimizer +optim_wrapper = dict(type='OptimWrapper', + optimizer=dict(type='SGD', lr=0.001, momentum=0.9, weight_decay=0.0005), + clip_grad=dict(max_norm=35, norm_type=2)) + +# learning policy +param_scheduler = [ + dict(type='LinearLR', start_factor=0.1, by_epoch=False, begin=0, end=2000), + dict(type='MultiStepLR', by_epoch=True, milestones=[218, 246], gamma=0.1), +] + +default_hooks = dict(checkpoint=dict(type='CheckpointHook', interval=7)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (8 samples per GPU) +auto_scale_lr=dict(base_batch_size=64) diff --git a/config/mmdet_dd-person_mask2former-v3.py b/config/mmdet_dd-person_mask2former-v3.py new file mode 100644 index 0000000..375d45d --- /dev/null +++ b/config/mmdet_dd-person_mask2former-v3.py @@ -0,0 +1,105 @@ +_base_ = ['./mask2former_r50_8xb2-lsj-50e_coco-panoptic.py'] + +num_things_classes = 1 +num_stuff_classes = 0 +num_classes = num_things_classes + num_stuff_classes +image_size = (1024, 1024) +batch_augments = [ + dict( + type='BatchFixedSizePad', + size=image_size, + img_pad_value=0, + pad_mask=True, + mask_pad_value=0, + pad_seg=False, + ) +] +data_preprocessor = dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=32, + pad_mask=True, + mask_pad_value=0, + pad_seg=False, + batch_augments=batch_augments, +) +model = dict( + data_preprocessor=data_preprocessor, + panoptic_head=dict( + num_things_classes=num_things_classes, + num_stuff_classes=num_stuff_classes, + loss_cls=dict(class_weight=[1.0] * num_classes + [0.1]), + ), + panoptic_fusion_head=dict( + num_things_classes=num_things_classes, num_stuff_classes=num_stuff_classes + ), + test_cfg=dict(panoptic_on=False), +) + +# dataset settings +train_pipeline = [ + dict(type='LoadImageFromFile', to_float32=True, backend_args=None), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict(type='RandomFlip', prob=0.5), + # large scale jittering + dict( + type='RandomResize', + scale=image_size, + ratio_range=(0.1, 2.0), + resize_type='Resize', + keep_ratio=True, + ), + dict( + type='RandomCrop', + crop_size=image_size, + crop_type='absolute', + recompute_bbox=True, + allow_negative_crop=True, + ), + dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-5, 1e-5), by_mask=True), + dict(type='PackDetInputs'), +] + +test_pipeline = [ + dict(type='LoadImageFromFile', to_float32=True, backend_args=None), + dict(type='Resize', scale=(1333, 800), keep_ratio=True), + # If you don't have a gt annotation, delete the pipeline + dict(type='LoadAnnotations', with_bbox=True, 
with_mask=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'scale_factor'), + ), +] + +dataset_type = 'CocoDataset' +data_root = 'data/coco/' + +train_dataloader = dict( + dataset=dict( + type=dataset_type, + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + ) +) +val_dataloader = dict( + dataset=dict( + type=dataset_type, + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + pipeline=test_pipeline, + ) +) +test_dataloader = val_dataloader + +val_evaluator = dict( + _delete_=True, + type='CocoMetric', + ann_file=data_root + 'annotations/instances_val2017.json', + metric=['bbox', 'segm'], + format_only=False, + backend_args=None, +) +test_evaluator = val_evaluator From 131f182431b031764d27a2ac2973b87e0d78b811 Mon Sep 17 00:00:00 2001 From: Won-Kyu Park Date: Thu, 4 May 2023 12:32:58 +0900 Subject: [PATCH 05/11] update for mmdet v3 --- scripts/ddetailer.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/scripts/ddetailer.py b/scripts/ddetailer.py index 1fdfa1a..6534c88 100644 --- a/scripts/ddetailer.py +++ b/scripts/ddetailer.py @@ -58,17 +58,23 @@ def startup(): load_file_from_url("https://huggingface.co/dustysys/ddetailer/resolve/main/mmdet/segm/mmdet_dd-person_mask2former.pth", segm_path) import torch + legacy = torch.__version__.split(".")[0] < "2" print("Check config files...") config_dir = os.path.join(scripts.basedir(), "config") - configs = [ "mmdet_anime-face_yolov3.py", "mmdet_dd-person_mask2former.py" ] + if legacy: + configs = [ "mmdet_anime-face_yolov3.py", "mmdet_dd-person_mask2former.py" ] + else: + configs = [ "mmdet_anime-face_yolov3-v3.py", "mmdet_dd-person_mask2former-v3.py", "mask2former_r50_8xb2-lsj-50e_coco-panoptic.py", "coco_panoptic.py" ] destdir = bbox_path for confpy in configs: conf = os.path.join(config_dir, confpy) + if not legacy: + confpy = confpy.replace("-v3.py", ".py") dest = os.path.join(destdir, confpy) if not os.path.exists(dest): print(f"Copy config file: {confpy}..") - shutil.copy(conf, destdir) + shutil.copy(conf, dest) destdir = segm_path print("Done") From ce29b9a93af6cb4e911563c5286df38f0ce842d3 Mon Sep 17 00:00:00 2001 From: Won-Kyu Park Date: Thu, 4 May 2023 14:21:17 +0900 Subject: [PATCH 06/11] check the pytorch version and conditially install mmdet --- scripts/ddetailer.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/scripts/ddetailer.py b/scripts/ddetailer.py index 6534c88..64c94b5 100644 --- a/scripts/ddetailer.py +++ b/scripts/ddetailer.py @@ -44,11 +44,17 @@ def modeltitle(path, shorthash): def startup(): from launch import is_installed, run + import torch + legacy = torch.__version__.split(".")[0] < "2" if not is_installed("mmdet"): python = sys.executable run(f'"{python}" -m pip install -U openmim', desc="Installing openmim", errdesc="Couldn't install openmim") - run(f'"{python}" -m mim install mmcv-full', desc=f"Installing mmcv-full", errdesc=f"Couldn't install mmcv-full") - run(f'"{python}" -m pip install mmdet', desc=f"Installing mmdet", errdesc=f"Couldn't install mmdet") + if legacy: + run(f'"{python}" -m mim install mmcv-full', desc=f"Installing mmcv-full", errdesc=f"Couldn't install mmcv-full") + run(f'"{python}" -m pip install mmdet==2.28.2', desc=f"Installing mmdet", errdesc=f"Couldn't install mmdet") + else: + run(f'"{python}" -m mim install mmcv>==2.0.0', desc=f"Installing mmcv", errdesc=f"Couldn't install mmcv") + 
run(f'"{python}" -m pip install mmdet', desc=f"Installing mmdet", errdesc=f"Couldn't install mmdet") bbox_path = os.path.join(dd_models_path, "bbox") segm_path = os.path.join(dd_models_path, "segm") @@ -57,8 +63,6 @@ def startup(): load_file_from_url("https://huggingface.co/dustysys/ddetailer/resolve/main/mmdet/bbox/mmdet_anime-face_yolov3.pth", bbox_path) load_file_from_url("https://huggingface.co/dustysys/ddetailer/resolve/main/mmdet/segm/mmdet_dd-person_mask2former.pth", segm_path) - import torch - legacy = torch.__version__.split(".")[0] < "2" print("Check config files...") config_dir = os.path.join(scripts.basedir(), "config") if legacy: From afeb6aa4203d08bc5fb90bd6926e2f45053eb895 Mon Sep 17 00:00:00 2001 From: Won-Kyu Park Date: Fri, 5 May 2023 00:04:34 +0900 Subject: [PATCH 07/11] specify the mmdet version --- scripts/ddetailer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/ddetailer.py b/scripts/ddetailer.py index 64c94b5..b21b7d6 100644 --- a/scripts/ddetailer.py +++ b/scripts/ddetailer.py @@ -54,7 +54,7 @@ def startup(): run(f'"{python}" -m pip install mmdet==2.28.2', desc=f"Installing mmdet", errdesc=f"Couldn't install mmdet") else: run(f'"{python}" -m mim install mmcv>==2.0.0', desc=f"Installing mmcv", errdesc=f"Couldn't install mmcv") - run(f'"{python}" -m pip install mmdet', desc=f"Installing mmdet", errdesc=f"Couldn't install mmdet") + run(f'"{python}" -m pip install mmdet>=3', desc=f"Installing mmdet", errdesc=f"Couldn't install mmdet") bbox_path = os.path.join(dd_models_path, "bbox") segm_path = os.path.join(dd_models_path, "segm") From 754c0b3601d53a7f883aedecfd6fa4ebfbf339b2 Mon Sep 17 00:00:00 2001 From: Won-Kyu Park Date: Fri, 5 May 2023 00:05:08 +0900 Subject: [PATCH 08/11] update num_things_classes=80 for the new model file --- config/mask2former_r50_8xb2-lsj-50e_coco-panoptic.py | 2 +- config/mmdet_dd-person_mask2former-v3.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/config/mask2former_r50_8xb2-lsj-50e_coco-panoptic.py b/config/mask2former_r50_8xb2-lsj-50e_coco-panoptic.py index b67d9b0..882ef7b 100644 --- a/config/mask2former_r50_8xb2-lsj-50e_coco-panoptic.py +++ b/config/mask2former_r50_8xb2-lsj-50e_coco-panoptic.py @@ -24,7 +24,7 @@ batch_augments=batch_augments, ) -num_things_classes = 1 +num_things_classes = 80 num_stuff_classes = 0 num_classes = num_things_classes + num_stuff_classes model = dict( diff --git a/config/mmdet_dd-person_mask2former-v3.py b/config/mmdet_dd-person_mask2former-v3.py index 375d45d..ab1cfc3 100644 --- a/config/mmdet_dd-person_mask2former-v3.py +++ b/config/mmdet_dd-person_mask2former-v3.py @@ -1,6 +1,6 @@ _base_ = ['./mask2former_r50_8xb2-lsj-50e_coco-panoptic.py'] -num_things_classes = 1 +num_things_classes = 80 num_stuff_classes = 0 num_classes = num_things_classes + num_stuff_classes image_size = (1024, 1024) From 9238cf84d423eb1709db0a23f004d4e6d5f6f883 Mon Sep 17 00:00:00 2001 From: Won-Kyu Park Date: Fri, 5 May 2023 00:12:52 +0900 Subject: [PATCH 09/11] download the segm model file from openmmlab.com for mmdet v3 Please see https://github.com/open-mmlab/mmdetection/tree/main/configs/mask2former#instance-segmentation for R-50 backbone --- scripts/ddetailer.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/scripts/ddetailer.py b/scripts/ddetailer.py index b21b7d6..a58c736 100644 --- a/scripts/ddetailer.py +++ b/scripts/ddetailer.py @@ -61,7 +61,13 @@ def startup(): if (len(list_models(dd_models_path)) == 0): print("No detection models 
found, downloading...") load_file_from_url("https://huggingface.co/dustysys/ddetailer/resolve/main/mmdet/bbox/mmdet_anime-face_yolov3.pth", bbox_path) - load_file_from_url("https://huggingface.co/dustysys/ddetailer/resolve/main/mmdet/segm/mmdet_dd-person_mask2former.pth", segm_path) + if legacy: + load_file_from_url("https://huggingface.co/dustysys/ddetailer/resolve/main/mmdet/segm/mmdet_dd-person_mask2former.pth", segm_path) + else: + load_file_from_url( + "https://download.openmmlab.com/mmdetection/v3.0/mask2former/mask2former_r50_8xb2-lsj-50e_coco/mask2former_r50_8xb2-lsj-50e_coco_20220506_191028-41b088b6.pth", + segm_path, + file_name="mmdet_dd-person_mask2former.pth") print("Check config files...") config_dir = os.path.join(scripts.basedir(), "config") From b8ec582900a8558fe7d5613ebdef0152ab1ac351 Mon Sep 17 00:00:00 2001 From: Won-Kyu Park Date: Fri, 28 Apr 2023 12:29:11 +0900 Subject: [PATCH 10/11] make postprocess() extension --- scripts/ddetailer.py | 254 +++++++++++++++++++++++++------------------ 1 file changed, 151 insertions(+), 103 deletions(-) diff --git a/scripts/ddetailer.py b/scripts/ddetailer.py index a58c736..107e7af 100644 --- a/scripts/ddetailer.py +++ b/scripts/ddetailer.py @@ -6,6 +6,7 @@ import gradio as gr import shutil +from copy import copy from modules import processing, images from modules import scripts, script_callbacks, shared, devices, modelloader from modules.processing import Processed, StableDiffusionProcessingTxt2Img, StableDiffusionProcessingImg2Img @@ -95,100 +96,132 @@ def gr_show(visible=True): return {"visible": visible, "__type__": "update"} class DetectionDetailerScript(scripts.Script): + def __init__(self): + super().__init__() + def title(self): return "Detection Detailer" def show(self, is_img2img): - return True + return scripts.AlwaysVisible def ui(self, is_img2img): import modules.ui - model_list = list_models(dd_models_path) - model_list.insert(0, "None") - if is_img2img: - info = gr.HTML("
<p style=\"margin-bottom:0.75em\">Recommended settings: Use from inpaint tab, inpaint at full res ON, denoise <0.5</p>
") - else: - info = gr.HTML("") - with gr.Group(): - with gr.Row(): - dd_model_a = gr.Dropdown(label="Primary detection model (A)", choices=model_list,value = "None", visible=True, type="value") - + with gr.Accordion("Detection Detailer", open=False): with gr.Row(): - dd_conf_a = gr.Slider(label='Detection confidence threshold % (A)', minimum=0, maximum=100, step=1, value=30, visible=False) - dd_dilation_factor_a = gr.Slider(label='Dilation factor (A)', minimum=0, maximum=255, step=1, value=4, visible=False) + enabled = gr.Checkbox(label="Enable", value=False, visible=True) - with gr.Row(): - dd_offset_x_a = gr.Slider(label='X offset (A)', minimum=-200, maximum=200, step=1, value=0, visible=False) - dd_offset_y_a = gr.Slider(label='Y offset (A)', minimum=-200, maximum=200, step=1, value=0, visible=False) - - with gr.Row(): - dd_preprocess_b = gr.Checkbox(label='Inpaint model B detections before model A runs', value=False, visible=False) - dd_bitwise_op = gr.Radio(label='Bitwise operation', choices=['None', 'A&B', 'A-B'], value="None", visible=False) - - br = gr.HTML("
") + model_list = list_models(dd_models_path) + model_list.insert(0, "None") + if is_img2img: + info = gr.HTML("
<p style=\"margin-bottom:0.75em\">Recommended settings: Use from inpaint tab, inpaint at full res ON, denoise <0.5</p>
") + else: + info = gr.HTML("") + with gr.Group(): + with gr.Row(): + dd_model_a = gr.Dropdown(label="Primary detection model (A)", choices=model_list,value = "None", visible=True, type="value") + + with gr.Row(): + dd_conf_a = gr.Slider(label='Detection confidence threshold % (A)', minimum=0, maximum=100, step=1, value=30, visible=False) + dd_dilation_factor_a = gr.Slider(label='Dilation factor (A)', minimum=0, maximum=255, step=1, value=4, visible=False) + + with gr.Row(): + dd_offset_x_a = gr.Slider(label='X offset (A)', minimum=-200, maximum=200, step=1, value=0, visible=False) + dd_offset_y_a = gr.Slider(label='Y offset (A)', minimum=-200, maximum=200, step=1, value=0, visible=False) + + with gr.Row(): + dd_preprocess_b = gr.Checkbox(label='Inpaint model B detections before model A runs', value=False, visible=False) + dd_bitwise_op = gr.Radio(label='Bitwise operation', choices=['None', 'A&B', 'A-B'], value="None", visible=False) + + br = gr.HTML("
") + + with gr.Group(): + with gr.Row(): + dd_model_b = gr.Dropdown(label="Secondary detection model (B) (optional)", choices=model_list,value = "None", visible =False, type="value") + + with gr.Row(): + dd_conf_b = gr.Slider(label='Detection confidence threshold % (B)', minimum=0, maximum=100, step=1, value=30, visible=False) + dd_dilation_factor_b = gr.Slider(label='Dilation factor (B)', minimum=0, maximum=255, step=1, value=4, visible=False) + + with gr.Row(): + dd_offset_x_b = gr.Slider(label='X offset (B)', minimum=-200, maximum=200, step=1, value=0, visible=False) + dd_offset_y_b = gr.Slider(label='Y offset (B)', minimum=-200, maximum=200, step=1, value=0, visible=False) + + with gr.Group(): + with gr.Row(): + dd_mask_blur = gr.Slider(label='Mask blur ', minimum=0, maximum=64, step=1, value=4, visible=(not is_img2img)) + dd_denoising_strength = gr.Slider(label='Denoising strength (Inpaint)', minimum=0.0, maximum=1.0, step=0.01, value=0.4, visible=(not is_img2img)) + + with gr.Row(): + dd_inpaint_full_res = gr.Checkbox(label='Inpaint at full resolution ', value=True, visible = (not is_img2img)) + dd_inpaint_full_res_padding = gr.Slider(label='Inpaint at full resolution padding, pixels ', minimum=0, maximum=256, step=4, value=32, visible=(not is_img2img)) + + dd_model_a.change( + lambda modelname: { + dd_model_b:gr_show( modelname != "None" ), + dd_conf_a:gr_show( modelname != "None" ), + dd_dilation_factor_a:gr_show( modelname != "None"), + dd_offset_x_a:gr_show( modelname != "None" ), + dd_offset_y_a:gr_show( modelname != "None" ) + + }, + inputs= [dd_model_a], + outputs =[dd_model_b, dd_conf_a, dd_dilation_factor_a, dd_offset_x_a, dd_offset_y_a] + ) + + dd_model_b.change( + lambda modelname: { + dd_preprocess_b:gr_show( modelname != "None" ), + dd_bitwise_op:gr_show( modelname != "None" ), + dd_conf_b:gr_show( modelname != "None" ), + dd_dilation_factor_b:gr_show( modelname != "None"), + dd_offset_x_b:gr_show( modelname != "None" ), + dd_offset_y_b:gr_show( modelname != "None" ) + }, + inputs= [dd_model_b], + outputs =[dd_preprocess_b, dd_bitwise_op, dd_conf_b, dd_dilation_factor_b, dd_offset_x_b, dd_offset_y_b] + ) + + return [info, enabled, + dd_model_a, + dd_conf_a, dd_dilation_factor_a, + dd_offset_x_a, dd_offset_y_a, + dd_preprocess_b, dd_bitwise_op, + br, + dd_model_b, + dd_conf_b, dd_dilation_factor_b, + dd_offset_x_b, dd_offset_y_b, + dd_mask_blur, dd_denoising_strength, + dd_inpaint_full_res, dd_inpaint_full_res_padding + ] + + def get_seed(self, p) -> tuple[int, int]: + i = p.iteration + + if not p.all_seeds: + seed = p.seed + elif i < len(p.all_seeds): + seed = p.all_seeds[i] + else: + j = i % len(p.all_seeds) + seed = p.all_seeds[j] - with gr.Group(): - with gr.Row(): - dd_model_b = gr.Dropdown(label="Secondary detection model (B) (optional)", choices=model_list,value = "None", visible =False, type="value") + if not p.all_subseeds: + subseed = p.subseed + elif i < len(p.all_subseeds): + subseed = p.all_subseeds[i] + else: + j = i % len(p.all_subseeds) + subseed = p.all_subseeds[j] - with gr.Row(): - dd_conf_b = gr.Slider(label='Detection confidence threshold % (B)', minimum=0, maximum=100, step=1, value=30, visible=False) - dd_dilation_factor_b = gr.Slider(label='Dilation factor (B)', minimum=0, maximum=255, step=1, value=4, visible=False) - - with gr.Row(): - dd_offset_x_b = gr.Slider(label='X offset (B)', minimum=-200, maximum=200, step=1, value=0, visible=False) - dd_offset_y_b = gr.Slider(label='Y offset (B)', minimum=-200, maximum=200, step=1, value=0, 
visible=False) - - with gr.Group(): - with gr.Row(): - dd_mask_blur = gr.Slider(label='Mask blur ', minimum=0, maximum=64, step=1, value=4, visible=(not is_img2img)) - dd_denoising_strength = gr.Slider(label='Denoising strength (Inpaint)', minimum=0.0, maximum=1.0, step=0.01, value=0.4, visible=(not is_img2img)) - - with gr.Row(): - dd_inpaint_full_res = gr.Checkbox(label='Inpaint at full resolution ', value=True, visible = (not is_img2img)) - dd_inpaint_full_res_padding = gr.Slider(label='Inpaint at full resolution padding, pixels ', minimum=0, maximum=256, step=4, value=32, visible=(not is_img2img)) - - dd_model_a.change( - lambda modelname: { - dd_model_b:gr_show( modelname != "None" ), - dd_conf_a:gr_show( modelname != "None" ), - dd_dilation_factor_a:gr_show( modelname != "None"), - dd_offset_x_a:gr_show( modelname != "None" ), - dd_offset_y_a:gr_show( modelname != "None" ) - - }, - inputs= [dd_model_a], - outputs =[dd_model_b, dd_conf_a, dd_dilation_factor_a, dd_offset_x_a, dd_offset_y_a] - ) - - dd_model_b.change( - lambda modelname: { - dd_preprocess_b:gr_show( modelname != "None" ), - dd_bitwise_op:gr_show( modelname != "None" ), - dd_conf_b:gr_show( modelname != "None" ), - dd_dilation_factor_b:gr_show( modelname != "None"), - dd_offset_x_b:gr_show( modelname != "None" ), - dd_offset_y_b:gr_show( modelname != "None" ) - }, - inputs= [dd_model_b], - outputs =[dd_preprocess_b, dd_bitwise_op, dd_conf_b, dd_dilation_factor_b, dd_offset_x_b, dd_offset_y_b] - ) - - return [info, - dd_model_a, - dd_conf_a, dd_dilation_factor_a, - dd_offset_x_a, dd_offset_y_a, - dd_preprocess_b, dd_bitwise_op, - br, - dd_model_b, - dd_conf_b, dd_dilation_factor_b, - dd_offset_x_b, dd_offset_y_b, - dd_mask_blur, dd_denoising_strength, - dd_inpaint_full_res, dd_inpaint_full_res_padding - ] + return seed, subseed + + def process(self, p, *args): + if getattr(p, "_disable_ddetailer", False): + return - def run(self, p, info, + def postprocess_image(self, p, pp, info, enabled, dd_model_a, dd_conf_a, dd_dilation_factor_a, dd_offset_x_a, dd_offset_y_a, @@ -200,21 +233,27 @@ def run(self, p, info, dd_mask_blur, dd_denoising_strength, dd_inpaint_full_res, dd_inpaint_full_res_padding): - processing.fix_seed(p) + if getattr(p, "_disable_ddetailer", False): + return + + if not enabled: + return + initial_info = None - seed = p.seed - p.batch_size = 1 - ddetail_count = p.n_iter - p.n_iter = 1 - p.do_not_save_grid = True - p.do_not_save_samples = True + seed, subseed = self.get_seed(p) + p.seed = seed + p.subseed = subseed + + info = "" + ddetail_count = 1 + is_txt2img = isinstance(p, StableDiffusionProcessingTxt2Img) + p_txt = copy(p) if (not is_txt2img): orig_image = p.init_images[0] else: - p_txt = p p = StableDiffusionProcessingImg2Img( - init_images = None, + init_images = [pp.image], resize_mode = 0, denoising_strength = dd_denoising_strength, mask = None, @@ -235,7 +274,8 @@ def run(self, p, info, seed_resize_from_h=p_txt.seed_resize_from_h, seed_resize_from_w=p_txt.seed_resize_from_w, sampler_name=p_txt.sampler_name, - n_iter=p_txt.n_iter, + batch_size=1, + n_iter=1, steps=p_txt.steps, cfg_scale=p_txt.cfg_scale, width=p_txt.width, @@ -244,16 +284,18 @@ def run(self, p, info, ) p.do_not_save_grid = True p.do_not_save_samples = True + + p._disable_ddetailer = True + output_images = [] state.job_count = ddetail_count for n in range(ddetail_count): devices.torch_gc() start_seed = seed + n if ( is_txt2img ): - print(f"Processing initial image for output generation {n + 1}.") - p_txt.seed = start_seed - processed = 
processing.process_images(p_txt) - init_image = processed.images[0] + print(f"Prepare initial image for output generation {p_txt.iteration + 1}.") + init_image = copy(pp.image) + info = processing.create_infotext(p_txt, p_txt.all_prompts, p_txt.all_seeds, p_txt.all_subseeds, None, 0, 0) else: init_image = orig_image @@ -276,7 +318,7 @@ def run(self, p, info, images.save_image(segmask_preview_b, opts.outdir_ddetailer_previews, "", start_seed, p.prompt, opts.samples_format, p=p) gen_count = len(masks_b_pre) state.job_count += gen_count - print(f"Processing {gen_count} model {label_b_pre} detections for output generation {n + 1}.") + print(f"Processing {gen_count} model {label_b_pre} detections for output generation {p_txt.iteration + 1}.") p.seed = start_seed p.init_images = [init_image] @@ -285,7 +327,12 @@ def run(self, p, info, if ( opts.dd_save_masks): images.save_image(masks_b_pre[i], opts.outdir_ddetailer_masks, "", start_seed, p.prompt, opts.samples_format, p=p) processed = processing.process_images(p) + + if not is_txt2img: + p.prompt = processed.all_prompts[0] + info = processed.info p.seed = processed.seed + 1 + p.subseed = processed.subseed + 1 p.init_images = processed.images if (gen_count > 0): @@ -293,7 +340,7 @@ def run(self, p, info, init_image = processed.images[0] else: - print(f"No model B detections for output generation {n} with current settings.") + print(f"No model B detections for output generation {p_txt.iteration + 1} with current settings.") # Primary run if (dd_model_a != "None"): @@ -335,7 +382,7 @@ def run(self, p, info, images.save_image(segmask_preview_a, opts.outdir_ddetailer_previews, "", start_seed, p.prompt, opts.samples_format, p=p) gen_count = len(masks_a) state.job_count += gen_count - print(f"Processing {gen_count} model {label_a} detections for output generation {n + 1}.") + print(f"Processing {gen_count} model {label_a} detections for output generation {p_txt.iteration + 1}.") p.seed = start_seed p.init_images = [init_image] @@ -347,21 +394,22 @@ def run(self, p, info, processed = processing.process_images(p) if initial_info is None: initial_info = processed.info + info = processed.info p.seed = processed.seed + 1 + p.subseed = processed.subseed + 1 p.init_images = processed.images if (gen_count > 0): output_images[n] = processed.images[0] - if ( opts.samples_save ): - images.save_image(processed.images[0], p.outpath_samples, "", start_seed, p.prompt, opts.samples_format, info=initial_info, p=p) else: - print(f"No model {label_a} detections for output generation {n} with current settings.") - state.job = f"Generation {n + 1} out of {state.job_count}" + print(f"No model {label_a} detections for output generation {p_txt.iteration + 1} with current settings.") + state.job = f"Generation {p_txt.iteration + 1} out of {state.job_count}" if (initial_info is None): initial_info = "No detections found." - return Processed(p, output_images, seed, initial_info) + if len(output_images) > 0: + pp.image = output_images[0] def modeldataset(model_shortname): path = modelpath(model_shortname) From d1a9108e3ca76be8b41965afd55f54b3fe3a992a Mon Sep 17 00:00:00 2001 From: Won-Kyu Park Date: Sat, 20 May 2023 20:37:22 +0900 Subject: [PATCH 11/11] no need to check is_txt2img. 
cleanup --- scripts/ddetailer.py | 82 +++++++++++++++++++------------------------- 1 file changed, 36 insertions(+), 46 deletions(-) diff --git a/scripts/ddetailer.py b/scripts/ddetailer.py index 107e7af..bf61cff 100644 --- a/scripts/ddetailer.py +++ b/scripts/ddetailer.py @@ -247,43 +247,40 @@ def postprocess_image(self, p, pp, info, enabled, info = "" ddetail_count = 1 - is_txt2img = isinstance(p, StableDiffusionProcessingTxt2Img) p_txt = copy(p) - if (not is_txt2img): - orig_image = p.init_images[0] - else: - p = StableDiffusionProcessingImg2Img( - init_images = [pp.image], - resize_mode = 0, - denoising_strength = dd_denoising_strength, - mask = None, - mask_blur= dd_mask_blur, - inpainting_fill = 1, - inpaint_full_res = dd_inpaint_full_res, - inpaint_full_res_padding= dd_inpaint_full_res_padding, - inpainting_mask_invert= 0, - sd_model=p_txt.sd_model, - outpath_samples=p_txt.outpath_samples, - outpath_grids=p_txt.outpath_grids, - prompt=p_txt.prompt, - negative_prompt=p_txt.negative_prompt, - styles=p_txt.styles, - seed=p_txt.seed, - subseed=p_txt.subseed, - subseed_strength=p_txt.subseed_strength, - seed_resize_from_h=p_txt.seed_resize_from_h, - seed_resize_from_w=p_txt.seed_resize_from_w, - sampler_name=p_txt.sampler_name, - batch_size=1, - n_iter=1, - steps=p_txt.steps, - cfg_scale=p_txt.cfg_scale, - width=p_txt.width, - height=p_txt.height, - tiling=p_txt.tiling, - ) - p.do_not_save_grid = True - p.do_not_save_samples = True + + p = StableDiffusionProcessingImg2Img( + init_images = [pp.image], + resize_mode = 0, + denoising_strength = dd_denoising_strength, + mask = None, + mask_blur= dd_mask_blur, + inpainting_fill = 1, + inpaint_full_res = dd_inpaint_full_res, + inpaint_full_res_padding= dd_inpaint_full_res_padding, + inpainting_mask_invert= 0, + sd_model=p_txt.sd_model, + outpath_samples=p_txt.outpath_samples, + outpath_grids=p_txt.outpath_grids, + prompt=p_txt.prompt, + negative_prompt=p_txt.negative_prompt, + styles=p_txt.styles, + seed=p_txt.seed, + subseed=p_txt.subseed, + subseed_strength=p_txt.subseed_strength, + seed_resize_from_h=p_txt.seed_resize_from_h, + seed_resize_from_w=p_txt.seed_resize_from_w, + sampler_name=p_txt.sampler_name, + batch_size=1, + n_iter=1, + steps=p_txt.steps, + cfg_scale=p_txt.cfg_scale, + width=p_txt.width, + height=p_txt.height, + tiling=p_txt.tiling, + ) + p.do_not_save_grid = True + p.do_not_save_samples = True p._disable_ddetailer = True @@ -292,13 +289,9 @@ def postprocess_image(self, p, pp, info, enabled, for n in range(ddetail_count): devices.torch_gc() start_seed = seed + n - if ( is_txt2img ): - print(f"Prepare initial image for output generation {p_txt.iteration + 1}.") - init_image = copy(pp.image) - info = processing.create_infotext(p_txt, p_txt.all_prompts, p_txt.all_seeds, p_txt.all_subseeds, None, 0, 0) - else: - init_image = orig_image - + init_image = copy(pp.image) + info = processing.create_infotext(p_txt, p_txt.all_prompts, p_txt.all_seeds, p_txt.all_subseeds, None, 0, 0) + output_images.append(init_image) masks_a = [] masks_b_pre = [] @@ -328,9 +321,6 @@ def postprocess_image(self, p, pp, info, enabled, images.save_image(masks_b_pre[i], opts.outdir_ddetailer_masks, "", start_seed, p.prompt, opts.samples_format, p=p) processed = processing.process_images(p) - if not is_txt2img: - p.prompt = processed.all_prompts[0] - info = processed.info p.seed = processed.seed + 1 p.subseed = processed.subseed + 1 p.init_images = processed.images
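
Note on the compatibility scheme this series converges on: a single
mmcv_legacy flag selects between the mmdet 2.x stack (mmcv-full,
mmdet==2.28.2, get_classes in mmdet.core) and the mmdet 3.x stack
(mmcv>=2.0.0, mmdet>=3, get_classes in mmdet.evaluation), and both
inference APIs are normalized to the same [labels, bboxes, segms, scores]
result layout. The sketch below distills that idea outside the extension;
detect_bboxes and its arguments are illustrative names for this note, not
code from the patches, and it pins the model to the CPU so result tensors
convert with .numpy() directly.

    import numpy as np

    try:
        # mmdet 2.x (with mmcv-full): mmdet.core exists only in the 2.x API
        from mmdet.core import get_classes
        from mmdet.apis import inference_detector, init_detector
        mmcv_legacy = True
    except ImportError:
        # mmdet 3.x (with mmcv >= 2.0.0): get_classes moved to mmdet.evaluation
        from mmdet.evaluation import get_classes
        from mmdet.apis import inference_detector, init_detector
        mmcv_legacy = False

    def detect_bboxes(image, config_py, checkpoint_pth, conf_thres=0.3):
        """Run a bbox-only model; return (names, bboxes, scores) on either API."""
        model = init_detector(config_py, checkpoint_pth, device="cpu")
        raw = inference_detector(model, np.array(image))
        classes = get_classes("coco")  # label index -> class-name lookup
        if mmcv_legacy:
            # v2 result: one (N, 5) array per class, columns x0, y0, x1, y1, score
            labels = np.concatenate(
                [np.full(b.shape[0], i, dtype=np.int32) for i, b in enumerate(raw)])
            stacked = np.vstack(raw)
            bboxes, scores = stacked[:, :4], stacked[:, 4]
        else:
            # v3 result: a DetDataSample whose predictions live in .pred_instances
            inst = raw.pred_instances
            labels = inst.labels.numpy()
            bboxes = inst.bboxes.numpy()
            scores = inst.scores.numpy()
        keep = scores > conf_thres
        return [classes[i] for i in labels[keep]], bboxes[keep], scores[keep]

The same flag drives installation (patch 06: mim install mmcv-full plus
mmdet==2.28.2 on the legacy path, mmcv>=2.0.0 plus mmdet>=3 otherwise) and
which config variant (the *-v3.py files from patch 04) gets copied next to
the downloaded .pth checkpoints.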