From 86e2f125668811e9751fd6b49842cae995fc644b Mon Sep 17 00:00:00 2001 From: Won-Kyu Park Date: Wed, 3 May 2023 23:03:42 +0900 Subject: [PATCH 01/11] add configs from original repo https://huggingface.co/dustysys/ddetailer/blob/main/mmdet --- config/mmdet_anime-face_yolov3.py | 47 ++++ config/mmdet_dd-person_mask2former.py | 335 ++++++++++++++++++++++++++ 2 files changed, 382 insertions(+) create mode 100644 config/mmdet_anime-face_yolov3.py create mode 100644 config/mmdet_dd-person_mask2former.py diff --git a/config/mmdet_anime-face_yolov3.py b/config/mmdet_anime-face_yolov3.py new file mode 100644 index 0000000..c644633 --- /dev/null +++ b/config/mmdet_anime-face_yolov3.py @@ -0,0 +1,47 @@ +model = dict(type='YOLOV3', + backbone=dict(type='Darknet', depth=53, out_indices=(3, 4, 5)), + neck=dict(type='YOLOV3Neck', + num_scales=3, + in_channels=[1024, 512, 256], + out_channels=[512, 256, 128]), + bbox_head=dict(type='YOLOV3Head', + num_classes=1, + in_channels=[512, 256, 128], + out_channels=[1024, 512, 256], + anchor_generator=dict(type='YOLOAnchorGenerator', + base_sizes=[[(116, 90), + (156, 198), + (373, 326)], + [(30, 61), + (62, 45), + (59, 119)], + [(10, 13), + (16, 30), + (33, 23)]], + strides=[32, 16, 8]), + bbox_coder=dict(type='YOLOBBoxCoder'), + featmap_strides=[32, 16, 8]), + test_cfg=dict(nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + conf_thr=0.005, + nms=dict(type='nms', iou_threshold=0.45), + max_per_img=100)) +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='MultiScaleFlipAug', + img_scale=(608, 608), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', + mean=[0, 0, 0], + std=[255.0, 255.0, 255.0], + to_rgb=True), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) + ]) +] +data = dict(test=dict(pipeline=test_pipeline)) diff --git a/config/mmdet_dd-person_mask2former.py b/config/mmdet_dd-person_mask2former.py new file mode 100644 index 0000000..2abc201 --- /dev/null +++ b/config/mmdet_dd-person_mask2former.py @@ -0,0 +1,335 @@ +dataset_type = 'CocoDataset' +data_root = 'data/dd-person_mask2former/' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile', to_float32=True), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict( + type='Resize', + img_scale=(1024, 1024), + ratio_range=(0.1, 2.0), + multiscale_mode='range', + keep_ratio=True), + dict( + type='RandomCrop', + crop_size=(1024, 1024), + crop_type='absolute', + recompute_bbox=True, + allow_negative_crop=True), + dict( + type='FilterAnnotations', min_gt_bbox_wh=(1e-05, 1e-05), by_mask=True), + dict( + type='Pad', + size=(1024, 1024), + pad_val=dict(img=(128, 128, 128), masks=0, seg=255)), + dict( + type='Normalize', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + to_rgb=True), + dict(type='DefaultFormatBundle', img_to_float=True), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']) +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict( + type='Pad', + size_divisor=32, + pad_val=dict(img=(128, 128, 128), masks=0, seg=255)), + dict( + type='Normalize', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 
57.375], + to_rgb=True), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) + ]) +] +data = dict( + samples_per_gpu=1, + workers_per_gpu=1, + train=dict( + type='CocoDataset', + ann_file='data/dd-person_mask2former/annotations/train.json', + img_prefix='data/dd-person_mask2former/train/', + pipeline=[ + dict(type='LoadImageFromFile', to_float32=True), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict( + type='Resize', + img_scale=(1024, 1024), + ratio_range=(0.1, 2.0), + multiscale_mode='range', + keep_ratio=True), + dict( + type='RandomCrop', + crop_size=(1024, 1024), + crop_type='absolute', + recompute_bbox=True, + allow_negative_crop=True), + dict( + type='FilterAnnotations', + min_gt_bbox_wh=(1e-05, 1e-05), + by_mask=True), + dict( + type='Pad', + size=(1024, 1024), + pad_val=dict(img=(128, 128, 128), masks=0, seg=255)), + dict( + type='Normalize', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + to_rgb=True), + dict(type='DefaultFormatBundle', img_to_float=True), + dict( + type='Collect', + keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']) + ]), + val=dict( + type='CocoDataset', + ann_file='data/dd-person_mask2former/annotations/val.json', + img_prefix='data/dd-person_mask2former/val/', + pipeline=[ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict( + type='Pad', + size_divisor=32, + pad_val=dict(img=(128, 128, 128), masks=0, seg=255)), + dict( + type='Normalize', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + to_rgb=True), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) + ]) + ]), + test=dict( + type='CocoDataset', + ann_file='data/dd-person_mask2former/annotations/val.json', + img_prefix='data/dd-person_mask2former/val/', + pipeline=[ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict( + type='Pad', + size_divisor=32, + pad_val=dict(img=(128, 128, 128), masks=0, seg=255)), + dict( + type='Normalize', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + to_rgb=True), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) + ]) + ])) +evaluation = dict( + interval=2000, + metric=['bbox', 'segm'], + dynamic_intervals=[(400001, 400000)]) +checkpoint_config = dict( + interval=2000, by_epoch=False, save_last=True, max_keep_ckpts=10) +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook', by_epoch=False), + dict(type='TensorboardLoggerHook', by_epoch=False) + ]) +custom_hooks = [dict(type='NumClassCheckHook')] +dist_params = dict(backend='nccl') +log_level = 'INFO' +load_from = 'checkpoints/mask2former_r50_lsj_8x2_50e_coco_20220506_191028-8e96e88b.pth' +resume_from = 'checkpoints/mask2former_r50_lsj_8x2_50e_coco_20220506_191028-8e96e88b.pth' +workflow = [('train', 2000)] +opencv_num_threads = 0 +mp_start_method = 'fork' +auto_scale_lr = dict(enable=False, base_batch_size=16) +num_things_classes = 1 +num_stuff_classes = 0 +num_classes = 1 +model = dict( + type='Mask2Former', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=-1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='pytorch', + 
init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + panoptic_head=dict( + type='Mask2FormerHead', + in_channels=[256, 512, 1024, 2048], + strides=[4, 8, 16, 32], + feat_channels=256, + out_channels=256, + num_things_classes=1, + num_stuff_classes=0, + num_queries=100, + num_transformer_feat_level=3, + pixel_decoder=dict( + type='MSDeformAttnPixelDecoder', + num_outs=3, + norm_cfg=dict(type='GN', num_groups=32), + act_cfg=dict(type='ReLU'), + encoder=dict( + type='DetrTransformerEncoder', + num_layers=6, + transformerlayers=dict( + type='BaseTransformerLayer', + attn_cfgs=dict( + type='MultiScaleDeformableAttention', + embed_dims=256, + num_heads=8, + num_levels=3, + num_points=4, + im2col_step=64, + dropout=0.0, + batch_first=False, + norm_cfg=None, + init_cfg=None), + ffn_cfgs=dict( + type='FFN', + embed_dims=256, + feedforward_channels=1024, + num_fcs=2, + ffn_drop=0.0, + act_cfg=dict(type='ReLU', inplace=True)), + operation_order=('self_attn', 'norm', 'ffn', 'norm')), + init_cfg=None), + positional_encoding=dict( + type='SinePositionalEncoding', num_feats=128, normalize=True), + init_cfg=None), + enforce_decoder_input_project=False, + positional_encoding=dict( + type='SinePositionalEncoding', num_feats=128, normalize=True), + transformer_decoder=dict( + type='DetrTransformerDecoder', + return_intermediate=True, + num_layers=9, + transformerlayers=dict( + type='DetrTransformerDecoderLayer', + attn_cfgs=dict( + type='MultiheadAttention', + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=False), + ffn_cfgs=dict( + embed_dims=256, + feedforward_channels=2048, + num_fcs=2, + act_cfg=dict(type='ReLU', inplace=True), + ffn_drop=0.0, + dropout_layer=None, + add_identity=True), + feedforward_channels=2048, + operation_order=('cross_attn', 'norm', 'self_attn', 'norm', + 'ffn', 'norm')), + init_cfg=None), + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=2.0, + reduction='mean', + class_weight=[1.0, 0.1]), + loss_mask=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + reduction='mean', + loss_weight=5.0), + loss_dice=dict( + type='DiceLoss', + use_sigmoid=True, + activate=True, + reduction='mean', + naive_dice=True, + eps=1.0, + loss_weight=5.0)), + panoptic_fusion_head=dict( + type='MaskFormerFusionHead', + num_things_classes=1, + num_stuff_classes=0, + loss_panoptic=None, + init_cfg=None), + train_cfg=dict( + num_points=12544, + oversample_ratio=3.0, + importance_sample_ratio=0.75, + assigner=dict( + type='MaskHungarianAssigner', + cls_cost=dict(type='ClassificationCost', weight=2.0), + mask_cost=dict( + type='CrossEntropyLossCost', weight=5.0, use_sigmoid=True), + dice_cost=dict( + type='DiceCost', weight=5.0, pred_act=True, eps=1.0)), + sampler=dict(type='MaskPseudoSampler')), + test_cfg=dict( + panoptic_on=False, + semantic_on=False, + instance_on=True, + max_per_image=100, + iou_thr=0.8, + filter_low_score=True), + init_cfg=None) +image_size = (1024, 1024) +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optimizer = dict( + type='AdamW', + lr=0.0001, + weight_decay=0.05, + eps=1e-08, + betas=(0.9, 0.999), + paramwise_cfg=dict( + custom_keys=dict( + backbone=dict(lr_mult=0.1, decay_mult=1.0), + query_embed=dict(lr_mult=1.0, decay_mult=0.0), + query_feat=dict(lr_mult=1.0, decay_mult=0.0), + level_embed=dict(lr_mult=1.0, decay_mult=0.0)), + norm_decay_mult=0.0)) +optimizer_config = dict(grad_clip=dict(max_norm=0.01, norm_type=2)) +lr_config = dict( + policy='step', + gamma=0.1, + 
by_epoch=False, + step=[327778, 355092], + warmup='linear', + warmup_by_epoch=False, + warmup_ratio=1.0, + warmup_iters=10) +max_iters = 400000 +runner = dict(type='IterBasedRunner', max_iters=400000) +interval = 2000 +dynamic_intervals = [(400001, 400000)] +pad_cfg = dict(img=(128, 128, 128), masks=0, seg=255) +work_dir = './work_dirs\dd-person_mask2former' +auto_resume = False +gpu_ids = [0] From 01764f01a9af564b7586e8bceb9d1eff2e8d0b42 Mon Sep 17 00:00:00 2001 From: Won-Kyu Park Date: Thu, 4 May 2023 00:30:47 +0900 Subject: [PATCH 02/11] support partial update mmdet configs --- scripts/ddetailer.py | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/scripts/ddetailer.py b/scripts/ddetailer.py index 7841d8e..9d2fe28 100644 --- a/scripts/ddetailer.py +++ b/scripts/ddetailer.py @@ -4,6 +4,7 @@ from PIL import Image import numpy as np import gradio as gr +import shutil from modules import processing, images from modules import scripts, script_callbacks, shared, devices, modelloader @@ -49,14 +50,28 @@ def startup(): run(f'"{python}" -m mim install mmcv-full', desc=f"Installing mmcv-full", errdesc=f"Couldn't install mmcv-full") run(f'"{python}" -m pip install mmdet', desc=f"Installing mmdet", errdesc=f"Couldn't install mmdet") + bbox_path = os.path.join(dd_models_path, "bbox") + segm_path = os.path.join(dd_models_path, "segm") if (len(list_models(dd_models_path)) == 0): print("No detection models found, downloading...") - bbox_path = os.path.join(dd_models_path, "bbox") - segm_path = os.path.join(dd_models_path, "segm") load_file_from_url("https://huggingface.co/dustysys/ddetailer/resolve/main/mmdet/bbox/mmdet_anime-face_yolov3.pth", bbox_path) - load_file_from_url("https://huggingface.co/dustysys/ddetailer/raw/main/mmdet/bbox/mmdet_anime-face_yolov3.py", bbox_path) load_file_from_url("https://huggingface.co/dustysys/ddetailer/resolve/main/mmdet/segm/mmdet_dd-person_mask2former.pth", segm_path) - load_file_from_url("https://huggingface.co/dustysys/ddetailer/raw/main/mmdet/segm/mmdet_dd-person_mask2former.py", segm_path) + + import torch + print("Check config files...") + config_dir = os.path.join(scripts.basedir(), "config") + configs = [ "mmdet_anime-face_yolov3.py", "mmdet_dd-person_mask2former.py" ] + + destdir = bbox_path + for confpy in configs: + conf = os.path.join(config_dir, confpy) + dest = os.path.join(destdir, confpy) + if not os.path.exists(dest): + print(f"Copy config file: {confpy}..") + shutil.copy(conf, destdir) + destdir = segm_path + + print("Done") startup() From 0b8faf9b76605a14971dbf5d3fa44970f7b16d12 Mon Sep 17 00:00:00 2001 From: Won-Kyu Park Date: Thu, 4 May 2023 02:33:46 +0900 Subject: [PATCH 03/11] support mmdet v3 based on Bing-su's work --- scripts/ddetailer.py | 116 +++++++++++++++++++++++++++++++++---------- 1 file changed, 90 insertions(+), 26 deletions(-) diff --git a/scripts/ddetailer.py b/scripts/ddetailer.py index 9d2fe28..1fdfa1a 100644 --- a/scripts/ddetailer.py +++ b/scripts/ddetailer.py @@ -372,6 +372,8 @@ def create_segmask_preview(results, image): labels = results[0] bboxes = results[1] segms = results[2] + if not mmcv_legacy: + scores = results[3] cv2_image = np.array(image) cv2_image = cv2_image[:, :, ::-1].copy() @@ -389,7 +391,10 @@ def create_segmask_preview(results, image): cv2_image = np.where(cv2_mask_rgb == 255, color_image, cv2_image) text_color = tuple([int(x) for x in ( color[0][0] - 100 )]) name = labels[i] - score = bboxes[i][4] + if mmcv_legacy: + score = bboxes[i][4] + else: + score = scores[i] 
score = str(score)[:4] text = name + ":" + score cv2.putText(cv2_image, text, (centroid_x - 30, centroid_y), cv2.FONT_HERSHEY_DUPLEX, 0.4, text_color, 1, cv2.LINE_AA) @@ -470,9 +475,16 @@ def create_segmasks(results): return segmasks import mmcv -from mmdet.core import get_classes -from mmdet.apis import (inference_detector, + +try: + from mmdet.core import get_classes + from mmdet.apis import (inference_detector, init_detector) + mmcv_legacy = True +except ImportError: + from mmdet.evaluation import get_classes + from mmdet.apis import inference_detector, init_detector + mmcv_legacy = False def get_device(): device_id = shared.cmd_opts.device_id @@ -494,27 +506,52 @@ def inference_mmdet_segm(image, modelname, conf_thres, label): model_checkpoint = modelpath(modelname) model_config = os.path.splitext(model_checkpoint)[0] + ".py" model_device = get_device() - model = init_detector(model_config, model_checkpoint, device=model_device) - mmdet_results = inference_detector(model, np.array(image)) - bbox_results, segm_results = mmdet_results + if mmcv_legacy: + model = init_detector(model_config, model_checkpoint, device=model_device) + mmdet_results = inference_detector(model, np.array(image)) + bbox_results, segm_results = mmdet_results + else: + model = init_detector(model_config, model_checkpoint, palette="random", device=model_device) + mmdet_results = inference_detector(model, np.array(image)).pred_instances + bboxes = mmdet_results.bboxes.numpy() + dataset = modeldataset(modelname) classes = get_classes(dataset) - labels = [ - np.full(bbox.shape[0], i, dtype=np.int32) - for i, bbox in enumerate(bbox_results) - ] - n,m = bbox_results[0].shape + if mmcv_legacy: + labels = [ + np.full(bbox.shape[0], i, dtype=np.int32) + for i, bbox in enumerate(bbox_results) + ] + n, m = bbox_results[0].shape + else: + n, m = bboxes.shape if (n == 0): - return [[],[],[]] - labels = np.concatenate(labels) - bboxes = np.vstack(bbox_results) - segms = mmcv.concat_list(segm_results) - filter_inds = np.where(bboxes[:,-1] > conf_thres)[0] - results = [[],[],[]] + if mmcv_legacy: + return [[],[],[]] + else: + return [[],[],[],[]] + + if mmcv_legacy: + labels = np.concatenate(labels) + bboxes = np.vstack(bbox_results) + segms = mmcv.concat_list(segm_results) + + filter_inds = np.where(bboxes[:,-1] > conf_thres)[0] + results = [[],[],[]] + else: + labels = mmdet_results.labels + segms = mmdet_results.masks.numpy() + scores = mmdet_results.scores.numpy() + + filter_inds = np.where(mmdet_results.scores > conf_thres)[0] + results = [[],[],[],[]] + for i in filter_inds: results[0].append(label + "-" + classes[labels[i]]) results[1].append(bboxes[i]) results[2].append(segms[i]) + if not mmcv_legacy: + results[3].append(scores[i]) return results @@ -522,29 +559,56 @@ def inference_mmdet_bbox(image, modelname, conf_thres, label): model_checkpoint = modelpath(modelname) model_config = os.path.splitext(model_checkpoint)[0] + ".py" model_device = get_device() - model = init_detector(model_config, model_checkpoint, device=model_device) - results = inference_detector(model, np.array(image)) + + if mmcv_legacy: + model = init_detector(model_config, model_checkpoint, device=model_device) + results = inference_detector(model, np.array(image)) + else: + model = init_detector(model_config, model_checkpoint, device=model_device, palette="random") + output = inference_detector(model, np.array(image)).pred_instances cv2_image = np.array(image) cv2_image = cv2_image[:, :, ::-1].copy() cv2_gray = cv2.cvtColor(cv2_image, 
cv2.COLOR_BGR2GRAY) segms = [] - for (x0, y0, x1, y1, conf) in results[0]: + bboxes = [] + if mmcv_legacy: + for (x0, y0, x1, y1, conf) in results[0]: + bboxes.append([x0, y0, x1, y1]) + else: + bboxes = output.bboxes + + for x0, y0, x1, y1 in bboxes: cv2_mask = np.zeros((cv2_gray.shape), np.uint8) cv2.rectangle(cv2_mask, (int(x0), int(y0)), (int(x1), int(y1)), 255, -1) cv2_mask_bool = cv2_mask.astype(bool) segms.append(cv2_mask_bool) - - n,m = results[0].shape + + if mmcv_legacy: + n,m = results[0].shape + else: + n,m = output.bboxes.shape if (n == 0): - return [[],[],[]] - bboxes = np.vstack(results[0]) - filter_inds = np.where(bboxes[:,-1] > conf_thres)[0] - results = [[],[],[]] + if mmcv_legacy: + return [[],[],[]] + else: + return [[],[],[],[]] + if mmcv_legacy: + bboxes = np.vstack(results[0]) + filter_inds = np.where(bboxes[:,-1] > conf_thres)[0] + results = [[],[],[]] + else: + bboxes = output.bboxes.numpy() + scores = output.scores.numpy() + filter_inds = np.where(scores > conf_thres)[0] + results = [[],[],[],[]] + for i in filter_inds: results[0].append(label) results[1].append(bboxes[i]) results[2].append(segms[i]) + if not mmcv_legacy: + results[3].append(scores[i]) return results From 4f9f3c14a715bbeb4f359621ab0355c7d42cb059 Mon Sep 17 00:00:00 2001 From: Won-Kyu Park Date: Thu, 4 May 2023 12:20:14 +0900 Subject: [PATCH 04/11] add config files for mmdetection v3 --- config/coco_panoptic.py | 98 +++++++ ...k2former_r50_8xb2-lsj-50e_coco-panoptic.py | 265 ++++++++++++++++++ config/mmdet_anime-face_yolov3-v3.py | 150 ++++++++++ config/mmdet_dd-person_mask2former-v3.py | 105 +++++++ 4 files changed, 618 insertions(+) create mode 100644 config/coco_panoptic.py create mode 100644 config/mask2former_r50_8xb2-lsj-50e_coco-panoptic.py create mode 100644 config/mmdet_anime-face_yolov3-v3.py create mode 100644 config/mmdet_dd-person_mask2former-v3.py diff --git a/config/coco_panoptic.py b/config/coco_panoptic.py new file mode 100644 index 0000000..ea68126 --- /dev/null +++ b/config/coco_panoptic.py @@ -0,0 +1,98 @@ +# dataset settings +dataset_type = "CocoPanopticDataset" +# data_root = 'data/coco/' + +# Example to use different file client +# Method 1: simply set the data root and let the file I/O module +# automatically infer from prefix (not support LMDB and Memcache yet) + +data_root = "s3://openmmlab/datasets/detection/coco/" + +# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6 +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/detection/', +# 'data/': 's3://openmmlab/datasets/detection/' +# })) +backend_args = None + +train_pipeline = [ + dict(type="LoadImageFromFile", backend_args=backend_args), + dict(type="LoadPanopticAnnotations", backend_args=backend_args), + dict(type="Resize", scale=(1333, 800), keep_ratio=True), + dict(type="RandomFlip", prob=0.5), + dict(type="PackDetInputs"), +] +test_pipeline = [ + dict(type="LoadImageFromFile", backend_args=backend_args), + dict(type="Resize", scale=(1333, 800), keep_ratio=True), + dict(type="LoadPanopticAnnotations", backend_args=backend_args), + dict( + type="PackDetInputs", + meta_keys=("img_id", "img_path", "ori_shape", "img_shape", "scale_factor"), + ), +] + +train_dataloader = dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + sampler=dict(type="DefaultSampler", shuffle=True), + batch_sampler=dict(type="AspectRatioBatchSampler"), + dataset=dict( + type=dataset_type, + data_root=data_root, + 
ann_file="annotations/panoptic_train2017.json", + data_prefix=dict(img="train2017/", seg="annotations/panoptic_train2017/"), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline, + backend_args=backend_args, + ), +) +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type="DefaultSampler", shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file="annotations/panoptic_val2017.json", + data_prefix=dict(img="val2017/", seg="annotations/panoptic_val2017/"), + test_mode=True, + pipeline=test_pipeline, + backend_args=backend_args, + ), +) +test_dataloader = val_dataloader + +val_evaluator = dict( + type="CocoPanopticMetric", + ann_file=data_root + "annotations/panoptic_val2017.json", + seg_prefix=data_root + "annotations/panoptic_val2017/", + backend_args=backend_args, +) +test_evaluator = val_evaluator + +# inference on test dataset and +# format the output results for submission. +# test_dataloader = dict( +# batch_size=1, +# num_workers=1, +# persistent_workers=True, +# drop_last=False, +# sampler=dict(type='DefaultSampler', shuffle=False), +# dataset=dict( +# type=dataset_type, +# data_root=data_root, +# ann_file='annotations/panoptic_image_info_test-dev2017.json', +# data_prefix=dict(img='test2017/'), +# test_mode=True, +# pipeline=test_pipeline)) +# test_evaluator = dict( +# type='CocoPanopticMetric', +# format_only=True, +# ann_file=data_root + 'annotations/panoptic_image_info_test-dev2017.json', +# outfile_prefix='./work_dirs/coco_panoptic/test') diff --git a/config/mask2former_r50_8xb2-lsj-50e_coco-panoptic.py b/config/mask2former_r50_8xb2-lsj-50e_coco-panoptic.py new file mode 100644 index 0000000..b67d9b0 --- /dev/null +++ b/config/mask2former_r50_8xb2-lsj-50e_coco-panoptic.py @@ -0,0 +1,265 @@ +_base_ = ["./coco_panoptic.py"] +image_size = (1024, 1024) +batch_augments = [ + dict( + type="BatchFixedSizePad", + size=image_size, + img_pad_value=0, + pad_mask=True, + mask_pad_value=0, + pad_seg=True, + seg_pad_value=255, + ) +] +data_preprocessor = dict( + type="DetDataPreprocessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=32, + pad_mask=True, + mask_pad_value=0, + pad_seg=True, + seg_pad_value=255, + batch_augments=batch_augments, +) + +num_things_classes = 1 +num_stuff_classes = 0 +num_classes = num_things_classes + num_stuff_classes +model = dict( + type="Mask2Former", + data_preprocessor=data_preprocessor, + backbone=dict( + type="ResNet", + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=-1, + norm_cfg=dict(type="BN", requires_grad=False), + norm_eval=True, + style="pytorch", + init_cfg=dict(type="Pretrained", checkpoint="torchvision://resnet50"), + ), + panoptic_head=dict( + type="Mask2FormerHead", + in_channels=[256, 512, 1024, 2048], # pass to pixel_decoder inside + strides=[4, 8, 16, 32], + feat_channels=256, + out_channels=256, + num_things_classes=num_things_classes, + num_stuff_classes=num_stuff_classes, + num_queries=100, + num_transformer_feat_level=3, + pixel_decoder=dict( + type="MSDeformAttnPixelDecoder", + num_outs=3, + norm_cfg=dict(type="GN", num_groups=32), + act_cfg=dict(type="ReLU"), + encoder=dict( # DeformableDetrTransformerEncoder + num_layers=6, + layer_cfg=dict( # DeformableDetrTransformerEncoderLayer + self_attn_cfg=dict( # MultiScaleDeformableAttention + embed_dims=256, + num_heads=8, + num_levels=3, + num_points=4, + dropout=0.0, + batch_first=True, + 
), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=1024, + num_fcs=2, + ffn_drop=0.0, + act_cfg=dict(type="ReLU", inplace=True), + ), + ), + ), + positional_encoding=dict(num_feats=128, normalize=True), + ), + enforce_decoder_input_project=False, + positional_encoding=dict(num_feats=128, normalize=True), + transformer_decoder=dict( # Mask2FormerTransformerDecoder + return_intermediate=True, + num_layers=9, + layer_cfg=dict( # Mask2FormerTransformerDecoderLayer + self_attn_cfg=dict( # MultiheadAttention + embed_dims=256, num_heads=8, dropout=0.0, batch_first=True + ), + cross_attn_cfg=dict( # MultiheadAttention + embed_dims=256, num_heads=8, dropout=0.0, batch_first=True + ), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=2048, + num_fcs=2, + ffn_drop=0.0, + act_cfg=dict(type="ReLU", inplace=True), + ), + ), + init_cfg=None, + ), + loss_cls=dict( + type="CrossEntropyLoss", + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], + ), + loss_mask=dict( + type="CrossEntropyLoss", use_sigmoid=True, reduction="mean", loss_weight=5.0 + ), + loss_dice=dict( + type="DiceLoss", + use_sigmoid=True, + activate=True, + reduction="mean", + naive_dice=True, + eps=1.0, + loss_weight=5.0, + ), + ), + panoptic_fusion_head=dict( + type="MaskFormerFusionHead", + num_things_classes=num_things_classes, + num_stuff_classes=num_stuff_classes, + loss_panoptic=None, + init_cfg=None, + ), + train_cfg=dict( + num_points=12544, + oversample_ratio=3.0, + importance_sample_ratio=0.75, + assigner=dict( + type="HungarianAssigner", + match_costs=[ + dict(type="ClassificationCost", weight=2.0), + dict(type="CrossEntropyLossCost", weight=5.0, use_sigmoid=True), + dict(type="DiceCost", weight=5.0, pred_act=True, eps=1.0), + ], + ), + sampler=dict(type="MaskPseudoSampler"), + ), + test_cfg=dict( + panoptic_on=True, + # For now, the dataset does not support + # evaluating semantic segmentation metric. + semantic_on=False, + instance_on=True, + # max_per_image is for instance segmentation. + max_per_image=100, + iou_thr=0.8, + # In Mask2Former's panoptic postprocessing, + # it will filter mask area where score is less than 0.5 . 
+ filter_low_score=True, + ), + init_cfg=None, +) + +# dataset settings +data_root = "data/coco/" +train_pipeline = [ + dict( + type="LoadImageFromFile", to_float32=True, backend_args={{_base_.backend_args}} + ), + dict( + type="LoadPanopticAnnotations", + with_bbox=True, + with_mask=True, + with_seg=True, + backend_args={{_base_.backend_args}}, + ), + dict(type="RandomFlip", prob=0.5), + # large scale jittering + dict( + type="RandomResize", scale=image_size, ratio_range=(0.1, 2.0), keep_ratio=True + ), + dict( + type="RandomCrop", + crop_size=image_size, + crop_type="absolute", + recompute_bbox=True, + allow_negative_crop=True, + ), + dict(type="PackDetInputs"), +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) + +val_evaluator = [ + dict( + type="CocoPanopticMetric", + ann_file=data_root + "annotations/panoptic_val2017.json", + seg_prefix=data_root + "annotations/panoptic_val2017/", + backend_args={{_base_.backend_args}}, + ), + dict( + type="CocoMetric", + ann_file=data_root + "annotations/instances_val2017.json", + metric=["bbox", "segm"], + backend_args={{_base_.backend_args}}, + ), +] +test_evaluator = val_evaluator + +# optimizer +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + type="OptimWrapper", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "backbone": dict(lr_mult=0.1, decay_mult=1.0), + "query_embed": embed_multi, + "query_feat": embed_multi, + "level_embed": embed_multi, + }, + norm_decay_mult=0.0, + ), + clip_grad=dict(max_norm=0.01, norm_type=2), +) + +# learning policy +max_iters = 368750 +param_scheduler = dict( + type="MultiStepLR", + begin=0, + end=max_iters, + by_epoch=False, + milestones=[327778, 355092], + gamma=0.1, +) + +# Before 365001th iteration, we do evaluation every 5000 iterations. +# After 365000th iteration, we do evaluation every 368750 iterations, +# which means that we do evaluation at the end of training. +interval = 5000 +dynamic_intervals = [(max_iters // interval * interval + 1, max_iters)] +train_cfg = dict( + type="IterBasedTrainLoop", + max_iters=max_iters, + val_interval=interval, + dynamic_intervals=dynamic_intervals, +) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") + +default_hooks = dict( + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + save_last=True, + max_keep_ckpts=3, + interval=interval, + ) +) +log_processor = dict(type="LogProcessor", window_size=50, by_epoch=False) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (2 samples per GPU). 
+auto_scale_lr = dict(enable=False, base_batch_size=16) diff --git a/config/mmdet_anime-face_yolov3-v3.py b/config/mmdet_anime-face_yolov3-v3.py new file mode 100644 index 0000000..6499e9a --- /dev/null +++ b/config/mmdet_anime-face_yolov3-v3.py @@ -0,0 +1,150 @@ +# _base_ = ["../_base_/schedules/schedule_1x.py", "../_base_/default_runtime.py"] +# model settings +data_preprocessor = dict(type='DetDataPreprocessor', + mean=[0, 0, 0], + std=[255.0, 255.0, 255.0], + bgr_to_rgb=True, + pad_size_divisor=32) + +model = dict(type='YOLOV3', + data_preprocessor=data_preprocessor, + backbone=dict(type='Darknet', depth=53, out_indices=(3, 4, 5), + init_cfg=dict(type='Pretrained', checkpoint='open-mmlab://darknet53')), + neck=dict(type='YOLOV3Neck', + num_scales=3, + in_channels=[1024, 512, 256], + out_channels=[512, 256, 128]), + bbox_head=dict(type='YOLOV3Head', + num_classes=1, + in_channels=[512, 256, 128], + out_channels=[1024, 512, 256], + anchor_generator=dict(type='YOLOAnchorGenerator', + base_sizes=[[(116, 90), + (156, 198), + (373, 326)], + [(30, 61), + (62, 45), + (59, 119)], + [(10, 13), + (16, 30), + (33, 23)]], + strides=[32, 16, 8]), + bbox_coder=dict(type='YOLOBBoxCoder'), + featmap_strides=[32, 16, 8], + loss_cls=dict(type='CrossEntropyLoss', + use_sigmoid=True, loss_weight=1.0, reduction='sum'), + loss_conf=dict(type='CrossEntropyLoss', + use_sigmoid=True, loss_weight=1.0, reduction='sum'), + loss_xy=dict(type='CrossEntropyLoss', + use_sigmoid=True, loss_weight=2.0, reduction='sum'), + loss_wh=dict(type='MSELoss', loss_weight=2.0, reduction='sum')), + + # training and testing settings + train_cfg=dict( + assigner=dict(type='GridAssigner', + pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0)), + test_cfg=dict(nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + conf_thr=0.005, + nms=dict(type='nms', iou_threshold=0.45), + max_per_img=100)) + +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' + +# Example to use different file client +# Method 1: simply set the data root and let the file I/O module +# automatically infer from prefix (not support LMDB and Memcache yet) + +# data_root = 's3://openmmlab/datasets/detection/coco/' + +# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6 +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/detection/', +# 'data/': 's3://openmmlab/datasets/detection/' +# })) +backend_args = None + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='Expand', + mean=data_preprocessor['mean'], + to_rgb=data_preprocessor['bgr_to_rgb'], + ratio_range=(1, 2)), + dict(type='MinIoURandomCrop', + min_ious=(0.4, 0.5, 0.6, 0.7, 0.8, 0.9), + min_crop_size=0.3), + dict(type='RandomResize', scale=[(320, 320), (608, 608)], keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='PackDetInputs'), +] + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='Resize', scale=(608, 608), keep_ratio=True), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'scale_factor')), +] + +train_dataloader=dict( + batch_size=8, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + batch_sampler=dict(type='AspectRatioBatchSampler'), + dataset=dict(type=dataset_type, + data_root=data_root, 
+ ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline, + backend_args=backend_args)) + +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict(type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=test_pipeline, + backend_args=backend_args)) + +test_dataloader = val_dataloader + +val_evaluator = dict(type='CocoMetric', + ann_file=data_root + 'annotations/instances_val2017.json', + metric='bbox', + backend_args=backend_args) +test_evaluator = val_evaluator + +train_cfg = dict(max_epochs=273, val_interval=7) + +# optimizer +optim_wrapper = dict(type='OptimWrapper', + optimizer=dict(type='SGD', lr=0.001, momentum=0.9, weight_decay=0.0005), + clip_grad=dict(max_norm=35, norm_type=2)) + +# learning policy +param_scheduler = [ + dict(type='LinearLR', start_factor=0.1, by_epoch=False, begin=0, end=2000), + dict(type='MultiStepLR', by_epoch=True, milestones=[218, 246], gamma=0.1), +] + +default_hooks = dict(checkpoint=dict(type='CheckpointHook', interval=7)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (8 samples per GPU) +auto_scale_lr=dict(base_batch_size=64) diff --git a/config/mmdet_dd-person_mask2former-v3.py b/config/mmdet_dd-person_mask2former-v3.py new file mode 100644 index 0000000..375d45d --- /dev/null +++ b/config/mmdet_dd-person_mask2former-v3.py @@ -0,0 +1,105 @@ +_base_ = ['./mask2former_r50_8xb2-lsj-50e_coco-panoptic.py'] + +num_things_classes = 1 +num_stuff_classes = 0 +num_classes = num_things_classes + num_stuff_classes +image_size = (1024, 1024) +batch_augments = [ + dict( + type='BatchFixedSizePad', + size=image_size, + img_pad_value=0, + pad_mask=True, + mask_pad_value=0, + pad_seg=False, + ) +] +data_preprocessor = dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=32, + pad_mask=True, + mask_pad_value=0, + pad_seg=False, + batch_augments=batch_augments, +) +model = dict( + data_preprocessor=data_preprocessor, + panoptic_head=dict( + num_things_classes=num_things_classes, + num_stuff_classes=num_stuff_classes, + loss_cls=dict(class_weight=[1.0] * num_classes + [0.1]), + ), + panoptic_fusion_head=dict( + num_things_classes=num_things_classes, num_stuff_classes=num_stuff_classes + ), + test_cfg=dict(panoptic_on=False), +) + +# dataset settings +train_pipeline = [ + dict(type='LoadImageFromFile', to_float32=True, backend_args=None), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict(type='RandomFlip', prob=0.5), + # large scale jittering + dict( + type='RandomResize', + scale=image_size, + ratio_range=(0.1, 2.0), + resize_type='Resize', + keep_ratio=True, + ), + dict( + type='RandomCrop', + crop_size=image_size, + crop_type='absolute', + recompute_bbox=True, + allow_negative_crop=True, + ), + dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-5, 1e-5), by_mask=True), + dict(type='PackDetInputs'), +] + +test_pipeline = [ + dict(type='LoadImageFromFile', to_float32=True, backend_args=None), + dict(type='Resize', scale=(1333, 800), keep_ratio=True), + # If you don't have a gt annotation, delete the pipeline + dict(type='LoadAnnotations', with_bbox=True, 
with_mask=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'scale_factor'), + ), +] + +dataset_type = 'CocoDataset' +data_root = 'data/coco/' + +train_dataloader = dict( + dataset=dict( + type=dataset_type, + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + ) +) +val_dataloader = dict( + dataset=dict( + type=dataset_type, + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + pipeline=test_pipeline, + ) +) +test_dataloader = val_dataloader + +val_evaluator = dict( + _delete_=True, + type='CocoMetric', + ann_file=data_root + 'annotations/instances_val2017.json', + metric=['bbox', 'segm'], + format_only=False, + backend_args=None, +) +test_evaluator = val_evaluator From 131f182431b031764d27a2ac2973b87e0d78b811 Mon Sep 17 00:00:00 2001 From: Won-Kyu Park Date: Thu, 4 May 2023 12:32:58 +0900 Subject: [PATCH 05/11] update for mmdet v3 --- scripts/ddetailer.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/scripts/ddetailer.py b/scripts/ddetailer.py index 1fdfa1a..6534c88 100644 --- a/scripts/ddetailer.py +++ b/scripts/ddetailer.py @@ -58,17 +58,23 @@ def startup(): load_file_from_url("https://huggingface.co/dustysys/ddetailer/resolve/main/mmdet/segm/mmdet_dd-person_mask2former.pth", segm_path) import torch + legacy = torch.__version__.split(".")[0] < "2" print("Check config files...") config_dir = os.path.join(scripts.basedir(), "config") - configs = [ "mmdet_anime-face_yolov3.py", "mmdet_dd-person_mask2former.py" ] + if legacy: + configs = [ "mmdet_anime-face_yolov3.py", "mmdet_dd-person_mask2former.py" ] + else: + configs = [ "mmdet_anime-face_yolov3-v3.py", "mmdet_dd-person_mask2former-v3.py", "mask2former_r50_8xb2-lsj-50e_coco-panoptic.py", "coco_panoptic.py" ] destdir = bbox_path for confpy in configs: conf = os.path.join(config_dir, confpy) + if not legacy: + confpy = confpy.replace("-v3.py", ".py") dest = os.path.join(destdir, confpy) if not os.path.exists(dest): print(f"Copy config file: {confpy}..") - shutil.copy(conf, destdir) + shutil.copy(conf, dest) destdir = segm_path print("Done") From ce29b9a93af6cb4e911563c5286df38f0ce842d3 Mon Sep 17 00:00:00 2001 From: Won-Kyu Park Date: Thu, 4 May 2023 14:21:17 +0900 Subject: [PATCH 06/11] check the pytorch version and conditially install mmdet --- scripts/ddetailer.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/scripts/ddetailer.py b/scripts/ddetailer.py index 6534c88..64c94b5 100644 --- a/scripts/ddetailer.py +++ b/scripts/ddetailer.py @@ -44,11 +44,17 @@ def modeltitle(path, shorthash): def startup(): from launch import is_installed, run + import torch + legacy = torch.__version__.split(".")[0] < "2" if not is_installed("mmdet"): python = sys.executable run(f'"{python}" -m pip install -U openmim', desc="Installing openmim", errdesc="Couldn't install openmim") - run(f'"{python}" -m mim install mmcv-full', desc=f"Installing mmcv-full", errdesc=f"Couldn't install mmcv-full") - run(f'"{python}" -m pip install mmdet', desc=f"Installing mmdet", errdesc=f"Couldn't install mmdet") + if legacy: + run(f'"{python}" -m mim install mmcv-full', desc=f"Installing mmcv-full", errdesc=f"Couldn't install mmcv-full") + run(f'"{python}" -m pip install mmdet==2.28.2', desc=f"Installing mmdet", errdesc=f"Couldn't install mmdet") + else: + run(f'"{python}" -m mim install mmcv>==2.0.0', desc=f"Installing mmcv", errdesc=f"Couldn't install mmcv") + 
run(f'"{python}" -m pip install mmdet', desc=f"Installing mmdet", errdesc=f"Couldn't install mmdet") bbox_path = os.path.join(dd_models_path, "bbox") segm_path = os.path.join(dd_models_path, "segm") @@ -57,8 +63,6 @@ def startup(): load_file_from_url("https://huggingface.co/dustysys/ddetailer/resolve/main/mmdet/bbox/mmdet_anime-face_yolov3.pth", bbox_path) load_file_from_url("https://huggingface.co/dustysys/ddetailer/resolve/main/mmdet/segm/mmdet_dd-person_mask2former.pth", segm_path) - import torch - legacy = torch.__version__.split(".")[0] < "2" print("Check config files...") config_dir = os.path.join(scripts.basedir(), "config") if legacy: From afeb6aa4203d08bc5fb90bd6926e2f45053eb895 Mon Sep 17 00:00:00 2001 From: Won-Kyu Park Date: Fri, 5 May 2023 00:04:34 +0900 Subject: [PATCH 07/11] specify the mmdet version --- scripts/ddetailer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/ddetailer.py b/scripts/ddetailer.py index 64c94b5..b21b7d6 100644 --- a/scripts/ddetailer.py +++ b/scripts/ddetailer.py @@ -54,7 +54,7 @@ def startup(): run(f'"{python}" -m pip install mmdet==2.28.2', desc=f"Installing mmdet", errdesc=f"Couldn't install mmdet") else: run(f'"{python}" -m mim install mmcv>==2.0.0', desc=f"Installing mmcv", errdesc=f"Couldn't install mmcv") - run(f'"{python}" -m pip install mmdet', desc=f"Installing mmdet", errdesc=f"Couldn't install mmdet") + run(f'"{python}" -m pip install mmdet>=3', desc=f"Installing mmdet", errdesc=f"Couldn't install mmdet") bbox_path = os.path.join(dd_models_path, "bbox") segm_path = os.path.join(dd_models_path, "segm") From 754c0b3601d53a7f883aedecfd6fa4ebfbf339b2 Mon Sep 17 00:00:00 2001 From: Won-Kyu Park Date: Fri, 5 May 2023 00:05:08 +0900 Subject: [PATCH 08/11] update num_things_classes=80 for the new model file --- config/mask2former_r50_8xb2-lsj-50e_coco-panoptic.py | 2 +- config/mmdet_dd-person_mask2former-v3.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/config/mask2former_r50_8xb2-lsj-50e_coco-panoptic.py b/config/mask2former_r50_8xb2-lsj-50e_coco-panoptic.py index b67d9b0..882ef7b 100644 --- a/config/mask2former_r50_8xb2-lsj-50e_coco-panoptic.py +++ b/config/mask2former_r50_8xb2-lsj-50e_coco-panoptic.py @@ -24,7 +24,7 @@ batch_augments=batch_augments, ) -num_things_classes = 1 +num_things_classes = 80 num_stuff_classes = 0 num_classes = num_things_classes + num_stuff_classes model = dict( diff --git a/config/mmdet_dd-person_mask2former-v3.py b/config/mmdet_dd-person_mask2former-v3.py index 375d45d..ab1cfc3 100644 --- a/config/mmdet_dd-person_mask2former-v3.py +++ b/config/mmdet_dd-person_mask2former-v3.py @@ -1,6 +1,6 @@ _base_ = ['./mask2former_r50_8xb2-lsj-50e_coco-panoptic.py'] -num_things_classes = 1 +num_things_classes = 80 num_stuff_classes = 0 num_classes = num_things_classes + num_stuff_classes image_size = (1024, 1024) From 9238cf84d423eb1709db0a23f004d4e6d5f6f883 Mon Sep 17 00:00:00 2001 From: Won-Kyu Park Date: Fri, 5 May 2023 00:12:52 +0900 Subject: [PATCH 09/11] download the segm model file from openmmlab.com for mmdet v3 Please see https://github.com/open-mmlab/mmdetection/tree/main/configs/mask2former#instance-segmentation for R-50 backbone --- scripts/ddetailer.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/scripts/ddetailer.py b/scripts/ddetailer.py index b21b7d6..a58c736 100644 --- a/scripts/ddetailer.py +++ b/scripts/ddetailer.py @@ -61,7 +61,13 @@ def startup(): if (len(list_models(dd_models_path)) == 0): print("No detection models 
found, downloading...") load_file_from_url("https://huggingface.co/dustysys/ddetailer/resolve/main/mmdet/bbox/mmdet_anime-face_yolov3.pth", bbox_path) - load_file_from_url("https://huggingface.co/dustysys/ddetailer/resolve/main/mmdet/segm/mmdet_dd-person_mask2former.pth", segm_path) + if legacy: + load_file_from_url("https://huggingface.co/dustysys/ddetailer/resolve/main/mmdet/segm/mmdet_dd-person_mask2former.pth", segm_path) + else: + load_file_from_url( + "https://download.openmmlab.com/mmdetection/v3.0/mask2former/mask2former_r50_8xb2-lsj-50e_coco/mask2former_r50_8xb2-lsj-50e_coco_20220506_191028-41b088b6.pth", + segm_path, + file_name="mmdet_dd-person_mask2former.pth") print("Check config files...") config_dir = os.path.join(scripts.basedir(), "config") From b8ec582900a8558fe7d5613ebdef0152ab1ac351 Mon Sep 17 00:00:00 2001 From: Won-Kyu Park Date: Fri, 28 Apr 2023 12:29:11 +0900 Subject: [PATCH 10/11] make postprocess() extension --- scripts/ddetailer.py | 254 +++++++++++++++++++++++++------------------ 1 file changed, 151 insertions(+), 103 deletions(-) diff --git a/scripts/ddetailer.py b/scripts/ddetailer.py index a58c736..107e7af 100644 --- a/scripts/ddetailer.py +++ b/scripts/ddetailer.py @@ -6,6 +6,7 @@ import gradio as gr import shutil +from copy import copy from modules import processing, images from modules import scripts, script_callbacks, shared, devices, modelloader from modules.processing import Processed, StableDiffusionProcessingTxt2Img, StableDiffusionProcessingImg2Img @@ -95,100 +96,132 @@ def gr_show(visible=True): return {"visible": visible, "__type__": "update"} class DetectionDetailerScript(scripts.Script): + def __init__(self): + super().__init__() + def title(self): return "Detection Detailer" def show(self, is_img2img): - return True + return scripts.AlwaysVisible def ui(self, is_img2img): import modules.ui - model_list = list_models(dd_models_path) - model_list.insert(0, "None") - if is_img2img: - info = gr.HTML("
<p style=\"margin-bottom:0.75em\">Recommended settings: Use from inpaint tab, inpaint at full res ON, denoise <0.5</p>
") - else: - info = gr.HTML("") - with gr.Group(): - with gr.Row(): - dd_model_a = gr.Dropdown(label="Primary detection model (A)", choices=model_list,value = "None", visible=True, type="value") - + with gr.Accordion("Detection Detailer", open=False): with gr.Row(): - dd_conf_a = gr.Slider(label='Detection confidence threshold % (A)', minimum=0, maximum=100, step=1, value=30, visible=False) - dd_dilation_factor_a = gr.Slider(label='Dilation factor (A)', minimum=0, maximum=255, step=1, value=4, visible=False) + enabled = gr.Checkbox(label="Enable", value=False, visible=True) - with gr.Row(): - dd_offset_x_a = gr.Slider(label='X offset (A)', minimum=-200, maximum=200, step=1, value=0, visible=False) - dd_offset_y_a = gr.Slider(label='Y offset (A)', minimum=-200, maximum=200, step=1, value=0, visible=False) - - with gr.Row(): - dd_preprocess_b = gr.Checkbox(label='Inpaint model B detections before model A runs', value=False, visible=False) - dd_bitwise_op = gr.Radio(label='Bitwise operation', choices=['None', 'A&B', 'A-B'], value="None", visible=False) - - br = gr.HTML("
") + model_list = list_models(dd_models_path) + model_list.insert(0, "None") + if is_img2img: + info = gr.HTML("
<p style=\"margin-bottom:0.75em\">Recommended settings: Use from inpaint tab, inpaint at full res ON, denoise <0.5</p>
") + else: + info = gr.HTML("") + with gr.Group(): + with gr.Row(): + dd_model_a = gr.Dropdown(label="Primary detection model (A)", choices=model_list,value = "None", visible=True, type="value") + + with gr.Row(): + dd_conf_a = gr.Slider(label='Detection confidence threshold % (A)', minimum=0, maximum=100, step=1, value=30, visible=False) + dd_dilation_factor_a = gr.Slider(label='Dilation factor (A)', minimum=0, maximum=255, step=1, value=4, visible=False) + + with gr.Row(): + dd_offset_x_a = gr.Slider(label='X offset (A)', minimum=-200, maximum=200, step=1, value=0, visible=False) + dd_offset_y_a = gr.Slider(label='Y offset (A)', minimum=-200, maximum=200, step=1, value=0, visible=False) + + with gr.Row(): + dd_preprocess_b = gr.Checkbox(label='Inpaint model B detections before model A runs', value=False, visible=False) + dd_bitwise_op = gr.Radio(label='Bitwise operation', choices=['None', 'A&B', 'A-B'], value="None", visible=False) + + br = gr.HTML("
") + + with gr.Group(): + with gr.Row(): + dd_model_b = gr.Dropdown(label="Secondary detection model (B) (optional)", choices=model_list,value = "None", visible =False, type="value") + + with gr.Row(): + dd_conf_b = gr.Slider(label='Detection confidence threshold % (B)', minimum=0, maximum=100, step=1, value=30, visible=False) + dd_dilation_factor_b = gr.Slider(label='Dilation factor (B)', minimum=0, maximum=255, step=1, value=4, visible=False) + + with gr.Row(): + dd_offset_x_b = gr.Slider(label='X offset (B)', minimum=-200, maximum=200, step=1, value=0, visible=False) + dd_offset_y_b = gr.Slider(label='Y offset (B)', minimum=-200, maximum=200, step=1, value=0, visible=False) + + with gr.Group(): + with gr.Row(): + dd_mask_blur = gr.Slider(label='Mask blur ', minimum=0, maximum=64, step=1, value=4, visible=(not is_img2img)) + dd_denoising_strength = gr.Slider(label='Denoising strength (Inpaint)', minimum=0.0, maximum=1.0, step=0.01, value=0.4, visible=(not is_img2img)) + + with gr.Row(): + dd_inpaint_full_res = gr.Checkbox(label='Inpaint at full resolution ', value=True, visible = (not is_img2img)) + dd_inpaint_full_res_padding = gr.Slider(label='Inpaint at full resolution padding, pixels ', minimum=0, maximum=256, step=4, value=32, visible=(not is_img2img)) + + dd_model_a.change( + lambda modelname: { + dd_model_b:gr_show( modelname != "None" ), + dd_conf_a:gr_show( modelname != "None" ), + dd_dilation_factor_a:gr_show( modelname != "None"), + dd_offset_x_a:gr_show( modelname != "None" ), + dd_offset_y_a:gr_show( modelname != "None" ) + + }, + inputs= [dd_model_a], + outputs =[dd_model_b, dd_conf_a, dd_dilation_factor_a, dd_offset_x_a, dd_offset_y_a] + ) + + dd_model_b.change( + lambda modelname: { + dd_preprocess_b:gr_show( modelname != "None" ), + dd_bitwise_op:gr_show( modelname != "None" ), + dd_conf_b:gr_show( modelname != "None" ), + dd_dilation_factor_b:gr_show( modelname != "None"), + dd_offset_x_b:gr_show( modelname != "None" ), + dd_offset_y_b:gr_show( modelname != "None" ) + }, + inputs= [dd_model_b], + outputs =[dd_preprocess_b, dd_bitwise_op, dd_conf_b, dd_dilation_factor_b, dd_offset_x_b, dd_offset_y_b] + ) + + return [info, enabled, + dd_model_a, + dd_conf_a, dd_dilation_factor_a, + dd_offset_x_a, dd_offset_y_a, + dd_preprocess_b, dd_bitwise_op, + br, + dd_model_b, + dd_conf_b, dd_dilation_factor_b, + dd_offset_x_b, dd_offset_y_b, + dd_mask_blur, dd_denoising_strength, + dd_inpaint_full_res, dd_inpaint_full_res_padding + ] + + def get_seed(self, p) -> tuple[int, int]: + i = p.iteration + + if not p.all_seeds: + seed = p.seed + elif i < len(p.all_seeds): + seed = p.all_seeds[i] + else: + j = i % len(p.all_seeds) + seed = p.all_seeds[j] - with gr.Group(): - with gr.Row(): - dd_model_b = gr.Dropdown(label="Secondary detection model (B) (optional)", choices=model_list,value = "None", visible =False, type="value") + if not p.all_subseeds: + subseed = p.subseed + elif i < len(p.all_subseeds): + subseed = p.all_subseeds[i] + else: + j = i % len(p.all_subseeds) + subseed = p.all_subseeds[j] - with gr.Row(): - dd_conf_b = gr.Slider(label='Detection confidence threshold % (B)', minimum=0, maximum=100, step=1, value=30, visible=False) - dd_dilation_factor_b = gr.Slider(label='Dilation factor (B)', minimum=0, maximum=255, step=1, value=4, visible=False) - - with gr.Row(): - dd_offset_x_b = gr.Slider(label='X offset (B)', minimum=-200, maximum=200, step=1, value=0, visible=False) - dd_offset_y_b = gr.Slider(label='Y offset (B)', minimum=-200, maximum=200, step=1, value=0, 
visible=False) - - with gr.Group(): - with gr.Row(): - dd_mask_blur = gr.Slider(label='Mask blur ', minimum=0, maximum=64, step=1, value=4, visible=(not is_img2img)) - dd_denoising_strength = gr.Slider(label='Denoising strength (Inpaint)', minimum=0.0, maximum=1.0, step=0.01, value=0.4, visible=(not is_img2img)) - - with gr.Row(): - dd_inpaint_full_res = gr.Checkbox(label='Inpaint at full resolution ', value=True, visible = (not is_img2img)) - dd_inpaint_full_res_padding = gr.Slider(label='Inpaint at full resolution padding, pixels ', minimum=0, maximum=256, step=4, value=32, visible=(not is_img2img)) - - dd_model_a.change( - lambda modelname: { - dd_model_b:gr_show( modelname != "None" ), - dd_conf_a:gr_show( modelname != "None" ), - dd_dilation_factor_a:gr_show( modelname != "None"), - dd_offset_x_a:gr_show( modelname != "None" ), - dd_offset_y_a:gr_show( modelname != "None" ) - - }, - inputs= [dd_model_a], - outputs =[dd_model_b, dd_conf_a, dd_dilation_factor_a, dd_offset_x_a, dd_offset_y_a] - ) - - dd_model_b.change( - lambda modelname: { - dd_preprocess_b:gr_show( modelname != "None" ), - dd_bitwise_op:gr_show( modelname != "None" ), - dd_conf_b:gr_show( modelname != "None" ), - dd_dilation_factor_b:gr_show( modelname != "None"), - dd_offset_x_b:gr_show( modelname != "None" ), - dd_offset_y_b:gr_show( modelname != "None" ) - }, - inputs= [dd_model_b], - outputs =[dd_preprocess_b, dd_bitwise_op, dd_conf_b, dd_dilation_factor_b, dd_offset_x_b, dd_offset_y_b] - ) - - return [info, - dd_model_a, - dd_conf_a, dd_dilation_factor_a, - dd_offset_x_a, dd_offset_y_a, - dd_preprocess_b, dd_bitwise_op, - br, - dd_model_b, - dd_conf_b, dd_dilation_factor_b, - dd_offset_x_b, dd_offset_y_b, - dd_mask_blur, dd_denoising_strength, - dd_inpaint_full_res, dd_inpaint_full_res_padding - ] + return seed, subseed + + def process(self, p, *args): + if getattr(p, "_disable_ddetailer", False): + return - def run(self, p, info, + def postprocess_image(self, p, pp, info, enabled, dd_model_a, dd_conf_a, dd_dilation_factor_a, dd_offset_x_a, dd_offset_y_a, @@ -200,21 +233,27 @@ def run(self, p, info, dd_mask_blur, dd_denoising_strength, dd_inpaint_full_res, dd_inpaint_full_res_padding): - processing.fix_seed(p) + if getattr(p, "_disable_ddetailer", False): + return + + if not enabled: + return + initial_info = None - seed = p.seed - p.batch_size = 1 - ddetail_count = p.n_iter - p.n_iter = 1 - p.do_not_save_grid = True - p.do_not_save_samples = True + seed, subseed = self.get_seed(p) + p.seed = seed + p.subseed = subseed + + info = "" + ddetail_count = 1 + is_txt2img = isinstance(p, StableDiffusionProcessingTxt2Img) + p_txt = copy(p) if (not is_txt2img): orig_image = p.init_images[0] else: - p_txt = p p = StableDiffusionProcessingImg2Img( - init_images = None, + init_images = [pp.image], resize_mode = 0, denoising_strength = dd_denoising_strength, mask = None, @@ -235,7 +274,8 @@ def run(self, p, info, seed_resize_from_h=p_txt.seed_resize_from_h, seed_resize_from_w=p_txt.seed_resize_from_w, sampler_name=p_txt.sampler_name, - n_iter=p_txt.n_iter, + batch_size=1, + n_iter=1, steps=p_txt.steps, cfg_scale=p_txt.cfg_scale, width=p_txt.width, @@ -244,16 +284,18 @@ def run(self, p, info, ) p.do_not_save_grid = True p.do_not_save_samples = True + + p._disable_ddetailer = True + output_images = [] state.job_count = ddetail_count for n in range(ddetail_count): devices.torch_gc() start_seed = seed + n if ( is_txt2img ): - print(f"Processing initial image for output generation {n + 1}.") - p_txt.seed = start_seed - processed = 
processing.process_images(p_txt) - init_image = processed.images[0] + print(f"Prepare initial image for output generation {p_txt.iteration + 1}.") + init_image = copy(pp.image) + info = processing.create_infotext(p_txt, p_txt.all_prompts, p_txt.all_seeds, p_txt.all_subseeds, None, 0, 0) else: init_image = orig_image @@ -276,7 +318,7 @@ def run(self, p, info, images.save_image(segmask_preview_b, opts.outdir_ddetailer_previews, "", start_seed, p.prompt, opts.samples_format, p=p) gen_count = len(masks_b_pre) state.job_count += gen_count - print(f"Processing {gen_count} model {label_b_pre} detections for output generation {n + 1}.") + print(f"Processing {gen_count} model {label_b_pre} detections for output generation {p_txt.iteration + 1}.") p.seed = start_seed p.init_images = [init_image] @@ -285,7 +327,12 @@ def run(self, p, info, if ( opts.dd_save_masks): images.save_image(masks_b_pre[i], opts.outdir_ddetailer_masks, "", start_seed, p.prompt, opts.samples_format, p=p) processed = processing.process_images(p) + + if not is_txt2img: + p.prompt = processed.all_prompts[0] + info = processed.info p.seed = processed.seed + 1 + p.subseed = processed.subseed + 1 p.init_images = processed.images if (gen_count > 0): @@ -293,7 +340,7 @@ def run(self, p, info, init_image = processed.images[0] else: - print(f"No model B detections for output generation {n} with current settings.") + print(f"No model B detections for output generation {p_txt.iteration + 1} with current settings.") # Primary run if (dd_model_a != "None"): @@ -335,7 +382,7 @@ def run(self, p, info, images.save_image(segmask_preview_a, opts.outdir_ddetailer_previews, "", start_seed, p.prompt, opts.samples_format, p=p) gen_count = len(masks_a) state.job_count += gen_count - print(f"Processing {gen_count} model {label_a} detections for output generation {n + 1}.") + print(f"Processing {gen_count} model {label_a} detections for output generation {p_txt.iteration + 1}.") p.seed = start_seed p.init_images = [init_image] @@ -347,21 +394,22 @@ def run(self, p, info, processed = processing.process_images(p) if initial_info is None: initial_info = processed.info + info = processed.info p.seed = processed.seed + 1 + p.subseed = processed.subseed + 1 p.init_images = processed.images if (gen_count > 0): output_images[n] = processed.images[0] - if ( opts.samples_save ): - images.save_image(processed.images[0], p.outpath_samples, "", start_seed, p.prompt, opts.samples_format, info=initial_info, p=p) else: - print(f"No model {label_a} detections for output generation {n} with current settings.") - state.job = f"Generation {n + 1} out of {state.job_count}" + print(f"No model {label_a} detections for output generation {p_txt.iteration + 1} with current settings.") + state.job = f"Generation {p_txt.iteration + 1} out of {state.job_count}" if (initial_info is None): initial_info = "No detections found." - return Processed(p, output_images, seed, initial_info) + if len(output_images) > 0: + pp.image = output_images[0] def modeldataset(model_shortname): path = modelpath(model_shortname) From d1a9108e3ca76be8b41965afd55f54b3fe3a992a Mon Sep 17 00:00:00 2001 From: Won-Kyu Park Date: Sat, 20 May 2023 20:37:22 +0900 Subject: [PATCH 11/11] no need to check is_txt2img. 
cleanup --- scripts/ddetailer.py | 82 +++++++++++++++++++------------------------- 1 file changed, 36 insertions(+), 46 deletions(-) diff --git a/scripts/ddetailer.py b/scripts/ddetailer.py index 107e7af..bf61cff 100644 --- a/scripts/ddetailer.py +++ b/scripts/ddetailer.py @@ -247,43 +247,40 @@ def postprocess_image(self, p, pp, info, enabled, info = "" ddetail_count = 1 - is_txt2img = isinstance(p, StableDiffusionProcessingTxt2Img) p_txt = copy(p) - if (not is_txt2img): - orig_image = p.init_images[0] - else: - p = StableDiffusionProcessingImg2Img( - init_images = [pp.image], - resize_mode = 0, - denoising_strength = dd_denoising_strength, - mask = None, - mask_blur= dd_mask_blur, - inpainting_fill = 1, - inpaint_full_res = dd_inpaint_full_res, - inpaint_full_res_padding= dd_inpaint_full_res_padding, - inpainting_mask_invert= 0, - sd_model=p_txt.sd_model, - outpath_samples=p_txt.outpath_samples, - outpath_grids=p_txt.outpath_grids, - prompt=p_txt.prompt, - negative_prompt=p_txt.negative_prompt, - styles=p_txt.styles, - seed=p_txt.seed, - subseed=p_txt.subseed, - subseed_strength=p_txt.subseed_strength, - seed_resize_from_h=p_txt.seed_resize_from_h, - seed_resize_from_w=p_txt.seed_resize_from_w, - sampler_name=p_txt.sampler_name, - batch_size=1, - n_iter=1, - steps=p_txt.steps, - cfg_scale=p_txt.cfg_scale, - width=p_txt.width, - height=p_txt.height, - tiling=p_txt.tiling, - ) - p.do_not_save_grid = True - p.do_not_save_samples = True + + p = StableDiffusionProcessingImg2Img( + init_images = [pp.image], + resize_mode = 0, + denoising_strength = dd_denoising_strength, + mask = None, + mask_blur= dd_mask_blur, + inpainting_fill = 1, + inpaint_full_res = dd_inpaint_full_res, + inpaint_full_res_padding= dd_inpaint_full_res_padding, + inpainting_mask_invert= 0, + sd_model=p_txt.sd_model, + outpath_samples=p_txt.outpath_samples, + outpath_grids=p_txt.outpath_grids, + prompt=p_txt.prompt, + negative_prompt=p_txt.negative_prompt, + styles=p_txt.styles, + seed=p_txt.seed, + subseed=p_txt.subseed, + subseed_strength=p_txt.subseed_strength, + seed_resize_from_h=p_txt.seed_resize_from_h, + seed_resize_from_w=p_txt.seed_resize_from_w, + sampler_name=p_txt.sampler_name, + batch_size=1, + n_iter=1, + steps=p_txt.steps, + cfg_scale=p_txt.cfg_scale, + width=p_txt.width, + height=p_txt.height, + tiling=p_txt.tiling, + ) + p.do_not_save_grid = True + p.do_not_save_samples = True p._disable_ddetailer = True @@ -292,13 +289,9 @@ def postprocess_image(self, p, pp, info, enabled, for n in range(ddetail_count): devices.torch_gc() start_seed = seed + n - if ( is_txt2img ): - print(f"Prepare initial image for output generation {p_txt.iteration + 1}.") - init_image = copy(pp.image) - info = processing.create_infotext(p_txt, p_txt.all_prompts, p_txt.all_seeds, p_txt.all_subseeds, None, 0, 0) - else: - init_image = orig_image - + init_image = copy(pp.image) + info = processing.create_infotext(p_txt, p_txt.all_prompts, p_txt.all_seeds, p_txt.all_subseeds, None, 0, 0) + output_images.append(init_image) masks_a = [] masks_b_pre = [] @@ -328,9 +321,6 @@ def postprocess_image(self, p, pp, info, enabled, images.save_image(masks_b_pre[i], opts.outdir_ddetailer_masks, "", start_seed, p.prompt, opts.samples_format, p=p) processed = processing.process_images(p) - if not is_txt2img: - p.prompt = processed.all_prompts[0] - info = processed.info p.seed = processed.seed + 1 p.subseed = processed.subseed + 1 p.init_images = processed.images
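
Note on the compatibility scheme this series converges on: a single
mmcv_legacy flag selects between the mmdet 2.x stack (mmcv-full,
mmdet==2.28.2, get_classes in mmdet.core) and the mmdet 3.x stack
(mmcv>=2.0.0, mmdet>=3, get_classes in mmdet.evaluation), and both
inference APIs are normalized to the same [labels, bboxes, segms, scores]
result layout. The sketch below distills that idea outside the extension;
detect_bboxes and its arguments are illustrative names for this note, not
code from the patches, and it pins the model to the CPU so result tensors
convert with .numpy() directly.

    import numpy as np

    try:
        # mmdet 2.x (with mmcv-full): mmdet.core exists only in the 2.x API
        from mmdet.core import get_classes
        from mmdet.apis import inference_detector, init_detector
        mmcv_legacy = True
    except ImportError:
        # mmdet 3.x (with mmcv >= 2.0.0): get_classes moved to mmdet.evaluation
        from mmdet.evaluation import get_classes
        from mmdet.apis import inference_detector, init_detector
        mmcv_legacy = False

    def detect_bboxes(image, config_py, checkpoint_pth, conf_thres=0.3):
        """Run a bbox-only model; return (names, bboxes, scores) on either API."""
        model = init_detector(config_py, checkpoint_pth, device="cpu")
        raw = inference_detector(model, np.array(image))
        classes = get_classes("coco")  # label index -> class-name lookup
        if mmcv_legacy:
            # v2 result: one (N, 5) array per class, columns x0, y0, x1, y1, score
            labels = np.concatenate(
                [np.full(b.shape[0], i, dtype=np.int32) for i, b in enumerate(raw)])
            stacked = np.vstack(raw)
            bboxes, scores = stacked[:, :4], stacked[:, 4]
        else:
            # v3 result: a DetDataSample whose predictions live in .pred_instances
            inst = raw.pred_instances
            labels = inst.labels.numpy()
            bboxes = inst.bboxes.numpy()
            scores = inst.scores.numpy()
        keep = scores > conf_thres
        return [classes[i] for i in labels[keep]], bboxes[keep], scores[keep]

The same flag drives installation (patch 06: mim install mmcv-full plus
mmdet==2.28.2 on the legacy path, mmcv>=2.0.0 plus mmdet>=3 otherwise) and
which config variant (the *-v3.py files from patch 04) gets copied next to
the downloaded .pth checkpoints.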