diff --git a/configs/deformable.py b/configs/deformable.py deleted file mode 100644 index 51e158b..0000000 --- a/configs/deformable.py +++ /dev/null @@ -1,154 +0,0 @@ -_base_ = [ - './thumos14.py' -] -custom_imports = dict(imports=['my_modules'], allow_failed_imports=False) - -# TadTR (based on DeFormableDETR) setting: (DINO, TadTR) -enc_layers = 4 # 6, 4 -dec_layers = 4 # 6, 4 -dim_feedforward = 1024 # 2048, 1024 -dropout = 0.1 # 0.0, 0.1 -temperature = 10000 # 20, 10000 - -act_loss_coef = 4 # NA, 4 -cls_loss_coef = 2 # 1.0, 2.0 -seg_loss_coef = 5 # 5.0, 5.0 -iou_loss_coef = 2 # 2.0, 2.0 - -max_per_img = 100 # 300, 100 -lr = 0.0002 # 1e-4, 2e-4 - -# model setting -model = dict( - type='DeformableDETR', - num_queries=40, # num_matching_queries, should be smaller than the window size - with_box_refine=True, - as_two_stage=True, - num_feature_levels=1, - data_preprocessor=dict(type='DetDataPreprocessor'), - backbone=dict(type='PseudoBackbone'), # No backbone since we use pre-extracted features. - neck=dict( - type='ChannelMapper', - in_channels=[2048], - kernel_size=1, - out_channels=256, - act_cfg=None, - norm_cfg=dict(type='GN', num_groups=32), - num_outs=1), - encoder=dict( - num_layers=enc_layers, # 6 for DeformableDETR - layer_cfg=dict( - self_attn_cfg=dict( - num_levels=1, - embed_dims=256, - batch_first=True), - ffn_cfg=dict( - embed_dims=256, - feedforward_channels=dim_feedforward, - ffn_drop=dropout))), - decoder=dict( - num_layers=dec_layers, # 6 for DeformableDETR - return_intermediate=True, - layer_cfg=dict( - self_attn_cfg=dict( - embed_dims=256, - num_heads=8, - dropout=dropout, - batch_first=True), - cross_attn_cfg=dict( - embed_dims=256, - num_levels=1, - batch_first=True), - ffn_cfg=dict( - embed_dims=256, - feedforward_channels=dim_feedforward, - ffn_drop=dropout)), - post_norm_cfg=None), - positional_encoding=dict(num_feats=128, normalize=True, offset=-0.5, temperature=temperature), - bbox_head=dict( - type='DeformableDETRHead', - num_classes=20, - sync_cls_avg_factor=True, - loss_cls=dict( - type='FocalLoss', - use_sigmoid=True, - gamma=2.0, - alpha=0.25, - loss_weight=cls_loss_coef), - loss_bbox=dict(type='CustomL1Loss', loss_weight=seg_loss_coef), # customized to ignore y1, y2 - loss_iou=dict(type='CustomGIoULoss', loss_weight=iou_loss_coef)), # customized to ignore y1, y2 - # training and testing settings - train_cfg=dict( - assigner=dict( - type='HungarianAssigner', - match_costs=[ - dict(type='FocalLossCost', weight=2.0), - dict(type='CustomBBoxL1Cost', weight=5.0, box_format='xywh'), # customized to ignore y1, y2 - dict(type='CustomIoUCost', iou_mode='giou', weight=2.0) # customized to ignore y1, y2 - ])), - test_cfg=dict(max_per_img=max_per_img)) - -# train_pipeline, NOTE the img_scale and the Pad's size_divisor is different -# from the default setting in mmdet. - -# optimizer -optim_wrapper = dict( - type='OptimWrapper', - optimizer=dict( - type='AdamW', - lr=lr, - weight_decay=0.0001), - clip_grad=dict(max_norm=0.1, norm_type=2), - paramwise_cfg=dict(custom_keys={'backbone': dict(lr_mult=0.1), - 'sampling_offsets': dict(lr_mult=0.1), - 'reference_points': dict(lr_mult=0.1) - })) - -# learning policy -# TadTR uses 30 epochs, but since we use random sliding windows rather than fixed overlapping windows, -# we should increase the number of epochs to maximize utilization of the video content. 
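# NOTE (editor): `CustomL1Loss` / `CustomGIoULoss` above live in `my_modules`
# and are not shown in this diff. A minimal sketch of the idea they implement --
# masking the y-components of the (x1, y1, x2, y2) pseudo-boxes so that only
# the temporal axis contributes to the regression loss. The mask-based code
# below is an illustrative assumption, not the repo's implementation.
import torch

def l1_loss_ignore_y(pred: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
    """L1 loss over (x1, y1, x2, y2) boxes with the y terms zeroed out."""
    mask = pred.new_tensor([1.0, 0.0, 1.0, 0.0])  # keep x1/x2, drop y1/y2
    return ((pred - target).abs() * mask).sum(-1).mean()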
-max_epochs = 16 # 16 for TadTR -train_cfg = dict( - type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=1) # 1 for TadTR - -val_cfg = dict(type='ValLoop') -test_cfg = dict(type='TestLoop') - -param_scheduler = [ - dict( - type='MultiStepLR', - begin=0, - end=max_epochs, - by_epoch=True, - milestones=[14], # 14 for TadTR - gamma=0.1) -] - -# NOTE: `auto_scale_lr` is for automatically scaling LR, -# USER SHOULD NOT CHANGE ITS VALUES. -# base_batch_size = (8 GPUs) x (2 samples per GPU) -auto_scale_lr = dict(base_batch_size=16) - -default_scope = 'mmdet' -default_hooks = dict( - timer=dict(type='IterTimerHook'), - logger=dict(type='LoggerHook', interval=10), - param_scheduler=dict(type='ParamSchedulerHook'), - checkpoint=dict(type='CheckpointHook', interval=10), - sampler_seed=dict(type='DistSamplerSeedHook'), - visualization=dict(type='DetVisualizationHook')) - -env_cfg = dict( - cudnn_benchmark=False, - mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), - dist_cfg=dict(backend='nccl'), -) - -vis_backends = [dict(type='LocalVisBackend'), dict(type='TensorboardVisBackend')] -visualizer = dict( - type='DetLocalVisualizer', vis_backends=vis_backends, name='visualizer') -log_processor = dict(type='LogProcessor', window_size=50, by_epoch=True) - -log_level = 'INFO' -load_from = None -resume = False diff --git a/configs/dino.py b/configs/dino.py deleted file mode 100644 index 0cacc0b..0000000 --- a/configs/dino.py +++ /dev/null @@ -1,153 +0,0 @@ -_base_ = [ - './thumos14.py' -] -custom_imports = dict(imports=['my_modules'], allow_failed_imports=False) - -# TadTR (based on DeFormableDETR) setting: (DINO, TadTR) -enc_layers = 4 # 6, 4 -dec_layers = 4 # 6, 4 -dim_feedforward = 1024 # 2048, 1024 -dropout = 0.1 # 0.0, 0.1 -temperature = 10000 # 20, 10000 - -act_loss_coef = 4 # NA, 4 -cls_loss_coef = 2 # 1.0, 2.0 -seg_loss_coef = 5 # 5.0, 5.0 -iou_loss_coef = 2 # 2.0, 2.0 - -max_per_img = 100 # 300, 100 -lr = 0.0002 # 1e-4, 2e-4 - -# model setting -model = dict( - type='DINO', - num_queries=40, # num_matching_queries, should be smaller than the window size - with_box_refine=True, - as_two_stage=True, - data_preprocessor=dict(type='DetDataPreprocessor'), - backbone=dict(type='PseudoBackbone'), # No backbone since we use pre-extracted features. 
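# NOTE (editor): `PseudoBackbone` (my_modules/detector/pseudo_backbone.py) is
# referenced throughout these configs but not shown in this diff. Conceptually
# it is an identity module: the dataloader already yields pre-extracted I3D
# features, so the "backbone" only needs to hand them to the neck in the tuple
# format mmdet expects. A hypothetical minimal sketch:
import torch
from torch import nn

class IdentityFeatureBackbone(nn.Module):
    """Pass pre-extracted feature maps straight through to the neck."""

    def forward(self, x: torch.Tensor) -> tuple:
        # mmdet necks expect a tuple of feature maps, one entry per level
        return (x,)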
- neck=dict( - type='ChannelMapper', - in_channels=[2048], - kernel_size=1, - out_channels=256, - act_cfg=None, - norm_cfg=dict(type='GN', num_groups=32), - num_outs=1), - encoder=dict( - num_layers=enc_layers, - layer_cfg=dict( - self_attn_cfg=dict(embed_dims=256, num_levels=1, - dropout=dropout), # 0.1 for DeformDETR - ffn_cfg=dict( - embed_dims=256, - feedforward_channels=dim_feedforward, # 1024 for DeformDETR - ffn_drop=dropout))), # 0.1 for DeformDETR - decoder=dict( - num_layers=dec_layers, - return_intermediate=True, - layer_cfg=dict( - self_attn_cfg=dict(embed_dims=256, num_heads=8, - dropout=dropout), # 0.1 for DeformDETR - cross_attn_cfg=dict(embed_dims=256, num_levels=1, - dropout=dropout), # 0.1 for DeformDETR - ffn_cfg=dict( - embed_dims=256, - feedforward_channels=dim_feedforward, # 1024 for DeformDETR - ffn_drop=dropout)), # 0.1 for DeformDETR - post_norm_cfg=None), - positional_encoding=dict( - num_feats=128, - normalize=True, - offset=0.0, # -0.5 for DeformDETR - temperature=temperature), # 10000 for DeformDETR - bbox_head=dict( - type='CustomDINOHead', - num_classes=20, - sync_cls_avg_factor=True, - loss_cls=dict( - type='FocalLoss', - use_sigmoid=True, - gamma=2.0, - alpha=0.25, - loss_weight=cls_loss_coef), # 2.0 in DeformDETR - loss_bbox=dict(type='L1Loss', loss_weight=seg_loss_coef), - loss_iou=dict(type='GIoULoss', loss_weight=iou_loss_coef)), - dn_cfg=dict( # TODO: Move to model.train_cfg ? - label_noise_scale=0.5, - box_noise_scale=0.4, # 0.4 for DN-DETR - group_cfg=dict(dynamic=True, num_groups=None, - num_dn_queries=50)), # TODO: half num_dn_queries - # training and testing settings - train_cfg=dict( - assigner=dict( - type='HungarianAssigner', - match_costs=[ - dict(type='FocalLossCost', weight=2.0), - dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'), - dict(type='IoUCost', iou_mode='giou', weight=2.0) - ])), - test_cfg=dict(max_per_img=max_per_img)) # 100 for DeformDETR - -# train_pipeline, NOTE the img_scale and the Pad's size_divisor is different -# from the default setting in mmdet. - -# optimizer -optim_wrapper = dict( - type='OptimWrapper', - optimizer=dict( - type='AdamW', - lr=lr, # 0.0002 for DeformDETR - weight_decay=0.0001), - clip_grad=dict(max_norm=0.1, norm_type=2), - paramwise_cfg=dict(custom_keys={'backbone': dict(lr_mult=0.1)}) -) # custom_keys contains sampling_offsets and reference_points in DeformDETR # noqa - -# learning policy -# TadTR uses 30 epochs, but since we use random sliding windows rather than fixed overlapping windows, -# we should increase the number of epochs to maximize utilization of the video content. -max_epochs = 80 -train_cfg = dict( - type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=5) - -val_cfg = dict(type='ValLoop') -test_cfg = dict(type='TestLoop') - -param_scheduler = [ - dict( - type='MultiStepLR', - begin=0, - end=max_epochs, - by_epoch=True, - milestones=[70], - gamma=0.1) -] - -# NOTE: `auto_scale_lr` is for automatically scaling LR, -# USER SHOULD NOT CHANGE ITS VALUES. 
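# NOTE (editor): `auto_scale_lr` applies mmengine's linear scaling rule when
# training is launched with `--auto-scale-lr`: the configured lr is multiplied
# by the ratio of the actual total batch size to `base_batch_size`. A sketch of
# the arithmetic (not mmengine's code):
def scaled_lr(base_lr: float, num_gpus: int, samples_per_gpu: int,
              base_batch_size: int = 16) -> float:
    return base_lr * (num_gpus * samples_per_gpu) / base_batch_size

# e.g. 2 GPUs x 2 samples per GPU: 0.0002 * 4 / 16 = 5e-5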
-# base_batch_size = (8 GPUs) x (2 samples per GPU) -auto_scale_lr = dict(base_batch_size=16) - -default_scope = 'mmdet' -default_hooks = dict( - timer=dict(type='IterTimerHook'), - logger=dict(type='LoggerHook', interval=10), - param_scheduler=dict(type='ParamSchedulerHook'), - checkpoint=dict(type='CheckpointHook', interval=10), - sampler_seed=dict(type='DistSamplerSeedHook'), - visualization=dict(type='DetVisualizationHook')) - -env_cfg = dict( - cudnn_benchmark=False, - mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), - dist_cfg=dict(backend='nccl'), -) - -vis_backends = [dict(type='LocalVisBackend'), dict(type='TensorboardVisBackend')] -visualizer = dict( - type='DetLocalVisualizer', vis_backends=vis_backends, name='visualizer') -log_processor = dict(type='LogProcessor', window_size=50, by_epoch=True) - -log_level = 'INFO' -load_from = None -resume = False diff --git a/configs/dino_act.py b/configs/dino_act.py deleted file mode 100644 index a86f258..0000000 --- a/configs/dino_act.py +++ /dev/null @@ -1,169 +0,0 @@ -_base_ = [ - './thumos14.py' -] -custom_imports = dict(imports=['my_modules'], allow_failed_imports=False) - -# TadTR (based on DeFormableDETR) setting: (DINO, TadTR) -enc_layers = 4 # 6, 4 -dec_layers = 4 # 6, 4 -dim_feedforward = 1024 # 2048, 1024 -dropout = 0.1 # 0.0, 0.1 -temperature = 10000 # 20, 10000 - -act_loss_coef = 4 # NA, 4 -cls_loss_coef = 2 # 1.0, 2.0 -seg_loss_coef = 5 # 5.0, 5.0 -iou_loss_coef = 2 # 2.0, 2.0 - -max_per_img = 100 # 300, 100 -lr = 0.0002 # 1e-4, 2e-4 - -# model setting -model = dict( - type='CustomDINO', - num_queries=40, # num_matching_queries, should be smaller than the window size - with_box_refine=True, - as_two_stage=True, - data_preprocessor=dict(type='DetDataPreprocessor'), - backbone=dict(type='PseudoBackbone'), # No backbone since we use pre-extracted features. 
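# NOTE (editor): `temperature` above feeds the sine positional encoding
# (`PositionEmbeddingSine`, customized in my_modules/loss/positional_encoding.py
# and not shown here). A minimal 1D sketch of how temperature sets the
# frequency spectrum; the real class additionally handles the `normalize` and
# `offset` options used in these configs.
import torch

def sine_pos_encoding_1d(length: int, num_feats: int = 128,
                         temperature: float = 10000.0) -> torch.Tensor:
    pos = torch.arange(length, dtype=torch.float32)[:, None]   # (L, 1)
    dim_t = torch.arange(num_feats, dtype=torch.float32)
    dim_t = temperature ** (
        2 * torch.div(dim_t, 2, rounding_mode='floor') / num_feats)
    angles = pos / dim_t                                       # (L, num_feats)
    enc = torch.stack((angles[:, 0::2].sin(), angles[:, 1::2].cos()), dim=2)
    return enc.flatten(1)                                      # (L, num_feats)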
- neck=dict( - type='ChannelMapper', - in_channels=[2048], - kernel_size=1, - out_channels=256, - act_cfg=None, - norm_cfg=dict(type='GN', num_groups=32), - num_outs=1), - encoder=dict( - num_layers=enc_layers, - layer_cfg=dict( - self_attn_cfg=dict(embed_dims=256, num_levels=1, - dropout=dropout), # 0.1 for DeformDETR - ffn_cfg=dict( - embed_dims=256, - feedforward_channels=dim_feedforward, # 1024 for DeformDETR - ffn_drop=dropout))), # 0.1 for DeformDETR - decoder=dict( - num_layers=dec_layers, - return_intermediate=True, - layer_cfg=dict( - self_attn_cfg=dict(embed_dims=256, num_heads=8, - dropout=dropout), # 0.1 for DeformDETR - cross_attn_cfg=dict(embed_dims=256, num_levels=1, - dropout=dropout), # 0.1 for DeformDETR - ffn_cfg=dict( - embed_dims=256, - feedforward_channels=dim_feedforward, # 1024 for DeformDETR - ffn_drop=dropout)), # 0.1 for DeformDETR - post_norm_cfg=None), - positional_encoding=dict( - num_feats=128, - normalize=True, - offset=0.0, # -0.5 for DeformDETR - temperature=temperature), # 10000 for DeformDETR - bbox_head=dict( - type='MyRoIHead', - bbox_roi_extractor=dict( - type='GenericRoIExtractor', - aggregation='sum', - roi_layer=dict( - type='RoIAlign', - output_size=(1, 16), - sampling_ratio=0, - aligned=True), - out_channels=256, - featmap_strides=[1], - pre_cfg=None, - post_cfg=None), - expand_roi_factor=1.5, - actionness_loss=dict(type='L1Loss', loss_weight=act_loss_coef), - bbox_head=dict( - type='CustomDINOHead', - num_classes=20, - sync_cls_avg_factor=True, - loss_cls=dict( - type='FocalLoss', - use_sigmoid=True, - gamma=2.0, - alpha=0.25, - loss_weight=cls_loss_coef), # 2.0 in DeformDETR - loss_bbox=dict(type='L1Loss', loss_weight=seg_loss_coef), - loss_iou=dict(type='GIoULoss', loss_weight=iou_loss_coef))), - dn_cfg=dict( # TODO: Move to model.train_cfg ? - label_noise_scale=0.5, - box_noise_scale=0.4, # 0.4 for DN-DETR - group_cfg=dict(dynamic=True, num_groups=None, - num_dn_queries=50)), # TODO: half num_dn_queries - # training and testing settings - train_cfg=dict( - assigner=dict( - type='HungarianAssigner', - match_costs=[ - dict(type='FocalLossCost', weight=2.0), - dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'), - dict(type='IoUCost', iou_mode='giou', weight=2.0) - ])), - test_cfg=dict(max_per_img=max_per_img)) # 100 for DeformDETR - -# train_pipeline, NOTE the img_scale and the Pad's size_divisor is different -# from the default setting in mmdet. - -# optimizer -optim_wrapper = dict( - type='OptimWrapper', - optimizer=dict( - type='AdamW', - lr=lr, # 0.0002 for DeformDETR - weight_decay=0.0001), - clip_grad=dict(max_norm=0.1, norm_type=2), - paramwise_cfg=dict(custom_keys={'backbone': dict(lr_mult=0.1)}) -) # custom_keys contains sampling_offsets and reference_points in DeformDETR # noqa - -# learning policy -# TadTR uses 30 epochs, but since we use random sliding windows rather than fixed overlapping windows, -# we should increase the number of epochs to maximize utilization of the video content. -max_epochs = 80 -train_cfg = dict( - type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=5) - -val_cfg = dict(type='ValLoop') -test_cfg = dict(type='TestLoop') - -param_scheduler = [ - dict( - type='MultiStepLR', - begin=0, - end=max_epochs, - by_epoch=True, - milestones=[70], - gamma=0.1) -] - -# NOTE: `auto_scale_lr` is for automatically scaling LR, -# USER SHOULD NOT CHANGE ITS VALUES. 
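# NOTE (editor): `expand_roi_factor=1.5` above widens each predicted segment
# about its center before RoIAlign, so the actionness branch also sees boundary
# context. This mirrors `MyRoIHead.actionness_forward` in the (here deleted)
# my_modules/head/roi_head.py further down this diff; the arithmetic:
def expand_segment(x1: float, x2: float, factor: float = 1.5,
                   max_len: float = 256.0) -> tuple:
    # max_len stands in for the window length (img_shape[1] in the real code)
    center = (x1 + x2) / 2.0
    half = (x2 - x1) * factor / 2.0
    return max(center - half, 0.0), min(center + half, max_len)

# e.g. expand_segment(40, 80) -> (30.0, 90.0)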
-# base_batch_size = (8 GPUs) x (2 samples per GPU)
-auto_scale_lr = dict(base_batch_size=16)
-
-default_scope = 'mmdet'
-default_hooks = dict(
-    timer=dict(type='IterTimerHook'),
-    logger=dict(type='LoggerHook', interval=10),
-    param_scheduler=dict(type='ParamSchedulerHook'),
-    checkpoint=dict(type='CheckpointHook', interval=10),
-    sampler_seed=dict(type='DistSamplerSeedHook'),
-    visualization=dict(type='DetVisualizationHook'))
-
-env_cfg = dict(
-    cudnn_benchmark=False,
-    mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
-    dist_cfg=dict(backend='nccl'),
-)
-
-vis_backends = [dict(type='LocalVisBackend'), dict(type='TensorboardVisBackend')]
-visualizer = dict(
-    type='DetLocalVisualizer', vis_backends=vis_backends, name='visualizer')
-log_processor = dict(type='LogProcessor', window_size=50, by_epoch=True)
-
-log_level = 'INFO'
-load_from = None
-resume = False
diff --git a/configs/tadtr.py b/configs/tadtr.py
index 9f22685..fc85bae 100644
--- a/configs/tadtr.py
+++ b/configs/tadtr.py
@@ -68,33 +68,17 @@
     # num_feats=128, offset=-0.5 for DeformableDETR, but 256, 0 for TadTR, we cannot set to 256 because of y-axis
     positional_encoding=dict(num_feats=128, normalize=True, offset=0, temperature=temperature),
     bbox_head=dict(
-        type='MyRoIHead',
-        bbox_roi_extractor=dict(
-            type='GenericRoIExtractor',
-            aggregation='sum',
-            roi_layer=dict(
-                type='RoIAlign',
-                output_size=(1, 16),
-                sampling_ratio=0,
-                aligned=True),
-            out_channels=256,
-            featmap_strides=[1],
-            pre_cfg=None,
-            post_cfg=None),
-        expand_roi_factor=1.5,
-        actionness_loss=dict(type='L1Loss', loss_weight=act_loss_coef),
-        bbox_head=dict(
-            type='CustomDeformableDETRHead',
-            num_classes=20,
-            sync_cls_avg_factor=True,
-            loss_cls=dict(
-                type='FocalLoss',
-                use_sigmoid=True,
-                gamma=2.0,
-                alpha=0.25,
-                loss_weight=cls_loss_coef),
-            loss_bbox=dict(type='CustomL1Loss', loss_weight=seg_loss_coef),
-            loss_iou=dict(type='CustomIoULoss', mode='linear', loss_weight=iou_loss_coef))),  # -log(GIoU) for DeformableDETR
+        type='CustomDeformableDETRHead',
+        num_classes=20,
+        sync_cls_avg_factor=True,
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=cls_loss_coef),
+        loss_bbox=dict(type='CustomL1Loss', loss_weight=seg_loss_coef),
+        loss_iou=dict(type='CustomIoULoss', mode='linear', loss_weight=iou_loss_coef)),  # -log(GIoU) for DeformableDETR
     # training and testing settings
     train_cfg=dict(
         assigner=dict(
@@ -115,7 +100,7 @@
     optimizer=dict(
         type='AdamW',
         lr=lr,
-        weight_decay=0.0001), # 0.0001 by default
+        weight_decay=0.0001),  # 0.0001 by default
     clip_grad=dict(max_norm=0.1, norm_type=2),
     paramwise_cfg=dict(custom_keys={'backbone': dict(lr_mult=0.1),
                                     'sampling_offsets': dict(lr_mult=0.1),
                                     'reference_points': dict(lr_mult=0.1)
diff --git a/configs/tadtr_my.py b/configs/tadtr_my.py
index b0d4bdf..8fee421 100644
--- a/configs/tadtr_my.py
+++ b/configs/tadtr_my.py
@@ -36,10 +36,11 @@
         window_stride=192,  # overlap=0.25
         data_prefix=dict(feat='features/thumos_feat_ActionFormer_16input_4stride_2048/i3d_features')))
 
-# 3. Use multi-scale features via multi-level temporal 1d convolutions
+# 3. Use multi-level features via temporal 1d convolution layers
+# model setting
 model = dict(
     num_feature_levels=4,
-    backbone=dict(type='PseudoBackbone', multi_scale=False),
+    backbone=dict(type='PseudoBackbone', multi_scale=False),  # No backbone since we use pre-extracted features.
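# NOTE (editor): `DownSampler1D` (my_modules/neck/temporal_downsampler.py) is
# not shown in this diff. A sketch of the presumed idea -- strided temporal
# convolutions that turn the single feature level into the 4-level pyramid
# consumed by `num_levels=4` attention. Channel and kernel choices here are
# illustrative assumptions, not the repo's exact layers.
import torch
from torch import nn

class TemporalPyramid1D(nn.Module):
    def __init__(self, channels: int = 2048, num_levels: int = 4):
        super().__init__()
        self.downs = nn.ModuleList(
            nn.Conv1d(channels, channels, kernel_size=3, stride=2, padding=1)
            for _ in range(num_levels - 1))

    def forward(self, x: torch.Tensor) -> tuple:
        feats = [x]                        # level 0: full temporal resolution
        for down in self.downs:
            feats.append(down(feats[-1]))  # halve the temporal length per level
        return tuple(feats)                # lengths T, T/2, T/4, T/8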
neck=[ dict( type='DownSampler1D', @@ -55,14 +56,7 @@ out_channels=256, act_cfg=None, norm_cfg=dict(type='GN', num_groups=32), - num_outs=4) - # dict(type='FPN', - # in_channels=[2048, 512, 512, 512, 512, 512], - # out_channels=256, - # num_outs=6, - # conv_cfg=dict(type='Conv1d'), - # norm_cfg=dict(type='SyncBN')), - ], + num_outs=4)], encoder=dict(layer_cfg=dict(self_attn_cfg=dict(num_levels=4))), decoder=dict(layer_cfg=dict(cross_attn_cfg=dict(num_levels=4))) ) diff --git a/my_modules/detector/__init__.py b/my_modules/detector/__init__.py index 945a172..28964f0 100644 --- a/my_modules/detector/__init__.py +++ b/my_modules/detector/__init__.py @@ -1,5 +1,2 @@ -# from .base_detr import * -# from .deformable_detr import * -from .dino import * from .deformable_detr import * from .pseudo_backbone import * diff --git a/my_modules/detector/deformable_detr.py b/my_modules/detector/deformable_detr.py index 3b47eeb..2873935 100644 --- a/my_modules/detector/deformable_detr.py +++ b/my_modules/detector/deformable_detr.py @@ -1,11 +1,5 @@ -from typing import Dict, Tuple - -import torch from mmdet.models.detectors import DeformableDETR from mmdet.registry import MODELS -from mmdet.structures import OptSampleList -from torch import Tensor -from torch.nn import functional as F from my_modules.layers.pseudo_layers import Pseudo2DLinear from my_modules.loss.positional_encoding import PositionEmbeddingSine @@ -31,26 +25,3 @@ def _init_layers(self) -> None: **pos_cfg) if not self.as_two_stage: self.reference_points_fc = Pseudo2DLinear(self.embed_dims, 1) - - - def forward_transformer(self, - img_feats: Tuple[Tensor], - batch_data_samples: OptSampleList = None) -> Dict: - encoder_inputs_dict, decoder_inputs_dict = self.pre_transformer( - img_feats, batch_data_samples) - - encoder_outputs_dict = self.forward_encoder(**encoder_inputs_dict) - - tmp_dec_in, head_inputs_dict = self.pre_decoder(**encoder_outputs_dict) - decoder_inputs_dict.update(tmp_dec_in) - - decoder_outputs_dict = self.forward_decoder(**decoder_inputs_dict) - head_inputs_dict.update(decoder_outputs_dict) - - # louis: input encoder memory into2 the head for RoIAlign and actionness regression. - memory = encoder_outputs_dict['memory'] - level_start_index = decoder_inputs_dict['level_start_index'].cpu() - # memory [N, W, C] -> [N, C, W] -> [N, C, 1, W] -> split on last dimension (W) - mlvl_memory = torch.tensor_split(memory.transpose(1, 2).unsqueeze(2), level_start_index[1:], dim=-1) - head_inputs_dict['memory'] = mlvl_memory - return head_inputs_dict diff --git a/my_modules/detector/dino.py b/my_modules/detector/dino.py deleted file mode 100644 index 18c0748..0000000 --- a/my_modules/detector/dino.py +++ /dev/null @@ -1,66 +0,0 @@ -from typing import Dict, Tuple - -import torch -from mmdet.models.detectors import DINO -from mmdet.registry import MODELS -from mmdet.structures import OptSampleList -from torch import Tensor - - -@MODELS.register_module() -class CustomDINO(DINO): - """ - The Customized DINO that input memory (encoder output) into the head. - This customized DINO output memory into the head for the RoI purpose, - without any functional changes from the original DINO. - """ - - def forward_transformer( - self, - img_feats: Tuple[Tensor], - batch_data_samples: OptSampleList = None, - ) -> Dict: - """Forward process of Transformer. 
- - The forward procedure of the transformer is defined as: - 'pre_transformer' -> 'encoder' -> 'pre_decoder' -> 'decoder' - More details can be found at `TransformerDetector.forward_transformer` - in `mmdet/detector/base_detr.py`. - The difference is that the ground truth in `batch_data_samples` is - required for the `pre_decoder` to prepare the query of DINO. - Additionally, DINO inherits the `pre_transformer` method and the - `forward_encoder` method of DeformableDETR. More details about the - two methods can be found in `mmdet/detector/deformable_detr.py`. - - Args: - img_feats (tuple[Tensor]): Tuple of feature maps from neck. Each - feature map has shape (bs, dim, H, W). - batch_data_samples (list[:obj:`DetDataSample`]): The batch - data samples. It usually includes information such - as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. - Defaults to None. - - Returns: - dict: The dictionary of bbox_head function inputs, which always - includes the `hidden_states` of the decoder output and may contain - `references` including the initial and intermediate references. - """ - encoder_inputs_dict, decoder_inputs_dict = self.pre_transformer( - img_feats, batch_data_samples) - - encoder_outputs_dict = self.forward_encoder(**encoder_inputs_dict) - - tmp_dec_in, head_inputs_dict = self.pre_decoder( - **encoder_outputs_dict, batch_data_samples=batch_data_samples) - decoder_inputs_dict.update(tmp_dec_in) - - decoder_outputs_dict = self.forward_decoder(**decoder_inputs_dict) - head_inputs_dict.update(decoder_outputs_dict) - - # louis: input encoder memory into the head for RoIAlign and actionness regression. - memory = encoder_outputs_dict['memory'] - level_start_index = decoder_inputs_dict['level_start_index'].cpu() - # memory [N, W, C] -> [N, C, W] -> [N, C, 1, W] -> split on last dimension (W) - mlvl_memory = torch.tensor_split(memory.transpose(1, 2).unsqueeze(2), level_start_index[1:], dim=-1) - head_inputs_dict['memory'] = mlvl_memory - return head_inputs_dict diff --git a/my_modules/head/__init__.py b/my_modules/head/__init__.py index 4461f3a..6c96a0c 100644 --- a/my_modules/head/__init__.py +++ b/my_modules/head/__init__.py @@ -2,4 +2,3 @@ # from .detr_head import * from .dino_head import * from .deformable_detr_head import * -from .roi_head import * diff --git a/my_modules/head/deformable_detr_head.py b/my_modules/head/deformable_detr_head.py index 5209694..9bda616 100644 --- a/my_modules/head/deformable_detr_head.py +++ b/my_modules/head/deformable_detr_head.py @@ -1,10 +1,6 @@ -from typing import List, Tuple - from mmdet.models.dense_heads import DeformableDETRHead from mmdet.registry import MODELS -from mmdet.structures import SampleList -from mmdet.utils import InstanceList -from torch import nn, Tensor +from torch import nn from my_modules.layers.pseudo_layers import Pseudo4DLinear @@ -29,24 +25,3 @@ def init_weights(self) -> None: if self.as_two_stage: for m in self.reg_branches: nn.init.constant_(m[-1].bias.data[1:], 0.0) # [2:] -> [1:] - - def loss_and_predict( - self, - hidden_states: Tensor, references: List[Tensor], - enc_outputs_class: Tensor, enc_outputs_coord: Tensor, - batch_data_samples: SampleList, - rescale: bool = False) -> Tuple[dict, InstanceList]: - batch_gt_instances = [] - batch_img_metas = [] - for data_sample in batch_data_samples: - batch_img_metas.append(data_sample.metainfo) - batch_gt_instances.append(data_sample.gt_instances) - - outs = self(hidden_states, references) - loss_inputs = outs + (enc_outputs_class, enc_outputs_coord, - batch_gt_instances, 
batch_img_metas) - losses = self.loss_by_feat(*loss_inputs) - - predictions = self.predict_by_feat( - *outs, batch_img_metas=batch_img_metas, rescale=rescale) - return losses, predictions diff --git a/my_modules/head/roi_head.py b/my_modules/head/roi_head.py deleted file mode 100644 index 2d283ab..0000000 --- a/my_modules/head/roi_head.py +++ /dev/null @@ -1,136 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import warnings -from typing import List - -import torch -from mmdet.registry import MODELS -from mmdet.structures import DetDataSample -from mmdet.structures.bbox import bbox2roi, bbox_overlaps -from mmdet.utils import ConfigType, InstanceList, OptMultiConfig -from mmengine.model import BaseModule -from torch import nn - - -@MODELS.register_module() -class MyRoIHead(BaseModule): - """Simplest base roi head including one bbox head and one mask head.""" - - def __init__(self, - bbox_roi_extractor: ConfigType, - actionness_loss: ConfigType, - bbox_head: ConfigType, - init_cfg: OptMultiConfig = None, - expand_roi_factor=1.5, - active=True, # experimental arguments, set to False can deactivate the RoI - **kwargs) -> None: - super().__init__(init_cfg=init_cfg) - self.active = active - if active: - self.bbox_roi_extractor = MODELS.build(bbox_roi_extractor) - self.roi_size = bbox_roi_extractor['roi_layer']['output_size'][1] - self.dim = bbox_roi_extractor['out_channels'] - self.expand_roi_factor = expand_roi_factor - self.actionness_fc = nn.Sequential( - nn.Flatten(start_dim=1), - nn.Linear(self.roi_size * self.dim, self.dim), - nn.ReLU(inplace=True), - nn.Linear(self.dim, self.dim), - nn.ReLU(inplace=True), - nn.Linear(self.dim, 1), - nn.Sigmoid()) - self.actionness_loss = MODELS.build(actionness_loss) - else: - warnings.warn("Please note that the RoIHead is now deactivated, no RoI will be applied") - - bbox_head.update(kwargs) - self.bbox_head = MODELS.build(bbox_head) - - @property - def num_classes(self): - return self.bbox_head.num_classes - - @property - def reg_branches(self): - return self.bbox_head.reg_branches - - @property - def cls_branches(self): - return self.bbox_head.cls_branches - - def forward(self, *args, **kwargs) -> tuple: - return self.bbox_head(*args, **kwargs) - - def loss(self, batch_data_samples: List[DetDataSample], **head_inputs_dict) -> dict: - memory = head_inputs_dict.pop('memory') - - if self.active: - bbox_head_loss, bbox_pred = self.bbox_head.loss_and_predict(batch_data_samples=batch_data_samples, - rescale=False, - **head_inputs_dict) - actionness_pred = self.actionness_forward(memory, bbox_pred, batch_data_samples) - actionness_target = self.get_actionness_target(bbox_pred, batch_data_samples) - actionness_loss = self.actionness_loss(actionness_pred.reshape(-1), - torch.cat(actionness_target, 0)) - bbox_head_loss.update(dict(actionness_loss=actionness_loss)) - else: - bbox_head_loss = self.bbox_head.loss(batch_data_samples=batch_data_samples, - **head_inputs_dict) - - return bbox_head_loss - - def predict(self, batch_data_samples: List[DetDataSample], rescale, **head_inputs_dict) -> InstanceList: - memory = head_inputs_dict.pop('memory') - bbox_pred = self.bbox_head.predict(batch_data_samples=batch_data_samples, - rescale=False, - **head_inputs_dict) - if self.active: - actionness_pred = self.actionness_forward(memory, bbox_pred, batch_data_samples).reshape(len(bbox_pred), -1) - else: - actionness_pred = [pred.scores for pred in bbox_pred] - return self.post_process(bbox_pred, actionness_pred, batch_data_samples, rescale) - - def 
actionness_forward(self, memory, bbox_pred, batch_data_samples): - # Expand the range of (x1, x2) - ex_bbox_pred = [res.bboxes.clone().detach() for res in bbox_pred] - for bboxes, data_sample in zip(ex_bbox_pred, batch_data_samples): - max_len = data_sample.metainfo['img_shape'][1] - length = bboxes[:, 2] - bboxes[:, 0] - center = (bboxes[:, 2] + bboxes[:, 0]) / 2 - bboxes[:, 0] = (center - length * self.expand_roi_factor / 2).clamp(min=0, max=max_len) - bboxes[:, 2] = (center + length * self.expand_roi_factor / 2).clamp(min=0, max=max_len) - - # actionness regression prediction - rois = bbox2roi(ex_bbox_pred).detach() - bbox_feats = self.bbox_roi_extractor(memory[:self.bbox_roi_extractor.num_inputs], rois) - actionness_pred = self.actionness_fc(bbox_feats) - return actionness_pred - - @staticmethod - def get_actionness_target(bbox_pred, batch_data_samples): - batch_bboxes = [res.bboxes for res in bbox_pred] - batch_gt_bboxes = [data_sample.gt_instances.bboxes for data_sample in batch_data_samples] - - # Fix the y1 y2 - for res in bbox_pred: - res.bboxes[:, 1] = 0.1 - res.bboxes[:, 3] = 0.9 - - actionness_target = [] - for bboxes, gt_bboxes in zip(batch_bboxes, batch_gt_bboxes): - iou_mat = bbox_overlaps(bboxes, gt_bboxes, mode='iou', is_aligned=False) - gt_iou = iou_mat.max(dim=1)[0] - actionness_target.append(gt_iou.detach()) - - return actionness_target - - @staticmethod - def post_process(bbox_pred, actionness_pred, batch_data_samples, rescale): - for pred, data_sample, actionness in zip(bbox_pred, batch_data_samples, actionness_pred): - img_meta = data_sample.metainfo - if rescale: - assert img_meta.get('scale_factor') is not None - pred.bboxes /= pred.bboxes.new_tensor( - img_meta['scale_factor']).repeat((1, 2)) - # using actionness regression results as confidence scores of bboxes instead of classification score - pred.scores = torch.sqrt(pred.scores * actionness) - return bbox_pred diff --git a/my_modules/layers/__init__.py b/my_modules/layers/__init__.py index 3d6a90d..cb911c3 100644 --- a/my_modules/layers/__init__.py +++ b/my_modules/layers/__init__.py @@ -1,3 +1,2 @@ from .custom_layers import * -from .channel_mapper import * from .pseudo_layers import * diff --git a/my_modules/neck/__init__.py b/my_modules/neck/__init__.py index 709bada..80085d3 100644 --- a/my_modules/neck/__init__.py +++ b/my_modules/neck/__init__.py @@ -1 +1,2 @@ from .temporal_downsampler import * +from .channel_mapper import * diff --git a/my_modules/layers/channel_mapper.py b/my_modules/neck/channel_mapper.py similarity index 100% rename from my_modules/layers/channel_mapper.py rename to my_modules/neck/channel_mapper.py
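# NOTE (editor): for reference, the two pieces of behavior this diff removes,
# in standalone form. First, the deleted forward_transformer methods split the
# flattened encoder memory back into per-level feature maps with
# torch.tensor_split before handing it to MyRoIHead; second,
# MyRoIHead.post_process fused classification and actionness scores
# geometrically. Both snippets are grounded in the deleted code above:
import torch

N, W, C = 2, 14, 256                          # batch, summed level lengths, dims
memory = torch.randn(N, W, C)                 # flattened encoder output
level_start_index = torch.tensor([0, 8, 12])  # e.g. levels of length 8, 4, 2

# [N, W, C] -> [N, C, W] -> [N, C, 1, W], then split on W at the level borders
mlvl_memory = torch.tensor_split(
    memory.transpose(1, 2).unsqueeze(2), level_start_index[1:].cpu(), dim=-1)
assert [m.shape[-1] for m in mlvl_memory] == [8, 4, 2]

# final confidence = sqrt(cls_score * actionness), replacing the pure cls score
cls_score, actionness = torch.rand(100), torch.rand(100)
scores = torch.sqrt(cls_score * actionness)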