diff --git a/configs/deformable.py b/configs/deformable.py deleted file mode 100644 index 51e158b..0000000 --- a/configs/deformable.py +++ /dev/null @@ -1,154 +0,0 @@ -_base_ = [ - './thumos14.py' -] -custom_imports = dict(imports=['my_modules'], allow_failed_imports=False) - -# TadTR (based on DeFormableDETR) setting: (DINO, TadTR) -enc_layers = 4 # 6, 4 -dec_layers = 4 # 6, 4 -dim_feedforward = 1024 # 2048, 1024 -dropout = 0.1 # 0.0, 0.1 -temperature = 10000 # 20, 10000 - -act_loss_coef = 4 # NA, 4 -cls_loss_coef = 2 # 1.0, 2.0 -seg_loss_coef = 5 # 5.0, 5.0 -iou_loss_coef = 2 # 2.0, 2.0 - -max_per_img = 100 # 300, 100 -lr = 0.0002 # 1e-4, 2e-4 - -# model setting -model = dict( - type='DeformableDETR', - num_queries=40, # num_matching_queries, should be smaller than the window size - with_box_refine=True, - as_two_stage=True, - num_feature_levels=1, - data_preprocessor=dict(type='DetDataPreprocessor'), - backbone=dict(type='PseudoBackbone'), # No backbone since we use pre-extracted features. - neck=dict( - type='ChannelMapper', - in_channels=[2048], - kernel_size=1, - out_channels=256, - act_cfg=None, - norm_cfg=dict(type='GN', num_groups=32), - num_outs=1), - encoder=dict( - num_layers=enc_layers, # 6 for DeformableDETR - layer_cfg=dict( - self_attn_cfg=dict( - num_levels=1, - embed_dims=256, - batch_first=True), - ffn_cfg=dict( - embed_dims=256, - feedforward_channels=dim_feedforward, - ffn_drop=dropout))), - decoder=dict( - num_layers=dec_layers, # 6 for DeformableDETR - return_intermediate=True, - layer_cfg=dict( - self_attn_cfg=dict( - embed_dims=256, - num_heads=8, - dropout=dropout, - batch_first=True), - cross_attn_cfg=dict( - embed_dims=256, - num_levels=1, - batch_first=True), - ffn_cfg=dict( - embed_dims=256, - feedforward_channels=dim_feedforward, - ffn_drop=dropout)), - post_norm_cfg=None), - positional_encoding=dict(num_feats=128, normalize=True, offset=-0.5, temperature=temperature), - bbox_head=dict( - type='DeformableDETRHead', - num_classes=20, - sync_cls_avg_factor=True, - loss_cls=dict( - type='FocalLoss', - use_sigmoid=True, - gamma=2.0, - alpha=0.25, - loss_weight=cls_loss_coef), - loss_bbox=dict(type='CustomL1Loss', loss_weight=seg_loss_coef), # customized to ignore y1, y2 - loss_iou=dict(type='CustomGIoULoss', loss_weight=iou_loss_coef)), # customized to ignore y1, y2 - # training and testing settings - train_cfg=dict( - assigner=dict( - type='HungarianAssigner', - match_costs=[ - dict(type='FocalLossCost', weight=2.0), - dict(type='CustomBBoxL1Cost', weight=5.0, box_format='xywh'), # customized to ignore y1, y2 - dict(type='CustomIoUCost', iou_mode='giou', weight=2.0) # customized to ignore y1, y2 - ])), - test_cfg=dict(max_per_img=max_per_img)) - -# train_pipeline, NOTE the img_scale and the Pad's size_divisor is different -# from the default setting in mmdet. - -# optimizer -optim_wrapper = dict( - type='OptimWrapper', - optimizer=dict( - type='AdamW', - lr=lr, - weight_decay=0.0001), - clip_grad=dict(max_norm=0.1, norm_type=2), - paramwise_cfg=dict(custom_keys={'backbone': dict(lr_mult=0.1), - 'sampling_offsets': dict(lr_mult=0.1), - 'reference_points': dict(lr_mult=0.1) - })) - -# learning policy -# TadTR uses 30 epochs, but since we use random sliding windows rather than fixed overlapping windows, -# we should increase the number of epochs to maximize utilization of the video content. 
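# NOTE (editor): `CustomL1Loss` / `CustomGIoULoss` above live in `my_modules`
# and are not shown in this diff. A minimal sketch of the idea they implement --
# masking the y-components of the (x1, y1, x2, y2) pseudo-boxes so that only
# the temporal axis contributes to the regression loss. The mask-based code
# below is an illustrative assumption, not the repo's implementation.
import torch

def l1_loss_ignore_y(pred: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
    """L1 loss over (x1, y1, x2, y2) boxes with the y terms zeroed out."""
    mask = pred.new_tensor([1.0, 0.0, 1.0, 0.0])  # keep x1/x2, drop y1/y2
    return ((pred - target).abs() * mask).sum(-1).mean()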
-max_epochs = 16 # 16 for TadTR -train_cfg = dict( - type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=1) # 1 for TadTR - -val_cfg = dict(type='ValLoop') -test_cfg = dict(type='TestLoop') - -param_scheduler = [ - dict( - type='MultiStepLR', - begin=0, - end=max_epochs, - by_epoch=True, - milestones=[14], # 14 for TadTR - gamma=0.1) -] - -# NOTE: `auto_scale_lr` is for automatically scaling LR, -# USER SHOULD NOT CHANGE ITS VALUES. -# base_batch_size = (8 GPUs) x (2 samples per GPU) -auto_scale_lr = dict(base_batch_size=16) - -default_scope = 'mmdet' -default_hooks = dict( - timer=dict(type='IterTimerHook'), - logger=dict(type='LoggerHook', interval=10), - param_scheduler=dict(type='ParamSchedulerHook'), - checkpoint=dict(type='CheckpointHook', interval=10), - sampler_seed=dict(type='DistSamplerSeedHook'), - visualization=dict(type='DetVisualizationHook')) - -env_cfg = dict( - cudnn_benchmark=False, - mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), - dist_cfg=dict(backend='nccl'), -) - -vis_backends = [dict(type='LocalVisBackend'), dict(type='TensorboardVisBackend')] -visualizer = dict( - type='DetLocalVisualizer', vis_backends=vis_backends, name='visualizer') -log_processor = dict(type='LogProcessor', window_size=50, by_epoch=True) - -log_level = 'INFO' -load_from = None -resume = False diff --git a/configs/dino.py b/configs/dino.py deleted file mode 100644 index 0cacc0b..0000000 --- a/configs/dino.py +++ /dev/null @@ -1,153 +0,0 @@ -_base_ = [ - './thumos14.py' -] -custom_imports = dict(imports=['my_modules'], allow_failed_imports=False) - -# TadTR (based on DeFormableDETR) setting: (DINO, TadTR) -enc_layers = 4 # 6, 4 -dec_layers = 4 # 6, 4 -dim_feedforward = 1024 # 2048, 1024 -dropout = 0.1 # 0.0, 0.1 -temperature = 10000 # 20, 10000 - -act_loss_coef = 4 # NA, 4 -cls_loss_coef = 2 # 1.0, 2.0 -seg_loss_coef = 5 # 5.0, 5.0 -iou_loss_coef = 2 # 2.0, 2.0 - -max_per_img = 100 # 300, 100 -lr = 0.0002 # 1e-4, 2e-4 - -# model setting -model = dict( - type='DINO', - num_queries=40, # num_matching_queries, should be smaller than the window size - with_box_refine=True, - as_two_stage=True, - data_preprocessor=dict(type='DetDataPreprocessor'), - backbone=dict(type='PseudoBackbone'), # No backbone since we use pre-extracted features. 
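# NOTE (editor): `PseudoBackbone` (my_modules/detector/pseudo_backbone.py) is
# referenced throughout these configs but not shown in this diff. Conceptually
# it is an identity module: the dataloader already yields pre-extracted I3D
# features, so the "backbone" only needs to hand them to the neck in the tuple
# format mmdet expects. A hypothetical minimal sketch:
import torch
from torch import nn

class IdentityFeatureBackbone(nn.Module):
    """Pass pre-extracted feature maps straight through to the neck."""

    def forward(self, x: torch.Tensor) -> tuple:
        # mmdet necks expect a tuple of feature maps, one entry per level
        return (x,)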
- neck=dict( - type='ChannelMapper', - in_channels=[2048], - kernel_size=1, - out_channels=256, - act_cfg=None, - norm_cfg=dict(type='GN', num_groups=32), - num_outs=1), - encoder=dict( - num_layers=enc_layers, - layer_cfg=dict( - self_attn_cfg=dict(embed_dims=256, num_levels=1, - dropout=dropout), # 0.1 for DeformDETR - ffn_cfg=dict( - embed_dims=256, - feedforward_channels=dim_feedforward, # 1024 for DeformDETR - ffn_drop=dropout))), # 0.1 for DeformDETR - decoder=dict( - num_layers=dec_layers, - return_intermediate=True, - layer_cfg=dict( - self_attn_cfg=dict(embed_dims=256, num_heads=8, - dropout=dropout), # 0.1 for DeformDETR - cross_attn_cfg=dict(embed_dims=256, num_levels=1, - dropout=dropout), # 0.1 for DeformDETR - ffn_cfg=dict( - embed_dims=256, - feedforward_channels=dim_feedforward, # 1024 for DeformDETR - ffn_drop=dropout)), # 0.1 for DeformDETR - post_norm_cfg=None), - positional_encoding=dict( - num_feats=128, - normalize=True, - offset=0.0, # -0.5 for DeformDETR - temperature=temperature), # 10000 for DeformDETR - bbox_head=dict( - type='CustomDINOHead', - num_classes=20, - sync_cls_avg_factor=True, - loss_cls=dict( - type='FocalLoss', - use_sigmoid=True, - gamma=2.0, - alpha=0.25, - loss_weight=cls_loss_coef), # 2.0 in DeformDETR - loss_bbox=dict(type='L1Loss', loss_weight=seg_loss_coef), - loss_iou=dict(type='GIoULoss', loss_weight=iou_loss_coef)), - dn_cfg=dict( # TODO: Move to model.train_cfg ? - label_noise_scale=0.5, - box_noise_scale=0.4, # 0.4 for DN-DETR - group_cfg=dict(dynamic=True, num_groups=None, - num_dn_queries=50)), # TODO: half num_dn_queries - # training and testing settings - train_cfg=dict( - assigner=dict( - type='HungarianAssigner', - match_costs=[ - dict(type='FocalLossCost', weight=2.0), - dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'), - dict(type='IoUCost', iou_mode='giou', weight=2.0) - ])), - test_cfg=dict(max_per_img=max_per_img)) # 100 for DeformDETR - -# train_pipeline, NOTE the img_scale and the Pad's size_divisor is different -# from the default setting in mmdet. - -# optimizer -optim_wrapper = dict( - type='OptimWrapper', - optimizer=dict( - type='AdamW', - lr=lr, # 0.0002 for DeformDETR - weight_decay=0.0001), - clip_grad=dict(max_norm=0.1, norm_type=2), - paramwise_cfg=dict(custom_keys={'backbone': dict(lr_mult=0.1)}) -) # custom_keys contains sampling_offsets and reference_points in DeformDETR # noqa - -# learning policy -# TadTR uses 30 epochs, but since we use random sliding windows rather than fixed overlapping windows, -# we should increase the number of epochs to maximize utilization of the video content. -max_epochs = 80 -train_cfg = dict( - type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=5) - -val_cfg = dict(type='ValLoop') -test_cfg = dict(type='TestLoop') - -param_scheduler = [ - dict( - type='MultiStepLR', - begin=0, - end=max_epochs, - by_epoch=True, - milestones=[70], - gamma=0.1) -] - -# NOTE: `auto_scale_lr` is for automatically scaling LR, -# USER SHOULD NOT CHANGE ITS VALUES. 
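# NOTE (editor): `auto_scale_lr` applies mmengine's linear scaling rule when
# training is launched with `--auto-scale-lr`: the configured lr is multiplied
# by the ratio of the actual total batch size to `base_batch_size`. A sketch of
# the arithmetic (not mmengine's code):
def scaled_lr(base_lr: float, num_gpus: int, samples_per_gpu: int,
              base_batch_size: int = 16) -> float:
    return base_lr * (num_gpus * samples_per_gpu) / base_batch_size

# e.g. 2 GPUs x 2 samples per GPU: 0.0002 * 4 / 16 = 5e-5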
-# base_batch_size = (8 GPUs) x (2 samples per GPU) -auto_scale_lr = dict(base_batch_size=16) - -default_scope = 'mmdet' -default_hooks = dict( - timer=dict(type='IterTimerHook'), - logger=dict(type='LoggerHook', interval=10), - param_scheduler=dict(type='ParamSchedulerHook'), - checkpoint=dict(type='CheckpointHook', interval=10), - sampler_seed=dict(type='DistSamplerSeedHook'), - visualization=dict(type='DetVisualizationHook')) - -env_cfg = dict( - cudnn_benchmark=False, - mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), - dist_cfg=dict(backend='nccl'), -) - -vis_backends = [dict(type='LocalVisBackend'), dict(type='TensorboardVisBackend')] -visualizer = dict( - type='DetLocalVisualizer', vis_backends=vis_backends, name='visualizer') -log_processor = dict(type='LogProcessor', window_size=50, by_epoch=True) - -log_level = 'INFO' -load_from = None -resume = False diff --git a/configs/dino_act.py b/configs/dino_act.py deleted file mode 100644 index a86f258..0000000 --- a/configs/dino_act.py +++ /dev/null @@ -1,169 +0,0 @@ -_base_ = [ - './thumos14.py' -] -custom_imports = dict(imports=['my_modules'], allow_failed_imports=False) - -# TadTR (based on DeFormableDETR) setting: (DINO, TadTR) -enc_layers = 4 # 6, 4 -dec_layers = 4 # 6, 4 -dim_feedforward = 1024 # 2048, 1024 -dropout = 0.1 # 0.0, 0.1 -temperature = 10000 # 20, 10000 - -act_loss_coef = 4 # NA, 4 -cls_loss_coef = 2 # 1.0, 2.0 -seg_loss_coef = 5 # 5.0, 5.0 -iou_loss_coef = 2 # 2.0, 2.0 - -max_per_img = 100 # 300, 100 -lr = 0.0002 # 1e-4, 2e-4 - -# model setting -model = dict( - type='CustomDINO', - num_queries=40, # num_matching_queries, should be smaller than the window size - with_box_refine=True, - as_two_stage=True, - data_preprocessor=dict(type='DetDataPreprocessor'), - backbone=dict(type='PseudoBackbone'), # No backbone since we use pre-extracted features. 
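# NOTE (editor): `temperature` above feeds the sine positional encoding
# (`PositionEmbeddingSine`, customized in my_modules/loss/positional_encoding.py
# and not shown here). A minimal 1D sketch of how temperature sets the
# frequency spectrum; the real class additionally handles the `normalize` and
# `offset` options used in these configs.
import torch

def sine_pos_encoding_1d(length: int, num_feats: int = 128,
                         temperature: float = 10000.0) -> torch.Tensor:
    pos = torch.arange(length, dtype=torch.float32)[:, None]   # (L, 1)
    dim_t = torch.arange(num_feats, dtype=torch.float32)
    dim_t = temperature ** (
        2 * torch.div(dim_t, 2, rounding_mode='floor') / num_feats)
    angles = pos / dim_t                                       # (L, num_feats)
    enc = torch.stack((angles[:, 0::2].sin(), angles[:, 1::2].cos()), dim=2)
    return enc.flatten(1)                                      # (L, num_feats)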
- neck=dict( - type='ChannelMapper', - in_channels=[2048], - kernel_size=1, - out_channels=256, - act_cfg=None, - norm_cfg=dict(type='GN', num_groups=32), - num_outs=1), - encoder=dict( - num_layers=enc_layers, - layer_cfg=dict( - self_attn_cfg=dict(embed_dims=256, num_levels=1, - dropout=dropout), # 0.1 for DeformDETR - ffn_cfg=dict( - embed_dims=256, - feedforward_channels=dim_feedforward, # 1024 for DeformDETR - ffn_drop=dropout))), # 0.1 for DeformDETR - decoder=dict( - num_layers=dec_layers, - return_intermediate=True, - layer_cfg=dict( - self_attn_cfg=dict(embed_dims=256, num_heads=8, - dropout=dropout), # 0.1 for DeformDETR - cross_attn_cfg=dict(embed_dims=256, num_levels=1, - dropout=dropout), # 0.1 for DeformDETR - ffn_cfg=dict( - embed_dims=256, - feedforward_channels=dim_feedforward, # 1024 for DeformDETR - ffn_drop=dropout)), # 0.1 for DeformDETR - post_norm_cfg=None), - positional_encoding=dict( - num_feats=128, - normalize=True, - offset=0.0, # -0.5 for DeformDETR - temperature=temperature), # 10000 for DeformDETR - bbox_head=dict( - type='MyRoIHead', - bbox_roi_extractor=dict( - type='GenericRoIExtractor', - aggregation='sum', - roi_layer=dict( - type='RoIAlign', - output_size=(1, 16), - sampling_ratio=0, - aligned=True), - out_channels=256, - featmap_strides=[1], - pre_cfg=None, - post_cfg=None), - expand_roi_factor=1.5, - actionness_loss=dict(type='L1Loss', loss_weight=act_loss_coef), - bbox_head=dict( - type='CustomDINOHead', - num_classes=20, - sync_cls_avg_factor=True, - loss_cls=dict( - type='FocalLoss', - use_sigmoid=True, - gamma=2.0, - alpha=0.25, - loss_weight=cls_loss_coef), # 2.0 in DeformDETR - loss_bbox=dict(type='L1Loss', loss_weight=seg_loss_coef), - loss_iou=dict(type='GIoULoss', loss_weight=iou_loss_coef))), - dn_cfg=dict( # TODO: Move to model.train_cfg ? - label_noise_scale=0.5, - box_noise_scale=0.4, # 0.4 for DN-DETR - group_cfg=dict(dynamic=True, num_groups=None, - num_dn_queries=50)), # TODO: half num_dn_queries - # training and testing settings - train_cfg=dict( - assigner=dict( - type='HungarianAssigner', - match_costs=[ - dict(type='FocalLossCost', weight=2.0), - dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'), - dict(type='IoUCost', iou_mode='giou', weight=2.0) - ])), - test_cfg=dict(max_per_img=max_per_img)) # 100 for DeformDETR - -# train_pipeline, NOTE the img_scale and the Pad's size_divisor is different -# from the default setting in mmdet. - -# optimizer -optim_wrapper = dict( - type='OptimWrapper', - optimizer=dict( - type='AdamW', - lr=lr, # 0.0002 for DeformDETR - weight_decay=0.0001), - clip_grad=dict(max_norm=0.1, norm_type=2), - paramwise_cfg=dict(custom_keys={'backbone': dict(lr_mult=0.1)}) -) # custom_keys contains sampling_offsets and reference_points in DeformDETR # noqa - -# learning policy -# TadTR uses 30 epochs, but since we use random sliding windows rather than fixed overlapping windows, -# we should increase the number of epochs to maximize utilization of the video content. -max_epochs = 80 -train_cfg = dict( - type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=5) - -val_cfg = dict(type='ValLoop') -test_cfg = dict(type='TestLoop') - -param_scheduler = [ - dict( - type='MultiStepLR', - begin=0, - end=max_epochs, - by_epoch=True, - milestones=[70], - gamma=0.1) -] - -# NOTE: `auto_scale_lr` is for automatically scaling LR, -# USER SHOULD NOT CHANGE ITS VALUES. 
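# NOTE (editor): `expand_roi_factor=1.5` above widens each predicted segment
# about its center before RoIAlign, so the actionness branch also sees boundary
# context. This mirrors `MyRoIHead.actionness_forward` in the (here deleted)
# my_modules/head/roi_head.py further down this diff; the arithmetic:
def expand_segment(x1: float, x2: float, factor: float = 1.5,
                   max_len: float = 256.0) -> tuple:
    # max_len stands in for the window length (img_shape[1] in the real code)
    center = (x1 + x2) / 2.0
    half = (x2 - x1) * factor / 2.0
    return max(center - half, 0.0), min(center + half, max_len)

# e.g. expand_segment(40, 80) -> (30.0, 90.0)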
-# base_batch_size = (8 GPUs) x (2 samples per GPU)
-auto_scale_lr = dict(base_batch_size=16)
-
-default_scope = 'mmdet'
-default_hooks = dict(
-    timer=dict(type='IterTimerHook'),
-    logger=dict(type='LoggerHook', interval=10),
-    param_scheduler=dict(type='ParamSchedulerHook'),
-    checkpoint=dict(type='CheckpointHook', interval=10),
-    sampler_seed=dict(type='DistSamplerSeedHook'),
-    visualization=dict(type='DetVisualizationHook'))
-
-env_cfg = dict(
-    cudnn_benchmark=False,
-    mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
-    dist_cfg=dict(backend='nccl'),
-)
-
-vis_backends = [dict(type='LocalVisBackend'), dict(type='TensorboardVisBackend')]
-visualizer = dict(
-    type='DetLocalVisualizer', vis_backends=vis_backends, name='visualizer')
-log_processor = dict(type='LogProcessor', window_size=50, by_epoch=True)
-
-log_level = 'INFO'
-load_from = None
-resume = False
diff --git a/configs/tadtr.py b/configs/tadtr.py
index 9f22685..fc85bae 100644
--- a/configs/tadtr.py
+++ b/configs/tadtr.py
@@ -68,33 +68,17 @@
     # num_feats=128, offset=-0.5 for DeformableDETR, but 256, 0 for TadTR, we cannot set to 256 because of y-axis
     positional_encoding=dict(num_feats=128, normalize=True, offset=0, temperature=temperature),
     bbox_head=dict(
-        type='MyRoIHead',
-        bbox_roi_extractor=dict(
-            type='GenericRoIExtractor',
-            aggregation='sum',
-            roi_layer=dict(
-                type='RoIAlign',
-                output_size=(1, 16),
-                sampling_ratio=0,
-                aligned=True),
-            out_channels=256,
-            featmap_strides=[1],
-            pre_cfg=None,
-            post_cfg=None),
-        expand_roi_factor=1.5,
-        actionness_loss=dict(type='L1Loss', loss_weight=act_loss_coef),
-        bbox_head=dict(
-            type='CustomDeformableDETRHead',
-            num_classes=20,
-            sync_cls_avg_factor=True,
-            loss_cls=dict(
-                type='FocalLoss',
-                use_sigmoid=True,
-                gamma=2.0,
-                alpha=0.25,
-                loss_weight=cls_loss_coef),
-            loss_bbox=dict(type='CustomL1Loss', loss_weight=seg_loss_coef),
-            loss_iou=dict(type='CustomIoULoss', mode='linear', loss_weight=iou_loss_coef))),  # -log(GIoU) for DeformableDETR
+        type='CustomDeformableDETRHead',
+        num_classes=20,
+        sync_cls_avg_factor=True,
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=cls_loss_coef),
+        loss_bbox=dict(type='CustomL1Loss', loss_weight=seg_loss_coef),
+        loss_iou=dict(type='CustomIoULoss', mode='linear', loss_weight=iou_loss_coef)),  # -log(GIoU) for DeformableDETR
     # training and testing settings
     train_cfg=dict(
         assigner=dict(
@@ -115,7 +100,7 @@
     optimizer=dict(
         type='AdamW',
         lr=lr,
-        weight_decay=0.0001), # 0.0001 by default
+        weight_decay=0.0001),  # 0.0001 by default
     clip_grad=dict(max_norm=0.1, norm_type=2),
     paramwise_cfg=dict(custom_keys={'backbone': dict(lr_mult=0.1),
                                     'sampling_offsets': dict(lr_mult=0.1),
                                     'reference_points': dict(lr_mult=0.1)
diff --git a/configs/tadtr_my.py b/configs/tadtr_my.py
index b0d4bdf..8fee421 100644
--- a/configs/tadtr_my.py
+++ b/configs/tadtr_my.py
@@ -36,10 +36,11 @@
         window_stride=192,  # overlap=0.25
         data_prefix=dict(feat='features/thumos_feat_ActionFormer_16input_4stride_2048/i3d_features')))
 
-# 3. Use multi-scale features via multi-level temporal 1d convolutions
+# 3. Use multi-level features via temporal 1d convolution layers
+# model setting
 model = dict(
     num_feature_levels=4,
-    backbone=dict(type='PseudoBackbone', multi_scale=False),
+    backbone=dict(type='PseudoBackbone', multi_scale=False),  # No backbone since we use pre-extracted features.
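# NOTE (editor): `DownSampler1D` (my_modules/neck/temporal_downsampler.py) is
# not shown in this diff. A sketch of the presumed idea -- strided temporal
# convolutions that turn the single feature level into the 4-level pyramid
# consumed by `num_levels=4` attention. Channel and kernel choices here are
# illustrative assumptions, not the repo's exact layers.
import torch
from torch import nn

class TemporalPyramid1D(nn.Module):
    def __init__(self, channels: int = 2048, num_levels: int = 4):
        super().__init__()
        self.downs = nn.ModuleList(
            nn.Conv1d(channels, channels, kernel_size=3, stride=2, padding=1)
            for _ in range(num_levels - 1))

    def forward(self, x: torch.Tensor) -> tuple:
        feats = [x]                        # level 0: full temporal resolution
        for down in self.downs:
            feats.append(down(feats[-1]))  # halve the temporal length per level
        return tuple(feats)                # lengths T, T/2, T/4, T/8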
neck=[ dict( type='DownSampler1D', @@ -55,14 +56,7 @@ out_channels=256, act_cfg=None, norm_cfg=dict(type='GN', num_groups=32), - num_outs=4) - # dict(type='FPN', - # in_channels=[2048, 512, 512, 512, 512, 512], - # out_channels=256, - # num_outs=6, - # conv_cfg=dict(type='Conv1d'), - # norm_cfg=dict(type='SyncBN')), - ], + num_outs=4)], encoder=dict(layer_cfg=dict(self_attn_cfg=dict(num_levels=4))), decoder=dict(layer_cfg=dict(cross_attn_cfg=dict(num_levels=4))) ) diff --git a/my_modules/detector/__init__.py b/my_modules/detector/__init__.py index 945a172..28964f0 100644 --- a/my_modules/detector/__init__.py +++ b/my_modules/detector/__init__.py @@ -1,5 +1,2 @@ -# from .base_detr import * -# from .deformable_detr import * -from .dino import * from .deformable_detr import * from .pseudo_backbone import * diff --git a/my_modules/detector/deformable_detr.py b/my_modules/detector/deformable_detr.py index 3b47eeb..2873935 100644 --- a/my_modules/detector/deformable_detr.py +++ b/my_modules/detector/deformable_detr.py @@ -1,11 +1,5 @@ -from typing import Dict, Tuple - -import torch from mmdet.models.detectors import DeformableDETR from mmdet.registry import MODELS -from mmdet.structures import OptSampleList -from torch import Tensor -from torch.nn import functional as F from my_modules.layers.pseudo_layers import Pseudo2DLinear from my_modules.loss.positional_encoding import PositionEmbeddingSine @@ -31,26 +25,3 @@ def _init_layers(self) -> None: **pos_cfg) if not self.as_two_stage: self.reference_points_fc = Pseudo2DLinear(self.embed_dims, 1) - - - def forward_transformer(self, - img_feats: Tuple[Tensor], - batch_data_samples: OptSampleList = None) -> Dict: - encoder_inputs_dict, decoder_inputs_dict = self.pre_transformer( - img_feats, batch_data_samples) - - encoder_outputs_dict = self.forward_encoder(**encoder_inputs_dict) - - tmp_dec_in, head_inputs_dict = self.pre_decoder(**encoder_outputs_dict) - decoder_inputs_dict.update(tmp_dec_in) - - decoder_outputs_dict = self.forward_decoder(**decoder_inputs_dict) - head_inputs_dict.update(decoder_outputs_dict) - - # louis: input encoder memory into2 the head for RoIAlign and actionness regression. - memory = encoder_outputs_dict['memory'] - level_start_index = decoder_inputs_dict['level_start_index'].cpu() - # memory [N, W, C] -> [N, C, W] -> [N, C, 1, W] -> split on last dimension (W) - mlvl_memory = torch.tensor_split(memory.transpose(1, 2).unsqueeze(2), level_start_index[1:], dim=-1) - head_inputs_dict['memory'] = mlvl_memory - return head_inputs_dict diff --git a/my_modules/detector/dino.py b/my_modules/detector/dino.py deleted file mode 100644 index 18c0748..0000000 --- a/my_modules/detector/dino.py +++ /dev/null @@ -1,66 +0,0 @@ -from typing import Dict, Tuple - -import torch -from mmdet.models.detectors import DINO -from mmdet.registry import MODELS -from mmdet.structures import OptSampleList -from torch import Tensor - - -@MODELS.register_module() -class CustomDINO(DINO): - """ - The Customized DINO that input memory (encoder output) into the head. - This customized DINO output memory into the head for the RoI purpose, - without any functional changes from the original DINO. - """ - - def forward_transformer( - self, - img_feats: Tuple[Tensor], - batch_data_samples: OptSampleList = None, - ) -> Dict: - """Forward process of Transformer. 
- - The forward procedure of the transformer is defined as: - 'pre_transformer' -> 'encoder' -> 'pre_decoder' -> 'decoder' - More details can be found at `TransformerDetector.forward_transformer` - in `mmdet/detector/base_detr.py`. - The difference is that the ground truth in `batch_data_samples` is - required for the `pre_decoder` to prepare the query of DINO. - Additionally, DINO inherits the `pre_transformer` method and the - `forward_encoder` method of DeformableDETR. More details about the - two methods can be found in `mmdet/detector/deformable_detr.py`. - - Args: - img_feats (tuple[Tensor]): Tuple of feature maps from neck. Each - feature map has shape (bs, dim, H, W). - batch_data_samples (list[:obj:`DetDataSample`]): The batch - data samples. It usually includes information such - as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. - Defaults to None. - - Returns: - dict: The dictionary of bbox_head function inputs, which always - includes the `hidden_states` of the decoder output and may contain - `references` including the initial and intermediate references. - """ - encoder_inputs_dict, decoder_inputs_dict = self.pre_transformer( - img_feats, batch_data_samples) - - encoder_outputs_dict = self.forward_encoder(**encoder_inputs_dict) - - tmp_dec_in, head_inputs_dict = self.pre_decoder( - **encoder_outputs_dict, batch_data_samples=batch_data_samples) - decoder_inputs_dict.update(tmp_dec_in) - - decoder_outputs_dict = self.forward_decoder(**decoder_inputs_dict) - head_inputs_dict.update(decoder_outputs_dict) - - # louis: input encoder memory into the head for RoIAlign and actionness regression. - memory = encoder_outputs_dict['memory'] - level_start_index = decoder_inputs_dict['level_start_index'].cpu() - # memory [N, W, C] -> [N, C, W] -> [N, C, 1, W] -> split on last dimension (W) - mlvl_memory = torch.tensor_split(memory.transpose(1, 2).unsqueeze(2), level_start_index[1:], dim=-1) - head_inputs_dict['memory'] = mlvl_memory - return head_inputs_dict diff --git a/my_modules/head/__init__.py b/my_modules/head/__init__.py index 4461f3a..6c96a0c 100644 --- a/my_modules/head/__init__.py +++ b/my_modules/head/__init__.py @@ -2,4 +2,3 @@ # from .detr_head import * from .dino_head import * from .deformable_detr_head import * -from .roi_head import * diff --git a/my_modules/head/deformable_detr_head.py b/my_modules/head/deformable_detr_head.py index 5209694..9bda616 100644 --- a/my_modules/head/deformable_detr_head.py +++ b/my_modules/head/deformable_detr_head.py @@ -1,10 +1,6 @@ -from typing import List, Tuple - from mmdet.models.dense_heads import DeformableDETRHead from mmdet.registry import MODELS -from mmdet.structures import SampleList -from mmdet.utils import InstanceList -from torch import nn, Tensor +from torch import nn from my_modules.layers.pseudo_layers import Pseudo4DLinear @@ -29,24 +25,3 @@ def init_weights(self) -> None: if self.as_two_stage: for m in self.reg_branches: nn.init.constant_(m[-1].bias.data[1:], 0.0) # [2:] -> [1:] - - def loss_and_predict( - self, - hidden_states: Tensor, references: List[Tensor], - enc_outputs_class: Tensor, enc_outputs_coord: Tensor, - batch_data_samples: SampleList, - rescale: bool = False) -> Tuple[dict, InstanceList]: - batch_gt_instances = [] - batch_img_metas = [] - for data_sample in batch_data_samples: - batch_img_metas.append(data_sample.metainfo) - batch_gt_instances.append(data_sample.gt_instances) - - outs = self(hidden_states, references) - loss_inputs = outs + (enc_outputs_class, enc_outputs_coord, - batch_gt_instances, 
batch_img_metas) - losses = self.loss_by_feat(*loss_inputs) - - predictions = self.predict_by_feat( - *outs, batch_img_metas=batch_img_metas, rescale=rescale) - return losses, predictions diff --git a/my_modules/head/roi_head.py b/my_modules/head/roi_head.py deleted file mode 100644 index 2d283ab..0000000 --- a/my_modules/head/roi_head.py +++ /dev/null @@ -1,136 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import warnings -from typing import List - -import torch -from mmdet.registry import MODELS -from mmdet.structures import DetDataSample -from mmdet.structures.bbox import bbox2roi, bbox_overlaps -from mmdet.utils import ConfigType, InstanceList, OptMultiConfig -from mmengine.model import BaseModule -from torch import nn - - -@MODELS.register_module() -class MyRoIHead(BaseModule): - """Simplest base roi head including one bbox head and one mask head.""" - - def __init__(self, - bbox_roi_extractor: ConfigType, - actionness_loss: ConfigType, - bbox_head: ConfigType, - init_cfg: OptMultiConfig = None, - expand_roi_factor=1.5, - active=True, # experimental arguments, set to False can deactivate the RoI - **kwargs) -> None: - super().__init__(init_cfg=init_cfg) - self.active = active - if active: - self.bbox_roi_extractor = MODELS.build(bbox_roi_extractor) - self.roi_size = bbox_roi_extractor['roi_layer']['output_size'][1] - self.dim = bbox_roi_extractor['out_channels'] - self.expand_roi_factor = expand_roi_factor - self.actionness_fc = nn.Sequential( - nn.Flatten(start_dim=1), - nn.Linear(self.roi_size * self.dim, self.dim), - nn.ReLU(inplace=True), - nn.Linear(self.dim, self.dim), - nn.ReLU(inplace=True), - nn.Linear(self.dim, 1), - nn.Sigmoid()) - self.actionness_loss = MODELS.build(actionness_loss) - else: - warnings.warn("Please note that the RoIHead is now deactivated, no RoI will be applied") - - bbox_head.update(kwargs) - self.bbox_head = MODELS.build(bbox_head) - - @property - def num_classes(self): - return self.bbox_head.num_classes - - @property - def reg_branches(self): - return self.bbox_head.reg_branches - - @property - def cls_branches(self): - return self.bbox_head.cls_branches - - def forward(self, *args, **kwargs) -> tuple: - return self.bbox_head(*args, **kwargs) - - def loss(self, batch_data_samples: List[DetDataSample], **head_inputs_dict) -> dict: - memory = head_inputs_dict.pop('memory') - - if self.active: - bbox_head_loss, bbox_pred = self.bbox_head.loss_and_predict(batch_data_samples=batch_data_samples, - rescale=False, - **head_inputs_dict) - actionness_pred = self.actionness_forward(memory, bbox_pred, batch_data_samples) - actionness_target = self.get_actionness_target(bbox_pred, batch_data_samples) - actionness_loss = self.actionness_loss(actionness_pred.reshape(-1), - torch.cat(actionness_target, 0)) - bbox_head_loss.update(dict(actionness_loss=actionness_loss)) - else: - bbox_head_loss = self.bbox_head.loss(batch_data_samples=batch_data_samples, - **head_inputs_dict) - - return bbox_head_loss - - def predict(self, batch_data_samples: List[DetDataSample], rescale, **head_inputs_dict) -> InstanceList: - memory = head_inputs_dict.pop('memory') - bbox_pred = self.bbox_head.predict(batch_data_samples=batch_data_samples, - rescale=False, - **head_inputs_dict) - if self.active: - actionness_pred = self.actionness_forward(memory, bbox_pred, batch_data_samples).reshape(len(bbox_pred), -1) - else: - actionness_pred = [pred.scores for pred in bbox_pred] - return self.post_process(bbox_pred, actionness_pred, batch_data_samples, rescale) - - def 
actionness_forward(self, memory, bbox_pred, batch_data_samples): - # Expand the range of (x1, x2) - ex_bbox_pred = [res.bboxes.clone().detach() for res in bbox_pred] - for bboxes, data_sample in zip(ex_bbox_pred, batch_data_samples): - max_len = data_sample.metainfo['img_shape'][1] - length = bboxes[:, 2] - bboxes[:, 0] - center = (bboxes[:, 2] + bboxes[:, 0]) / 2 - bboxes[:, 0] = (center - length * self.expand_roi_factor / 2).clamp(min=0, max=max_len) - bboxes[:, 2] = (center + length * self.expand_roi_factor / 2).clamp(min=0, max=max_len) - - # actionness regression prediction - rois = bbox2roi(ex_bbox_pred).detach() - bbox_feats = self.bbox_roi_extractor(memory[:self.bbox_roi_extractor.num_inputs], rois) - actionness_pred = self.actionness_fc(bbox_feats) - return actionness_pred - - @staticmethod - def get_actionness_target(bbox_pred, batch_data_samples): - batch_bboxes = [res.bboxes for res in bbox_pred] - batch_gt_bboxes = [data_sample.gt_instances.bboxes for data_sample in batch_data_samples] - - # Fix the y1 y2 - for res in bbox_pred: - res.bboxes[:, 1] = 0.1 - res.bboxes[:, 3] = 0.9 - - actionness_target = [] - for bboxes, gt_bboxes in zip(batch_bboxes, batch_gt_bboxes): - iou_mat = bbox_overlaps(bboxes, gt_bboxes, mode='iou', is_aligned=False) - gt_iou = iou_mat.max(dim=1)[0] - actionness_target.append(gt_iou.detach()) - - return actionness_target - - @staticmethod - def post_process(bbox_pred, actionness_pred, batch_data_samples, rescale): - for pred, data_sample, actionness in zip(bbox_pred, batch_data_samples, actionness_pred): - img_meta = data_sample.metainfo - if rescale: - assert img_meta.get('scale_factor') is not None - pred.bboxes /= pred.bboxes.new_tensor( - img_meta['scale_factor']).repeat((1, 2)) - # using actionness regression results as confidence scores of bboxes instead of classification score - pred.scores = torch.sqrt(pred.scores * actionness) - return bbox_pred diff --git a/my_modules/layers/__init__.py b/my_modules/layers/__init__.py index 3d6a90d..cb911c3 100644 --- a/my_modules/layers/__init__.py +++ b/my_modules/layers/__init__.py @@ -1,3 +1,2 @@ from .custom_layers import * -from .channel_mapper import * from .pseudo_layers import * diff --git a/my_modules/neck/__init__.py b/my_modules/neck/__init__.py index 709bada..80085d3 100644 --- a/my_modules/neck/__init__.py +++ b/my_modules/neck/__init__.py @@ -1 +1,2 @@ from .temporal_downsampler import * +from .channel_mapper import * diff --git a/my_modules/layers/channel_mapper.py b/my_modules/neck/channel_mapper.py similarity index 100% rename from my_modules/layers/channel_mapper.py rename to my_modules/neck/channel_mapper.py
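# NOTE (editor): for reference, the two pieces of behavior this diff removes,
# in standalone form. First, the deleted forward_transformer methods split the
# flattened encoder memory back into per-level feature maps with
# torch.tensor_split before handing it to MyRoIHead; second,
# MyRoIHead.post_process fused classification and actionness scores
# geometrically. Both snippets are grounded in the deleted code above:
import torch

N, W, C = 2, 14, 256                          # batch, summed level lengths, dims
memory = torch.randn(N, W, C)                 # flattened encoder output
level_start_index = torch.tensor([0, 8, 12])  # e.g. levels of length 8, 4, 2

# [N, W, C] -> [N, C, W] -> [N, C, 1, W], then split on W at the level borders
mlvl_memory = torch.tensor_split(
    memory.transpose(1, 2).unsqueeze(2), level_start_index[1:].cpu(), dim=-1)
assert [m.shape[-1] for m in mlvl_memory] == [8, 4, 2]

# final confidence = sqrt(cls_score * actionness), replacing the pure cls score
cls_score, actionness = torch.rand(100), torch.rand(100)
scores = torch.sqrt(cls_score * actionness)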