-
Notifications
You must be signed in to change notification settings - Fork 1.1k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[Feature] Add Tokens-to-Token ViT backbone and converted checkpoints. (…
…#467) * add t2t backbone * register t2t_vit * add t2t_vit config * [Temp] Align posterize transform with timm. * Fix lint * Refactor t2t-vit * Add config for t2t-vit * Add metafile and README for t2t-vit * Add unit tests * configs * Update metafile and README * Improve docstring * Fix batch size which should be 8x64 instead of 8x128 * Fix typo * Update model zoo * Update training augments config. * Move some arguments of T2TModule to T2TViT * Update docs. * Update unit test Co-authored-by: HIT-cwh <[email protected]>
- Loading branch information
Showing
15 changed files
with
844 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
# ImageNet data config at 224x224 input with 64 samples per GPU. | ||
# Training uses RandomResizedCrop, horizontal flip, RandAugment and | ||
# RandomErasing; testing uses Resize followed by a 224 CenterCrop. | ||
_base_ = ['./pipelines/rand_aug.py'] | ||
|
||
# dataset settings | ||
dataset_type = 'ImageNet' | ||
# Per-channel mean/std consumed by the Normalize step of both pipelines. | ||
img_norm_cfg = dict( | ||
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) | ||
|
||
train_pipeline = [ | ||
dict(type='LoadImageFromFile'), | ||
dict( | ||
type='RandomResizedCrop', | ||
size=224, | ||
backend='pillow', | ||
interpolation='bicubic'), | ||
dict(type='RandomFlip', flip_prob=0.5, direction='horizontal'), | ||
dict( | ||
type='RandAugment', | ||
policies={{_base_.rand_increasing_policies}}, | ||
num_policies=2, | ||
total_level=10, | ||
magnitude_level=9, | ||
magnitude_std=0.5, | ||
# NOTE(review): [::-1] reverses the channel order of the mean values, | ||
# presumably RGB -> BGR to match the loaded image; confirm. | ||
hparams=dict( | ||
pad_val=[round(x) for x in img_norm_cfg['mean'][::-1]], | ||
interpolation='bicubic')), | ||
# RandomErasing runs before Normalize, so its fill values are taken from | ||
# the (channel-reversed) un-normalized mean/std. | ||
dict( | ||
type='RandomErasing', | ||
erase_prob=0.25, | ||
mode='rand', | ||
min_area_ratio=0.02, | ||
max_area_ratio=1 / 3, | ||
fill_color=img_norm_cfg['mean'][::-1], | ||
fill_std=img_norm_cfg['std'][::-1]), | ||
dict(type='Normalize', **img_norm_cfg), | ||
dict(type='ImageToTensor', keys=['img']), | ||
dict(type='ToTensor', keys=['gt_label']), | ||
dict(type='Collect', keys=['img', 'gt_label']) | ||
] | ||
|
||
# NOTE(review): size=(248, -1) presumably resizes the short edge to 248 and | ||
# keeps the aspect ratio; confirm against the Resize transform. | ||
test_pipeline = [ | ||
dict(type='LoadImageFromFile'), | ||
dict( | ||
type='Resize', | ||
size=(248, -1), | ||
backend='pillow', | ||
interpolation='bicubic'), | ||
dict(type='CenterCrop', crop_size=224), | ||
dict(type='Normalize', **img_norm_cfg), | ||
dict(type='ImageToTensor', keys=['img']), | ||
dict(type='Collect', keys=['img']) | ||
] | ||
# The val and test entries below point at the same images and annotation | ||
# file and both use test_pipeline. | ||
data = dict( | ||
samples_per_gpu=64, | ||
workers_per_gpu=4, | ||
train=dict( | ||
type=dataset_type, | ||
data_prefix='data/imagenet/train', | ||
pipeline=train_pipeline), | ||
val=dict( | ||
type=dataset_type, | ||
data_prefix='data/imagenet/val', | ||
ann_file='data/imagenet/meta/val.txt', | ||
pipeline=test_pipeline), | ||
test=dict( | ||
# replace `data/val` with `data/test` for standard test | ||
type=dataset_type, | ||
data_prefix='data/imagenet/val', | ||
ann_file='data/imagenet/meta/val.txt', | ||
pipeline=test_pipeline)) | ||
|
||
# Report the accuracy metric with an evaluation interval of 10. | ||
evaluation = dict(interval=10, metric='accuracy') |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
# model settings: T2T_ViT backbone (embed_dims=384, num_layers=14, | ||
# num_heads=6) with a VisionTransformerClsHead and label-smoothing loss. | ||
# embed_dims is shared by the backbone width, the feed-forward width | ||
# (3 * embed_dims) and the head input channels. | ||
embed_dims = 384 | ||
num_classes = 1000 | ||
|
||
model = dict( | ||
type='ImageClassifier', | ||
backbone=dict( | ||
type='T2T_ViT', | ||
img_size=224, | ||
in_channels=3, | ||
embed_dims=embed_dims, | ||
t2t_cfg=dict( | ||
token_dims=64, | ||
use_performer=False, | ||
), | ||
num_layers=14, | ||
layer_cfgs=dict( | ||
num_heads=6, | ||
feedforward_channels=3 * embed_dims, # mlp_ratio = 3 | ||
), | ||
drop_path_rate=0.1, | ||
# Linear layers: truncated-normal init (std=0.02); LayerNorm: constant | ||
# init (val=1, bias=0). | ||
init_cfg=[ | ||
dict(type='TruncNormal', layer='Linear', std=.02), | ||
dict(type='Constant', layer='LayerNorm', val=1., bias=0.), | ||
]), | ||
neck=None, | ||
head=dict( | ||
type='VisionTransformerClsHead', | ||
num_classes=num_classes, | ||
in_channels=embed_dims, | ||
loss=dict( | ||
type='LabelSmoothLoss', | ||
label_smooth_val=0.1, | ||
mode='original', | ||
), | ||
topk=(1, 5), | ||
init_cfg=dict(type='TruncNormal', layer='Linear', std=.02)), | ||
# Batch augmentations: Mixup (alpha=0.8) and CutMix (alpha=1.0), each | ||
# configured with prob=0.5. | ||
train_cfg=dict(augments=[ | ||
dict(type='BatchMixup', alpha=0.8, prob=0.5, num_classes=num_classes), | ||
dict(type='BatchCutMix', alpha=1.0, prob=0.5, num_classes=num_classes), | ||
])) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
# model settings: T2T_ViT backbone (embed_dims=448, num_layers=19, | ||
# num_heads=7) with a VisionTransformerClsHead and label-smoothing loss. | ||
# embed_dims is shared by the backbone width, the feed-forward width | ||
# (3 * embed_dims) and the head input channels. | ||
embed_dims = 448 | ||
num_classes = 1000 | ||
|
||
model = dict( | ||
type='ImageClassifier', | ||
backbone=dict( | ||
type='T2T_ViT', | ||
img_size=224, | ||
in_channels=3, | ||
embed_dims=embed_dims, | ||
t2t_cfg=dict( | ||
token_dims=64, | ||
use_performer=False, | ||
), | ||
num_layers=19, | ||
layer_cfgs=dict( | ||
num_heads=7, | ||
feedforward_channels=3 * embed_dims, # mlp_ratio = 3 | ||
), | ||
drop_path_rate=0.1, | ||
# Linear layers: truncated-normal init (std=0.02); LayerNorm: constant | ||
# init (val=1, bias=0). | ||
init_cfg=[ | ||
dict(type='TruncNormal', layer='Linear', std=.02), | ||
dict(type='Constant', layer='LayerNorm', val=1., bias=0.), | ||
]), | ||
neck=None, | ||
head=dict( | ||
type='VisionTransformerClsHead', | ||
num_classes=num_classes, | ||
in_channels=embed_dims, | ||
loss=dict( | ||
type='LabelSmoothLoss', | ||
label_smooth_val=0.1, | ||
mode='original', | ||
), | ||
topk=(1, 5), | ||
init_cfg=dict(type='TruncNormal', layer='Linear', std=.02)), | ||
# Batch augmentations: Mixup (alpha=0.8) and CutMix (alpha=1.0), each | ||
# configured with prob=0.5. | ||
train_cfg=dict(augments=[ | ||
dict(type='BatchMixup', alpha=0.8, prob=0.5, num_classes=num_classes), | ||
dict(type='BatchCutMix', alpha=1.0, prob=0.5, num_classes=num_classes), | ||
])) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
# model settings: T2T_ViT backbone (embed_dims=512, num_layers=24, | ||
# num_heads=8) with a VisionTransformerClsHead and label-smoothing loss. | ||
# embed_dims is shared by the backbone width, the feed-forward width | ||
# (3 * embed_dims) and the head input channels. | ||
embed_dims = 512 | ||
num_classes = 1000 | ||
|
||
model = dict( | ||
type='ImageClassifier', | ||
backbone=dict( | ||
type='T2T_ViT', | ||
img_size=224, | ||
in_channels=3, | ||
embed_dims=embed_dims, | ||
t2t_cfg=dict( | ||
token_dims=64, | ||
use_performer=False, | ||
), | ||
num_layers=24, | ||
layer_cfgs=dict( | ||
num_heads=8, | ||
feedforward_channels=3 * embed_dims, # mlp_ratio = 3 | ||
), | ||
drop_path_rate=0.1, | ||
# Linear layers: truncated-normal init (std=0.02); LayerNorm: constant | ||
# init (val=1, bias=0). | ||
init_cfg=[ | ||
dict(type='TruncNormal', layer='Linear', std=.02), | ||
dict(type='Constant', layer='LayerNorm', val=1., bias=0.), | ||
]), | ||
neck=None, | ||
head=dict( | ||
type='VisionTransformerClsHead', | ||
num_classes=num_classes, | ||
in_channels=embed_dims, | ||
loss=dict( | ||
type='LabelSmoothLoss', | ||
label_smooth_val=0.1, | ||
mode='original', | ||
), | ||
topk=(1, 5), | ||
init_cfg=dict(type='TruncNormal', layer='Linear', std=.02)), | ||
# Batch augmentations: Mixup (alpha=0.8) and CutMix (alpha=1.0), each | ||
# configured with prob=0.5. | ||
train_cfg=dict(augments=[ | ||
dict(type='BatchMixup', alpha=0.8, prob=0.5, num_classes=num_classes), | ||
dict(type='BatchCutMix', alpha=1.0, prob=0.5, num_classes=num_classes), | ||
])) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
# Tokens-to-Token ViT: Training Vision Transformers from Scratch on ImageNet | ||
<!-- {Tokens-to-Token ViT} --> | ||
|
||
## Introduction | ||
|
||
<!-- [ALGORITHM] --> | ||
|
||
```latex | ||
@article{yuan2021tokens, | ||
title={Tokens-to-token vit: Training vision transformers from scratch on imagenet}, | ||
author={Yuan, Li and Chen, Yunpeng and Wang, Tao and Yu, Weihao and Shi, Yujun and Tay, Francis EH and Feng, Jiashi and Yan, Shuicheng}, | ||
journal={arXiv preprint arXiv:2101.11986}, | ||
year={2021} | ||
} | ||
``` | ||
|
||
## Pretrained models | ||
|
||
The pre-trained models are converted from the [official repo](https://github.com/yitu-opensource/T2T-ViT/tree/main#2-t2t-vit-models). | ||
|
||
### ImageNet-1k | ||
|
||
| Model | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download | Log | | ||
|:--------------:|:---------:|:--------:|:---------:|:---------:|:------:|:--------:|:---:| | ||
| T2T-ViT_t-14\* | 21.47 | 4.34 | 81.69 | 95.85 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/t2t_vit/t2t-vit-t-14_8xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/t2t-vit/t2t-vit-t-14_3rdparty_8xb64_in1k_20210928-b7c09b62.pth) | [log]()| | ||
| T2T-ViT_t-19\* | 39.08 | 7.80 | 82.43 | 96.08 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/t2t_vit/t2t-vit-t-19_8xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/t2t-vit/t2t-vit-t-19_3rdparty_8xb64_in1k_20210928-7f1478d5.pth) | [log]()| | ||
| T2T-ViT_t-24\* | 64.00 | 12.69 | 82.55 | 96.06 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/t2t_vit/t2t-vit-t-24_8xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/t2t-vit/t2t-vit-t-24_3rdparty_8xb64_in1k_20210928-fe95a61b.pth) | [log]()| | ||
|
||
*Models with \* are converted from other repos.* | ||
|
||
## Results and models | ||
|
||
To be added. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
Collections: | ||
- Name: Tokens-to-Token ViT | ||
Metadata: | ||
Training Data: ImageNet-1k | ||
Architecture: | ||
- Layer Normalization | ||
- Scaled Dot-Product Attention | ||
- Attention Dropout | ||
- Dropout | ||
- Tokens to Token | ||
Paper: | ||
URL: https://arxiv.org/abs/2101.11986 | ||
Title: "Tokens-to-Token ViT: Training Vision Transformers from Scratch on ImageNet" | ||
README: configs/t2t_vit/README.md | ||
|
||
Models: | ||
- Name: t2t-vit-t-14_3rdparty_8xb64_in1k | ||
Metadata: | ||
FLOPs: 4340000000 | ||
Parameters: 21470000 | ||
In Collection: Tokens-to-Token ViT | ||
Results: | ||
- Dataset: ImageNet-1k | ||
Metrics: | ||
Top 1 Accuracy: 81.69 | ||
Top 5 Accuracy: 95.85 | ||
Task: Image Classification | ||
Weights: https://download.openmmlab.com/mmclassification/v0/t2t-vit/t2t-vit-t-14_3rdparty_8xb64_in1k_20210928-b7c09b62.pth | ||
Converted From: | ||
Weights: https://github.com/yitu-opensource/T2T-ViT/releases/download/main/81.7_T2T_ViTt_14.pth.tar | ||
Code: https://github.com/yitu-opensource/T2T-ViT/blob/main/models/t2t_vit.py#L243 | ||
Config: configs/t2t_vit/t2t-vit-t-14_8xb64_in1k.py | ||
- Name: t2t-vit-t-19_3rdparty_8xb64_in1k | ||
Metadata: | ||
FLOPs: 7800000000 | ||
Parameters: 39080000 | ||
In Collection: Tokens-to-Token ViT | ||
Results: | ||
- Dataset: ImageNet-1k | ||
Metrics: | ||
Top 1 Accuracy: 82.43 | ||
Top 5 Accuracy: 96.08 | ||
Task: Image Classification | ||
Weights: https://download.openmmlab.com/mmclassification/v0/t2t-vit/t2t-vit-t-19_3rdparty_8xb64_in1k_20210928-7f1478d5.pth | ||
Converted From: | ||
Weights: https://github.com/yitu-opensource/T2T-ViT/releases/download/main/82.4_T2T_ViTt_19.pth.tar | ||
Code: https://github.com/yitu-opensource/T2T-ViT/blob/main/models/t2t_vit.py#L254 | ||
Config: configs/t2t_vit/t2t-vit-t-19_8xb64_in1k.py | ||
- Name: t2t-vit-t-24_3rdparty_8xb64_in1k | ||
Metadata: | ||
FLOPs: 12690000000 | ||
Parameters: 64000000 | ||
In Collection: Tokens-to-Token ViT | ||
Results: | ||
- Dataset: ImageNet-1k | ||
Metrics: | ||
Top 1 Accuracy: 82.55 | ||
Top 5 Accuracy: 96.06 | ||
Task: Image Classification | ||
Weights: https://download.openmmlab.com/mmclassification/v0/t2t-vit/t2t-vit-t-24_3rdparty_8xb64_in1k_20210928-fe95a61b.pth | ||
Converted From: | ||
Weights: https://github.com/yitu-opensource/T2T-ViT/releases/download/main/82.6_T2T_ViTt_24.pth.tar | ||
Code: https://github.com/yitu-opensource/T2T-ViT/blob/main/models/t2t_vit.py#L265 | ||
Config: configs/t2t_vit/t2t-vit-t-24_8xb64_in1k.py |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
# Training config combining the t2t-vit-t-14 model, the T2T ImageNet | ||
# dataset settings and the default runtime: AdamW optimizer, CosineAnnealing | ||
# lr policy with linear warmup, 310 epochs. | ||
_base_ = [ | ||
'../_base_/models/t2t-vit-t-14.py', | ||
'../_base_/datasets/imagenet_bs64_t2t_224.py', | ||
'../_base_/default_runtime.py', | ||
] | ||
|
||
# optimizer | ||
# Weight decay is disabled (multiplier 0.0) for bias parameters and for the | ||
# backbone cls_token. | ||
paramwise_cfg = dict( | ||
bias_decay_mult=0.0, | ||
custom_keys={'.backbone.cls_token': dict(decay_mult=0.0)}, | ||
) | ||
optimizer = dict( | ||
type='AdamW', | ||
lr=5e-4, | ||
weight_decay=0.05, | ||
paramwise_cfg=paramwise_cfg, | ||
) | ||
optimizer_config = dict(grad_clip=None) | ||
|
||
# learning policy | ||
# FIXME: the intended schedule is CosineAnnealing for the first 300 epochs, | ||
# then lr fixed at min_lr for the last 10 epochs; confirm that this | ||
# lr_config reproduces it. | ||
lr_config = dict( | ||
policy='CosineAnnealing', | ||
min_lr=1e-5, | ||
by_epoch=True, | ||
warmup_by_epoch=True, | ||
warmup='linear', | ||
warmup_iters=10, | ||
warmup_ratio=1e-6) | ||
runner = dict(type='EpochBasedRunner', max_epochs=310) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
# Training config combining the t2t-vit-t-19 model, the T2T ImageNet | ||
# dataset settings and the default runtime: AdamW optimizer, CosineAnnealing | ||
# lr policy with linear warmup, 310 epochs. | ||
_base_ = [ | ||
'../_base_/models/t2t-vit-t-19.py', | ||
'../_base_/datasets/imagenet_bs64_t2t_224.py', | ||
'../_base_/default_runtime.py', | ||
] | ||
|
||
# optimizer | ||
# Weight decay is disabled (multiplier 0.0) for bias parameters and for the | ||
# backbone cls_token. | ||
paramwise_cfg = dict( | ||
bias_decay_mult=0.0, | ||
custom_keys={'.backbone.cls_token': dict(decay_mult=0.0)}, | ||
) | ||
optimizer = dict( | ||
type='AdamW', | ||
lr=5e-4, | ||
weight_decay=0.065, | ||
paramwise_cfg=paramwise_cfg, | ||
) | ||
optimizer_config = dict(grad_clip=None) | ||
|
||
# learning policy | ||
# FIXME: the intended schedule is CosineAnnealing for the first 300 epochs, | ||
# then lr fixed at min_lr for the last 10 epochs; confirm that this | ||
# lr_config reproduces it. | ||
lr_config = dict( | ||
policy='CosineAnnealing', | ||
min_lr=1e-5, | ||
by_epoch=True, | ||
warmup_by_epoch=True, | ||
warmup='linear', | ||
warmup_iters=10, | ||
warmup_ratio=1e-6) | ||
runner = dict(type='EpochBasedRunner', max_epochs=310) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
# Training config combining the t2t-vit-t-24 model, the T2T ImageNet | ||
# dataset settings and the default runtime: AdamW optimizer, CosineAnnealing | ||
# lr policy with linear warmup, 310 epochs. | ||
_base_ = [ | ||
'../_base_/models/t2t-vit-t-24.py', | ||
'../_base_/datasets/imagenet_bs64_t2t_224.py', | ||
'../_base_/default_runtime.py', | ||
] | ||
|
||
# optimizer | ||
# Weight decay is disabled (multiplier 0.0) for bias parameters and for the | ||
# backbone cls_token. | ||
paramwise_cfg = dict( | ||
bias_decay_mult=0.0, | ||
custom_keys={'.backbone.cls_token': dict(decay_mult=0.0)}, | ||
) | ||
optimizer = dict( | ||
type='AdamW', | ||
lr=5e-4, | ||
weight_decay=0.065, | ||
paramwise_cfg=paramwise_cfg, | ||
) | ||
optimizer_config = dict(grad_clip=None) | ||
|
||
# learning policy | ||
# FIXME: the intended schedule is CosineAnnealing for the first 300 epochs, | ||
# then lr fixed at min_lr for the last 10 epochs; confirm that this | ||
# lr_config reproduces it. | ||
lr_config = dict( | ||
policy='CosineAnnealing', | ||
min_lr=1e-5, | ||
by_epoch=True, | ||
warmup_by_epoch=True, | ||
warmup='linear', | ||
warmup_iters=10, | ||
warmup_ratio=1e-6) | ||
runner = dict(type='EpochBasedRunner', max_epochs=310) |
Oops, something went wrong.