Skip to content

Commit

Permalink
[Feature] Add Tokens-to-Token ViT backbone and converted checkpoints. (
Browse files Browse the repository at this point in the history
…#467)

* add t2t backbone

* register t2t_vit

* add t2t_vit config

* [Temp] Align posterize transform with timm.

* Fix lint

* Refactor t2t-vit

* Add config for t2t-vit

* Add metafile and README for t2t-vit

* Add unit tests

* configs

* Update metafile and README

* Improve docstring

* Fix batch size which should be 8x64 instead of 8x128

* Fix typo

* Update model zoo

* Update training augments config.

* Move some arguments of T2TModule to T2TViT

* Update docs.

* Update unit test

Co-authored-by: HIT-cwh <[email protected]>
  • Loading branch information
mzr1996 and HIT-cwh authored Oct 29, 2021
1 parent 2ce5825 commit fffa30d
Show file tree
Hide file tree
Showing 15 changed files with 844 additions and 2 deletions.
71 changes: 71 additions & 0 deletions configs/_base_/datasets/imagenet_bs64_t2t_224.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
# Dataset settings for ImageNet at 224x224 with 64 samples per GPU.
# `rand_increasing_policies` is taken from the base file listed here.
_base_ = ['./pipelines/rand_aug.py']

# dataset settings
dataset_type = 'ImageNet'
# Per-channel statistics forwarded to every `Normalize` step below.
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)

# Training pipeline: load -> random resized crop -> flip -> RandAugment ->
# random erasing -> normalize -> tensor conversion -> collect.
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='RandomResizedCrop',
        size=224,
        backend='pillow',
        interpolation='bicubic'),
    dict(type='RandomFlip', flip_prob=0.5, direction='horizontal'),
    dict(
        type='RandAugment',
        # NOTE: `{{_base_.<name>}}` is not plain Python; presumably the config
        # loader substitutes it with the value from the `_base_` file --
        # confirm against the loader documentation.
        policies={{_base_.rand_increasing_policies}},
        num_policies=2,
        total_level=10,
        magnitude_level=9,
        magnitude_std=0.5,
        hparams=dict(
            # Padding value: the normalization means, reversed and rounded.
            # Presumably reversed because the image is still in BGR channel
            # order at this point (`to_rgb=True` is applied later) -- confirm.
            pad_val=[round(x) for x in img_norm_cfg['mean'][::-1]],
            interpolation='bicubic')),
    dict(
        type='RandomErasing',
        erase_prob=0.25,
        mode='rand',
        min_area_ratio=0.02,
        max_area_ratio=1 / 3,
        # Fill statistics reuse the normalization mean/std in reversed
        # channel order.
        fill_color=img_norm_cfg['mean'][::-1],
        fill_std=img_norm_cfg['std'][::-1]),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='ImageToTensor', keys=['img']),
    dict(type='ToTensor', keys=['gt_label']),
    dict(type='Collect', keys=['img', 'gt_label'])
]

# Evaluation pipeline: resize to 248 (second size entry is -1), then take a
# 224 center crop; no random augmentation and no label collection.
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='Resize',
        size=(248, -1),
        backend='pillow',
        interpolation='bicubic'),
    dict(type='CenterCrop', crop_size=224),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='ImageToTensor', keys=['img']),
    dict(type='Collect', keys=['img'])
]
data = dict(
    samples_per_gpu=64,
    workers_per_gpu=4,
    train=dict(
        type=dataset_type,
        data_prefix='data/imagenet/train',
        pipeline=train_pipeline),
    val=dict(
        type=dataset_type,
        data_prefix='data/imagenet/val',
        ann_file='data/imagenet/meta/val.txt',
        pipeline=test_pipeline),
    test=dict(
        # The test split points at the validation data by default;
        # replace `data/val` with `data/test` for standard test
        type=dataset_type,
        data_prefix='data/imagenet/val',
        ann_file='data/imagenet/meta/val.txt',
        pipeline=test_pipeline))

# Report the 'accuracy' metric with an evaluation interval of 10.
evaluation = dict(interval=10, metric='accuracy')
41 changes: 41 additions & 0 deletions configs/_base_/models/t2t-vit-t-14.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Model settings: T2T-ViT backbone with 14 layers and a ViT classification
# head.
embed_dims = 384  # shared by the backbone and the head's `in_channels`
num_classes = 1000  # shared by the head and the batch augmentations

model = {
    'type': 'ImageClassifier',
    'backbone': {
        'type': 'T2T_ViT',
        'img_size': 224,
        'in_channels': 3,
        'embed_dims': embed_dims,
        't2t_cfg': {
            'token_dims': 64,
            'use_performer': False,
        },
        'num_layers': 14,
        'layer_cfgs': {
            'num_heads': 6,
            'feedforward_channels': 3 * embed_dims,  # mlp_ratio = 3
        },
        'drop_path_rate': 0.1,
        'init_cfg': [
            {'type': 'TruncNormal', 'layer': 'Linear', 'std': 0.02},
            {
                'type': 'Constant',
                'layer': 'LayerNorm',
                'val': 1.0,
                'bias': 0.0,
            },
        ],
    },
    'neck': None,
    'head': {
        'type': 'VisionTransformerClsHead',
        'num_classes': num_classes,
        'in_channels': embed_dims,
        'loss': {
            'type': 'LabelSmoothLoss',
            'label_smooth_val': 0.1,
            'mode': 'original',
        },
        'topk': (1, 5),
        'init_cfg': {'type': 'TruncNormal', 'layer': 'Linear', 'std': 0.02},
    },
    # Two batch-level augmentations, each configured with prob=0.5.
    'train_cfg': {
        'augments': [
            {
                'type': 'BatchMixup',
                'alpha': 0.8,
                'prob': 0.5,
                'num_classes': num_classes,
            },
            {
                'type': 'BatchCutMix',
                'alpha': 1.0,
                'prob': 0.5,
                'num_classes': num_classes,
            },
        ],
    },
}
41 changes: 41 additions & 0 deletions configs/_base_/models/t2t-vit-t-19.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Model settings: T2T-ViT backbone with 19 layers and a ViT classification
# head.
embed_dims = 448  # shared by the backbone and the head's `in_channels`
num_classes = 1000  # shared by the head and the batch augmentations

model = {
    'type': 'ImageClassifier',
    'backbone': {
        'type': 'T2T_ViT',
        'img_size': 224,
        'in_channels': 3,
        'embed_dims': embed_dims,
        't2t_cfg': {
            'token_dims': 64,
            'use_performer': False,
        },
        'num_layers': 19,
        'layer_cfgs': {
            'num_heads': 7,
            'feedforward_channels': 3 * embed_dims,  # mlp_ratio = 3
        },
        'drop_path_rate': 0.1,
        'init_cfg': [
            {'type': 'TruncNormal', 'layer': 'Linear', 'std': 0.02},
            {
                'type': 'Constant',
                'layer': 'LayerNorm',
                'val': 1.0,
                'bias': 0.0,
            },
        ],
    },
    'neck': None,
    'head': {
        'type': 'VisionTransformerClsHead',
        'num_classes': num_classes,
        'in_channels': embed_dims,
        'loss': {
            'type': 'LabelSmoothLoss',
            'label_smooth_val': 0.1,
            'mode': 'original',
        },
        'topk': (1, 5),
        'init_cfg': {'type': 'TruncNormal', 'layer': 'Linear', 'std': 0.02},
    },
    # Two batch-level augmentations, each configured with prob=0.5.
    'train_cfg': {
        'augments': [
            {
                'type': 'BatchMixup',
                'alpha': 0.8,
                'prob': 0.5,
                'num_classes': num_classes,
            },
            {
                'type': 'BatchCutMix',
                'alpha': 1.0,
                'prob': 0.5,
                'num_classes': num_classes,
            },
        ],
    },
}
41 changes: 41 additions & 0 deletions configs/_base_/models/t2t-vit-t-24.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Model settings: T2T-ViT backbone with 24 layers and a ViT classification
# head.
embed_dims = 512  # shared by the backbone and the head's `in_channels`
num_classes = 1000  # shared by the head and the batch augmentations

model = {
    'type': 'ImageClassifier',
    'backbone': {
        'type': 'T2T_ViT',
        'img_size': 224,
        'in_channels': 3,
        'embed_dims': embed_dims,
        't2t_cfg': {
            'token_dims': 64,
            'use_performer': False,
        },
        'num_layers': 24,
        'layer_cfgs': {
            'num_heads': 8,
            'feedforward_channels': 3 * embed_dims,  # mlp_ratio = 3
        },
        'drop_path_rate': 0.1,
        'init_cfg': [
            {'type': 'TruncNormal', 'layer': 'Linear', 'std': 0.02},
            {
                'type': 'Constant',
                'layer': 'LayerNorm',
                'val': 1.0,
                'bias': 0.0,
            },
        ],
    },
    'neck': None,
    'head': {
        'type': 'VisionTransformerClsHead',
        'num_classes': num_classes,
        'in_channels': embed_dims,
        'loss': {
            'type': 'LabelSmoothLoss',
            'label_smooth_val': 0.1,
            'mode': 'original',
        },
        'topk': (1, 5),
        'init_cfg': {'type': 'TruncNormal', 'layer': 'Linear', 'std': 0.02},
    },
    # Two batch-level augmentations, each configured with prob=0.5.
    'train_cfg': {
        'augments': [
            {
                'type': 'BatchMixup',
                'alpha': 0.8,
                'prob': 0.5,
                'num_classes': num_classes,
            },
            {
                'type': 'BatchCutMix',
                'alpha': 1.0,
                'prob': 0.5,
                'num_classes': num_classes,
            },
        ],
    },
}
33 changes: 33 additions & 0 deletions configs/t2t_vit/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Tokens-to-Token ViT: Training Vision Transformers from Scratch on ImageNet
<!-- {Tokens-to-Token ViT} -->

## Introduction

<!-- [ALGORITHM] -->

```latex
@article{yuan2021tokens,
title={Tokens-to-token vit: Training vision transformers from scratch on imagenet},
author={Yuan, Li and Chen, Yunpeng and Wang, Tao and Yu, Weihao and Shi, Yujun and Tay, Francis EH and Feng, Jiashi and Yan, Shuicheng},
journal={arXiv preprint arXiv:2101.11986},
year={2021}
}
```

## Pretrain model

The pre-trained models are converted from the [official repo](https://github.com/yitu-opensource/T2T-ViT/tree/main#2-t2t-vit-models).

### ImageNet-1k

| Model | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download |
|:--------------:|:---------:|:--------:|:---------:|:---------:|:------:|:--------:|
| T2T-ViT_t-14\* | 21.47 | 4.34 | 81.69 | 95.85 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/t2t_vit/t2t-vit-t-14_8xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/t2t-vit/t2t-vit-t-14_3rdparty_8xb64_in1k_20210928-b7c09b62.pth) &#124; [log]()|
| T2T-ViT_t-19\* | 39.08 | 7.80 | 82.43 | 96.08 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/t2t_vit/t2t-vit-t-19_8xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/t2t-vit/t2t-vit-t-19_3rdparty_8xb64_in1k_20210928-7f1478d5.pth) &#124; [log]()|
| T2T-ViT_t-24\* | 64.00 | 12.69 | 82.55 | 96.06 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/t2t_vit/t2t-vit-t-24_8xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/t2t-vit/t2t-vit-t-24_3rdparty_8xb64_in1k_20210928-fe95a61b.pth) &#124; [log]()|

*Models with \* are converted from other repos.*

## Results and models

To be added.
64 changes: 64 additions & 0 deletions configs/t2t_vit/metafile.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
Collections:
- Name: Tokens-to-Token ViT
Metadata:
Training Data: ImageNet-1k
Architecture:
- Layer Normalization
- Scaled Dot-Product Attention
- Attention Dropout
- Dropout
- Tokens to Token
Paper:
URL: https://arxiv.org/abs/2101.11986
Title: "Tokens-to-Token ViT: Training Vision Transformers from Scratch on ImageNet"
README: configs/t2t_vit/README.md

Models:
- Name: t2t-vit-t-14_3rdparty_8xb64_in1k
Metadata:
FLOPs: 4340000000
Parameters: 21470000
In Collection: Tokens-to-Token ViT
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 81.69
Top 5 Accuracy: 95.85
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/t2t-vit/t2t-vit-t-14_3rdparty_8xb64_in1k_20210928-b7c09b62.pth
Converted From:
Weights: https://github.com/yitu-opensource/T2T-ViT/releases/download/main/81.7_T2T_ViTt_14.pth.tar
Code: https://github.com/yitu-opensource/T2T-ViT/blob/main/models/t2t_vit.py#L243
Config: configs/t2t_vit/t2t-vit-t-14_8xb64_in1k.py
- Name: t2t-vit-t-19_3rdparty_8xb64_in1k
Metadata:
FLOPs: 7800000000
Parameters: 39080000
In Collection: Tokens-to-Token ViT
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 82.43
Top 5 Accuracy: 96.08
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/t2t-vit/t2t-vit-t-19_3rdparty_8xb64_in1k_20210928-7f1478d5.pth
Converted From:
Weights: https://github.com/yitu-opensource/T2T-ViT/releases/download/main/82.4_T2T_ViTt_19.pth.tar
Code: https://github.com/yitu-opensource/T2T-ViT/blob/main/models/t2t_vit.py#L254
Config: configs/t2t_vit/t2t-vit-t-19_8xb64_in1k.py
- Name: t2t-vit-t-24_3rdparty_8xb64_in1k
Metadata:
FLOPs: 12690000000
Parameters: 64000000
In Collection: Tokens-to-Token ViT
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 82.55
Top 5 Accuracy: 96.06
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/t2t-vit/t2t-vit-t-24_3rdparty_8xb64_in1k_20210928-fe95a61b.pth
Converted From:
Weights: https://github.com/yitu-opensource/T2T-ViT/releases/download/main/82.6_T2T_ViTt_24.pth.tar
Code: https://github.com/yitu-opensource/T2T-ViT/blob/main/models/t2t_vit.py#L265
Config: configs/t2t_vit/t2t-vit-t-24_8xb64_in1k.py
31 changes: 31 additions & 0 deletions configs/t2t_vit/t2t-vit-t-14_8xb64_in1k.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Training config: T2T-ViT-t-14 model settings + ImageNet dataset settings
# (64 samples per GPU) + default runtime, with the optimizer and schedule
# declared below.
_base_ = [
    '../_base_/models/t2t-vit-t-14.py',
    '../_base_/datasets/imagenet_bs64_t2t_224.py',
    '../_base_/default_runtime.py',
]

# optimizer
# Biases and the class token are exempt from weight decay. A custom key is
# applied when it is a substring of the dotted parameter name, and that name
# has no leading dot (e.g. 'backbone.cls_token'), so the key must not start
# with '.backbone' or it never matches.
paramwise_cfg = dict(
    bias_decay_mult=0.0,
    custom_keys={'cls_token': dict(decay_mult=0.0)},
)
optimizer = dict(
    type='AdamW',
    lr=5e-4,
    weight_decay=0.05,
    paramwise_cfg=paramwise_cfg,
)
optimizer_config = dict(grad_clip=None)  # gradient clipping disabled

# learning policy
# FIXME: the intended schedule is cosine annealing for the first 300 epochs
# followed by 10 epochs held at min_lr; the config below only declares a
# single CosineAnnealing policy together with max_epochs=310.
lr_config = dict(
    policy='CosineAnnealing',
    min_lr=1e-5,
    by_epoch=True,
    # Linear warmup; `warmup_iters` is counted in epochs because
    # `warmup_by_epoch` is True.
    warmup_by_epoch=True,
    warmup='linear',
    warmup_iters=10,
    warmup_ratio=1e-6)
runner = dict(type='EpochBasedRunner', max_epochs=310)
31 changes: 31 additions & 0 deletions configs/t2t_vit/t2t-vit-t-19_8xb64_in1k.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Training config: T2T-ViT-t-19 model settings + ImageNet dataset settings
# (64 samples per GPU) + default runtime, with the optimizer and schedule
# declared below.
_base_ = [
    '../_base_/models/t2t-vit-t-19.py',
    '../_base_/datasets/imagenet_bs64_t2t_224.py',
    '../_base_/default_runtime.py',
]

# optimizer
# Biases and the class token are exempt from weight decay. A custom key is
# applied when it is a substring of the dotted parameter name, and that name
# has no leading dot (e.g. 'backbone.cls_token'), so the key must not start
# with '.backbone' or it never matches.
paramwise_cfg = dict(
    bias_decay_mult=0.0,
    custom_keys={'cls_token': dict(decay_mult=0.0)},
)
optimizer = dict(
    type='AdamW',
    lr=5e-4,
    weight_decay=0.065,
    paramwise_cfg=paramwise_cfg,
)
optimizer_config = dict(grad_clip=None)  # gradient clipping disabled

# learning policy
# FIXME: the intended schedule is cosine annealing for the first 300 epochs
# followed by 10 epochs held at min_lr; the config below only declares a
# single CosineAnnealing policy together with max_epochs=310.
lr_config = dict(
    policy='CosineAnnealing',
    min_lr=1e-5,
    by_epoch=True,
    # Linear warmup; `warmup_iters` is counted in epochs because
    # `warmup_by_epoch` is True.
    warmup_by_epoch=True,
    warmup='linear',
    warmup_iters=10,
    warmup_ratio=1e-6)
runner = dict(type='EpochBasedRunner', max_epochs=310)
31 changes: 31 additions & 0 deletions configs/t2t_vit/t2t-vit-t-24_8xb64_in1k.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Training config: T2T-ViT-t-24 model settings + ImageNet dataset settings
# (64 samples per GPU) + default runtime, with the optimizer and schedule
# declared below.
_base_ = [
    '../_base_/models/t2t-vit-t-24.py',
    '../_base_/datasets/imagenet_bs64_t2t_224.py',
    '../_base_/default_runtime.py',
]

# optimizer
# Biases and the class token are exempt from weight decay. A custom key is
# applied when it is a substring of the dotted parameter name, and that name
# has no leading dot (e.g. 'backbone.cls_token'), so the key must not start
# with '.backbone' or it never matches.
paramwise_cfg = dict(
    bias_decay_mult=0.0,
    custom_keys={'cls_token': dict(decay_mult=0.0)},
)
optimizer = dict(
    type='AdamW',
    lr=5e-4,
    weight_decay=0.065,
    paramwise_cfg=paramwise_cfg,
)
optimizer_config = dict(grad_clip=None)  # gradient clipping disabled

# learning policy
# FIXME: the intended schedule is cosine annealing for the first 300 epochs
# followed by 10 epochs held at min_lr; the config below only declares a
# single CosineAnnealing policy together with max_epochs=310.
lr_config = dict(
    policy='CosineAnnealing',
    min_lr=1e-5,
    by_epoch=True,
    # Linear warmup; `warmup_iters` is counted in epochs because
    # `warmup_by_epoch` is True.
    warmup_by_epoch=True,
    warmup='linear',
    warmup_iters=10,
    warmup_ratio=1e-6)
runner = dict(type='EpochBasedRunner', max_epochs=310)
Loading

0 comments on commit fffa30d

Please sign in to comment.