From fbd00248f51b79fa930b873447a8c22664c0d399 Mon Sep 17 00:00:00 2001 From: sallysyw Date: Tue, 12 Oct 2021 01:06:54 +0000 Subject: [PATCH 01/23] [vit] Adding ViT to torchvision/models --- .../ModelTester.test_vit_b_16_expect.pkl | Bin 0 -> 939 bytes .../ModelTester.test_vit_b_32_expect.pkl | Bin 0 -> 939 bytes .../ModelTester.test_vit_l_16_expect.pkl | Bin 0 -> 939 bytes .../ModelTester.test_vit_l_32_expect.pkl | Bin 0 -> 939 bytes torchvision/models/__init__.py | 1 + torchvision/models/vision_transformer.py | 307 ++++++++++++++++++ 6 files changed, 308 insertions(+) create mode 100644 test/expect/ModelTester.test_vit_b_16_expect.pkl create mode 100644 test/expect/ModelTester.test_vit_b_32_expect.pkl create mode 100644 test/expect/ModelTester.test_vit_l_16_expect.pkl create mode 100644 test/expect/ModelTester.test_vit_l_32_expect.pkl create mode 100644 torchvision/models/vision_transformer.py diff --git a/test/expect/ModelTester.test_vit_b_16_expect.pkl b/test/expect/ModelTester.test_vit_b_16_expect.pkl new file mode 100644 index 0000000000000000000000000000000000000000..946a75a30ed7d9d101838e03852ed5326532471f GIT binary patch literal 939 zcmWIWW@cev;NW1u00Im`42ea_8JT6N`YDMeiFyUuIc`pT3{fbcfhoBpAE-(%zO*DW zr zf)S|3ppZF&8AvA=loqmh8ZvUemW=jY_4CYNO9=M{7L z7p0^YrKY%KCYNv(a%ct>a+VZw1r>7Z1$eV_Fj*X^nFTZrgadH;l#f9R#i#lPZcb`w z{zUOK5~rqWolRzSn$7A{vus4#a&4l|oZe&gNp<%e*&ds^)f;VIIj^))b^WnNOK#7e zl3j0i@0z*5=F_cvd*<(IuvxphWw-rjPTR`YbFKgC1>4MR-);SKtEWxM(&>8yZCG~c zm2}$7>F%)EQ^aP~kh?c1iE&J$WnJY}P(MY_spx zc^je9iMuL1KkWJX{FqI`<^47u4v%bBHw*9CB00n6Uy`$(+dGLJBzu)0hin^Tm1T zp-ezqK{&vh5k$e$Byt=IfFw`=dJ09?jqE2r6rHbtJY?Pa2IzW`UB!>0R|4olm|kcY z1bDNt=|C09G3&yWgAy|c!07D|F2f|SCqdq21LX|{PpAS=CJ69mWdn&Z10hH~L@fYy CpZli( literal 0 HcmV?d00001 diff --git a/test/expect/ModelTester.test_vit_b_32_expect.pkl b/test/expect/ModelTester.test_vit_b_32_expect.pkl new file mode 100644 index 0000000000000000000000000000000000000000..c6f338093eaf5ef60d140af7337048fef1276578 GIT binary patch literal 939 zcmWIWW@cev;NW1u00Im`42ea_8JT6N`YDMeiFyUuIc`pT3{fbcfhoBpAE-(%zO*DW zr zf)S|3ppZF&8AvA=loqmh8ZvUemW=jY_4CYNO9=M{7L z7p0^YrKY%KCYNv(a%ct>a+VZw1r>7Z1$eV_Fj*X^nFTZrgadH;l#f9R#i#lPZcb`w z{zUOK5+`!%-91&xyKE%FTJ~6Zp4oHNz;4g(?@_z2c+T9zd1j)G{rsagKX`BN*|dA5 zjl$1qdrlV|ue&1_|JA0KzjBYu&GR+_K|5`BH+)PMDEhWPCoK@wpHSKP+*}P86W}6<@UWIc_d)V4q zcQxFevB$^LZVxE5oCCT~oB)Ot2;&Ydegg|;6e&H2-BDgWb?&& z>7h(OTR}L$n-N68( zf)S|3ppZF&8AvA=loqmh8ZvUemW=jY_4CYNO9=M{7L z7p0^YrKY%KCYNv(a%ct>a+VZw1r>7Z1$eV_Fj*X^nFTZrgadH;l#f9R#i#lPZcb`w z{zUOK5=SEc=bjIreQl2X;kOCc-LdC^>K&VmN){WQeT{qO={E0C==#5#K}OYL#+`q= zPw_3Vsjy+(^WmMKjm4pWJs%7|>`~bA!&)bPn~jgN${wlu^)?dphDK@Ai^-z31NFBR10# zzuJf#^4YWP`OZBKE*EY7FnHN8WFFdC6FSpIW5TgL2l|+-9ga@1nvwj*+TiZR-3g|;6e&H2-BDgWb?&& z>7h(OTR}L$n-N68(wby literal 0 HcmV?d00001 diff --git a/test/expect/ModelTester.test_vit_l_32_expect.pkl b/test/expect/ModelTester.test_vit_l_32_expect.pkl new file mode 100644 index 0000000000000000000000000000000000000000..0b2c4674332ad96afecad560f7c2f6e1718f3fe6 GIT binary patch literal 939 zcmWIWW@cev;NW1u00Im`42ea_8JT6N`YDMeiFyUuIc`pT3{fbcfhoBpAE-(%zO*DW zr zf)S|3ppZF&8AvA=loqmh8ZvUemW=jY_4CYNO9=M{7L z7p0^YrKY%KCYNv(a%ct>a+VZw1r>7Z1$eV_Fj*X^nFTZrgadH;l#f9R#i#lPZcb`w z{zUOK5@$vJJZlxVhkLp5S@bTOeN1}}{ad%^UdzWlTTCa}Y?vFq z`^&Frn+4sGyWdP&VY8v}yS0_jfju10Z|rU<_+nGFaK|2wl5e{&>~h%C5TUg@%4Uj9 z8())+*O4BZ2{w0a@;orA+@ZzKU=0tgvecqtU@W*fnG+dYNFfJd8gqeczBn&E zlnH1n2nTpGf+%>JM2 (n, hidden_dim, n_h, n_w) + x = self.conv_proj(x) + # (n, hidden_dim, n_h, n_w) -> (n, hidden_dim, (n_h * n_w)) + x = x.reshape(n, self.hidden_dim, n_h * n_w) 
+ + # (n, hidden_dim, (n_h * n_w)) -> ((n_h * n_w), n, hidden_dim) + # The self attention layer expects inputs in the format (S, N, E) + # where S is the source sequence length, N is the batch size, E is the + # embedding dimension + x = x.permute(2, 0, 1) + + if self.classifier == "token": + # Expand the class token to the full batch. + batch_class_token = self.class_token.expand(-1, n, -1) + x = torch.cat([batch_class_token, x], dim=0) + + x = self.encoder(x) + + if self.classifier == "token": + # Classifier as used by standard language architectures + x = x[0, :, :] + elif self.classifier == "gap": + # Classifier as used by standard vision architectures + x = x.mean(dim=0) + else: + raise ValueError(f"Invalid classifier={self.classifier}") + + x = self.head(x) + + return x + + +def _vision_transformer(version: str, pretrained: bool, progress: bool, **kwargs: Any) -> VisionTransformer: + if kwargs.get("image_size", None) is None: + model = VisionTransformer(image_size=224, **kwargs) + else: + model = VisionTransformer(**kwargs) + # TODO: Adding pre-trained models + return model + + +def vit_b_16(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> VisionTransformer: + """ + Constructs a ViT_b_16 architecture from + `"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _vision_transformer( + version="b_16", + pretrained=pretrained, + progress=progress, + patch_size=16, + num_layers=12, + num_heads=12, + hidden_dim=768, + mlp_dim=3072, + **kwargs, + ) + + +def vit_b_32(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> VisionTransformer: + """ + Constructs a ViT_b_32 architecture from + `"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _vision_transformer( + version="b_32", + pretrained=pretrained, + progress=progress, + patch_size=32, + num_layers=12, + num_heads=12, + hidden_dim=768, + mlp_dim=3072, + **kwargs, + ) + + +def vit_l_16(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> VisionTransformer: + """ + Constructs a ViT_l_16 architecture from + `"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _vision_transformer( + version="l_16", + pretrained=pretrained, + progress=progress, + patch_size=16, + num_layers=24, + num_heads=16, + hidden_dim=1024, + mlp_dim=4096, + **kwargs, + ) + + +def vit_l_32(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> VisionTransformer: + """ + Constructs a ViT_l_32 architecture from + `"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale" `_. 
+ + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _vision_transformer( + version="l_32", + pretrained=pretrained, + progress=progress, + patch_size=32, + num_layers=24, + num_heads=16, + hidden_dim=1024, + mlp_dim=4096, + **kwargs, + ) From 7521ffe02bfc025adc27d52b55b5104426f4d6a9 Mon Sep 17 00:00:00 2001 From: sallysyw Date: Wed, 20 Oct 2021 01:30:01 +0000 Subject: [PATCH 02/23] adding pre-logits layer + resolving comments --- .../ModelTester.test_vit_b_16_expect.pkl | Bin 939 -> 939 bytes .../ModelTester.test_vit_b_32_expect.pkl | Bin 939 -> 939 bytes .../ModelTester.test_vit_h_14_expect.pkl | Bin 0 -> 939 bytes .../ModelTester.test_vit_l_16_expect.pkl | Bin 939 -> 939 bytes .../ModelTester.test_vit_l_32_expect.pkl | Bin 939 -> 939 bytes torchvision/models/vision_transformer.py | 147 ++++++++++++------ 6 files changed, 98 insertions(+), 49 deletions(-) create mode 100644 test/expect/ModelTester.test_vit_h_14_expect.pkl diff --git a/test/expect/ModelTester.test_vit_b_16_expect.pkl b/test/expect/ModelTester.test_vit_b_16_expect.pkl index 946a75a30ed7d9d101838e03852ed5326532471f..1f846beb6a0bccf8b545f5a67b74482015cc878b 100644 GIT binary patch delta 230 ocmZ3@zM6f517khIz=A-kSs&od&QZ3a@Z@9xrVA`!#uKLF0QWr%IRF3v delta 230 zcmVxK+TOgMx_~^btAx8h^9en9>zzCOE>k?6hr2uZwMjf=rI)-B zJOsKfbcsBii-b~uZmEVmt>?o$ zzUt3B5_OTfcuDZQ_2E|tAi7~v>BK@{$eXU9)$Ef221HX`HsXq7*aJn grz#dacg_htP)i30uqv(VlMn*X1h6Wt>yzXH$0!bOO#lD@ diff --git a/test/expect/ModelTester.test_vit_b_32_expect.pkl b/test/expect/ModelTester.test_vit_b_32_expect.pkl index c6f338093eaf5ef60d140af7337048fef1276578..1f846beb6a0bccf8b545f5a67b74482015cc878b 100644 GIT binary patch delta 230 ocmZ3@zM6f517khIz=A-kSs&od&QZ3a@Z@9xrVA`!#uKLF0QWr%IRF3v delta 230 zcmVQn@_4 zfbhJ=X6(GKq7J=FT?@T@=!`oJo6|h|YH&IM!YVsc_*%T?{`yh8+~mZ(MLOHOhS%^r zA*HXpK9aJ$PZb$FNg#>5 zf)S|3ppZF&8AvA=loqmh8ZvUemW=jY_4CYNO9=M{7L z7p0^YrKY%KCYNv(a%ct>a+VZw1r>7Z1$eV_Fj*X^nFTZrgadH;l#f9R#i#lPZcb`w z{zUOK5(jATumGUYDqB%_@&wQ~AdEY-_!+F>p;eYzR1Ay-Hz#ul>i!MRpZGie3qz3t@Vp zVG!WW#-;;RB*&}^R}MzLU$#6WetK*f|gm}BfaFx}C+01vIX g40~3*sBuHQP)i30^6UX`lMn*X1oG?wZzz zHIh8Aomac``dd7pi&?wulBhhef%iK)62QC&=h(Z1aP&NSp}4#VboRT@xV&y0vX-BP)i30Pn{%alMn*X1W%nLXp`gu$9ukVR{#J2 diff --git a/torchvision/models/vision_transformer.py b/torchvision/models/vision_transformer.py index 2bfcd70c258..48e44839bdf 100644 --- a/torchvision/models/vision_transformer.py +++ b/torchvision/models/vision_transformer.py @@ -8,7 +8,7 @@ import math from collections import OrderedDict from functools import partial -from typing import Any +from typing import Any, Callable, Optional import torch import torch.nn as nn @@ -20,25 +20,23 @@ "vit_b_32", "vit_l_16", "vit_l_32", + "vit_h_14", ] -LayerNorm = partial(nn.LayerNorm, eps=1e-6) - - class MLPBlock(nn.Sequential): """Transformer MLP block.""" - def __init__(self, in_dim: int, mlp_dim: int, dropout_rate: float): + def __init__(self, in_dim: int, mlp_dim: int, dropout: float): super().__init__() self.linear_1 = nn.Linear(in_dim, mlp_dim) self.act = nn.GELU() - self.dropout_1 = nn.Dropout(dropout_rate) + self.dropout_1 = nn.Dropout(dropout) self.linear_2 = nn.Linear(mlp_dim, in_dim) - self.dropout_2 = nn.Dropout(dropout_rate) - self.init_weights() + self.dropout_2 = nn.Dropout(dropout) + self._init_weights() - def init_weights(self): + def _init_weights(self): nn.init.xavier_uniform_(self.linear_1.weight) nn.init.xavier_uniform_(self.linear_2.weight) 
nn.init.normal_(self.linear_1.bias, std=1e-6) @@ -49,22 +47,28 @@ class EncoderBlock(nn.Module): """Transformer encoder block.""" def __init__( - self, num_heads: int, hidden_dim: int, mlp_dim: int, dropout_rate: float, attention_dropout_rate: float + self, + num_heads: int, + hidden_dim: int, + mlp_dim: int, + dropout: float, + attention_dropout: float, + norm_layer: Callable[..., torch.nn.Module] = partial(nn.LayerNorm, eps=1e-6), ): super().__init__() self.num_heads = num_heads # Attention block - self.ln_1 = LayerNorm(hidden_dim) - self.self_attention = nn.MultiheadAttention(hidden_dim, num_heads, dropout=attention_dropout_rate) - self.dropout = nn.Dropout(dropout_rate) + self.ln_1 = norm_layer(hidden_dim) + self.self_attention = nn.MultiheadAttention(hidden_dim, num_heads, dropout=attention_dropout) + self.dropout = nn.Dropout(dropout) # MLP block - self.ln_2 = LayerNorm(hidden_dim) - self.mlp = MLPBlock(hidden_dim, mlp_dim, dropout_rate) + self.ln_2 = norm_layer(hidden_dim) + self.mlp = MLPBlock(hidden_dim, mlp_dim, dropout) def forward(self, input: Tensor): - # assert input.dim() == 3, f"Expected (seq_length, batch_size, hidden_dim) got {input.shape}" + torch._assert(input.dim() == 3, f"Expected (seq_length, batch_size, hidden_dim) got {input.shape}") x = self.ln_1(input) x, _ = self.self_attention(query=x, key=x, value=x, need_weights=False) x = self.dropout(x) @@ -85,28 +89,30 @@ def __init__( num_heads: int, hidden_dim: int, mlp_dim: int, - dropout_rate: float, - attention_dropout_rate: float, + dropout: float, + attention_dropout: float, + norm_layer: Callable[..., torch.nn.Module] = partial(nn.LayerNorm, eps=1e-6), ): super().__init__() # Note that batch_size is on the second dim because # we have batch_first=False in nn.MultiAttention() by default self.pos_embedding = nn.Parameter(torch.empty(seq_length, 1, hidden_dim).normal_(std=0.02)) # from BERT - self.dropout = nn.Dropout(dropout_rate) + self.dropout = nn.Dropout(dropout) layers: OrderedDict[str, nn.Module] = OrderedDict() for i in range(num_layers): layers[f"encoder_layer_{i}"] = EncoderBlock( num_heads, hidden_dim, mlp_dim, - dropout_rate, - attention_dropout_rate, + dropout, + attention_dropout, + norm_layer, ) self.layers = nn.Sequential(layers) - self.ln = LayerNorm(hidden_dim) + self.ln = norm_layer(hidden_dim) def forward(self, input: Tensor): - # assert input.dim() == 3, f"Expected (seq_length, batch_size, hidden_dim) got {input.shape}" + torch._assert(input.dim() == 3, f"Expected (seq_length, batch_size, hidden_dim) got {input.shape}") input = input + self.pos_embedding return self.ln(self.layers(self.dropout(input))) @@ -122,22 +128,26 @@ def __init__( num_heads: int, hidden_dim: int, mlp_dim: int, - dropout_rate: float = 0.0, - attention_dropout_rate: float = 0.0, + dropout: float = 0.0, + attention_dropout: float = 0.0, classifier: str = "token", num_classes: int = 1000, + representation_size: Optional[int] = None, + norm_layer: Callable[..., torch.nn.Module] = partial(nn.LayerNorm, eps=1e-6), ): super().__init__() - # assert image_size % patch_size == 0, "Input shape indivisible by patch size!" - # assert classifier in ["token", "gap"], "Unexpected classifier mode!" 
+ torch._assert(image_size % patch_size == 0, "Input shape indivisible by patch size!") + torch._assert(classifier in ["token", "gap"], "Unexpected classifier mode!") self.image_size = image_size self.patch_size = patch_size self.hidden_dim = hidden_dim self.mlp_dim = mlp_dim - self.attention_dropout_rate = attention_dropout_rate - self.dropout_rate = dropout_rate + self.attention_dropout = attention_dropout + self.dropout = dropout self.classifier = classifier self.num_classes = num_classes + self.representation_size = representation_size + self.norm_layer = norm_layer input_channels = 3 @@ -157,24 +167,41 @@ def __init__( num_heads, hidden_dim, mlp_dim, - dropout_rate, - attention_dropout_rate, + dropout, + attention_dropout, + norm_layer, ) self.seq_length = seq_length - self.head = nn.Linear(hidden_dim, num_classes) - self.init_weights() + heads_layers: OrderedDict[str, nn.Module] = OrderedDict() + if representation_size is None: + heads_layers["head"] = nn.Linear(hidden_dim, num_classes) + else: + heads_layers["pre_logits"] = nn.Linear(hidden_dim, representation_size) + heads_layers["act"] = nn.Tanh() + heads_layers["head"] = nn.Linear(representation_size, num_classes) + + self.heads = nn.Sequential(heads_layers) + self._init_weights() - def init_weights(self): + def _init_weights(self): fan_in = self.conv_proj.in_channels * self.conv_proj.kernel_size[0] * self.conv_proj.kernel_size[1] nn.init.trunc_normal_(self.conv_proj.weight, std=math.sqrt(1 / fan_in)) nn.init.zeros_(self.conv_proj.bias) - nn.init.zeros_(self.head.weight) + + if hasattr(self.heads, "pre_logits"): + fan_in = self.layers.pre_logits.in_features + nn.init.trunc_normal_(self.layers.pre_logits.weight, std=math.sqrt(1 / fan_in)) + nn.init.zeros_(self.heads.pre_logits.bias) + + nn.init.zeros_(self.heads.head.weight) + nn.init.zeros_(self.heads.head.bias) def forward(self, x: torch.Tensor): n, c, h, w = x.shape p = self.patch_size - # assert h == w == self.image_size + torch._assert(h == self.image_size, "Wrong image height!") + torch._assert(w == self.image_size, "Wrong image width!") n_h = h // p n_w = w // p @@ -205,16 +232,16 @@ def forward(self, x: torch.Tensor): else: raise ValueError(f"Invalid classifier={self.classifier}") - x = self.head(x) + x = self.heads(x) return x def _vision_transformer(version: str, pretrained: bool, progress: bool, **kwargs: Any) -> VisionTransformer: - if kwargs.get("image_size", None) is None: - model = VisionTransformer(image_size=224, **kwargs) - else: - model = VisionTransformer(**kwargs) + image_size = kwargs.get("image_size", 224) + if "image_size" in kwargs: + kwargs.pop("image_size") + model = VisionTransformer(image_size=image_size, **kwargs) # TODO: Adding pre-trained models return model @@ -225,8 +252,8 @@ def vit_b_16(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> `"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale" `_. Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - progress (bool): If True, displays a progress bar of the download to stderr + pretrained (bool, optional): If True, returns a model pre-trained on ImageNet. Default: False. + progress (bool, optional): If True, displays a progress bar of the download to stderr. Default: True. """ return _vision_transformer( version="b_16", @@ -247,8 +274,8 @@ def vit_b_32(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> `"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale" `_. 
Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - progress (bool): If True, displays a progress bar of the download to stderr + pretrained (bool, optional): If True, returns a model pre-trained on ImageNet. Default: False. + progress (bool, optional): If True, displays a progress bar of the download to stderr. Default: True. """ return _vision_transformer( version="b_32", @@ -269,8 +296,8 @@ def vit_l_16(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> `"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale" `_. Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - progress (bool): If True, displays a progress bar of the download to stderr + pretrained (bool, optional): If True, returns a model pre-trained on ImageNet. Default: False. + progress (bool, optional): If True, displays a progress bar of the download to stderr. Default: True. """ return _vision_transformer( version="l_16", @@ -291,8 +318,8 @@ def vit_l_32(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> `"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale" `_. Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - progress (bool): If True, displays a progress bar of the download to stderr + pretrained (bool, optional): If True, returns a model pre-trained on ImageNet. Default: False. + progress (bool, optional): If True, displays a progress bar of the download to stderr. Default: True. """ return _vision_transformer( version="l_32", @@ -305,3 +332,25 @@ def vit_l_32(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> mlp_dim=4096, **kwargs, ) + + +def vit_h_14(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> VisionTransformer: + """ + Constructs a ViT_h_14 architecture from + `"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale" `_. + + Args: + pretrained (bool, optional): If True, returns a model pre-trained on ImageNet. Default: False. + progress (bool, optional): If True, displays a progress bar of the download to stderr. Default: True. 
+ """ + return _vision_transformer( + version="h_14", + pretrained=pretrained, + progress=progress, + patch_size=14, + num_layers=32, + num_heads=16, + hidden_dim=1280, + mlp_dim=5120, + **kwargs, + ) From 53b696769f9b34d34fcf3121d657fe6f7c3f23ac Mon Sep 17 00:00:00 2001 From: sallysyw Date: Tue, 26 Oct 2021 00:14:03 +0000 Subject: [PATCH 03/23] Fix the model attribute bug --- torchvision/models/vision_transformer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/torchvision/models/vision_transformer.py b/torchvision/models/vision_transformer.py index 48e44839bdf..2171cb5fce7 100644 --- a/torchvision/models/vision_transformer.py +++ b/torchvision/models/vision_transformer.py @@ -190,8 +190,8 @@ def _init_weights(self): nn.init.zeros_(self.conv_proj.bias) if hasattr(self.heads, "pre_logits"): - fan_in = self.layers.pre_logits.in_features - nn.init.trunc_normal_(self.layers.pre_logits.weight, std=math.sqrt(1 / fan_in)) + fan_in = self.heads.pre_logits.in_features + nn.init.trunc_normal_(self.heads.pre_logits.weight, std=math.sqrt(1 / fan_in)) nn.init.zeros_(self.heads.pre_logits.bias) nn.init.zeros_(self.heads.head.weight) @@ -285,7 +285,7 @@ def vit_b_32(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> num_layers=12, num_heads=12, hidden_dim=768, - mlp_dim=3072, + mlp_dim=3072, **kwargs, ) From a84361a8c5370937b31040d1362238673c2d1e98 Mon Sep 17 00:00:00 2001 From: sallysyw Date: Tue, 26 Oct 2021 18:30:11 +0000 Subject: [PATCH 04/23] Change version to arch --- torchvision/models/vision_transformer.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/torchvision/models/vision_transformer.py b/torchvision/models/vision_transformer.py index 2171cb5fce7..c62592b25d3 100644 --- a/torchvision/models/vision_transformer.py +++ b/torchvision/models/vision_transformer.py @@ -1,6 +1,3 @@ -# Implement ViT from: -# https://arxiv.org/abs/2010.11929 - # References: # https://github.com/google-research/vision_transformer # https://github.com/facebookresearch/ClassyVision/blob/main/classy_vision/models/vision_transformer.py @@ -237,7 +234,7 @@ def forward(self, x: torch.Tensor): return x -def _vision_transformer(version: str, pretrained: bool, progress: bool, **kwargs: Any) -> VisionTransformer: +def _vision_transformer(arch: str, pretrained: bool, progress: bool, **kwargs: Any) -> VisionTransformer: image_size = kwargs.get("image_size", 224) if "image_size" in kwargs: kwargs.pop("image_size") @@ -256,7 +253,7 @@ def vit_b_16(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> progress (bool, optional): If True, displays a progress bar of the download to stderr. Default: True. """ return _vision_transformer( - version="b_16", + arch="b_16", pretrained=pretrained, progress=progress, patch_size=16, @@ -278,14 +275,14 @@ def vit_b_32(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> progress (bool, optional): If True, displays a progress bar of the download to stderr. Default: True. """ return _vision_transformer( - version="b_32", + arch="b_32", pretrained=pretrained, progress=progress, patch_size=32, num_layers=12, num_heads=12, hidden_dim=768, - mlp_dim=3072, + mlp_dim=3072, **kwargs, ) @@ -300,7 +297,7 @@ def vit_l_16(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> progress (bool, optional): If True, displays a progress bar of the download to stderr. Default: True. 
""" return _vision_transformer( - version="l_16", + arch="l_16", pretrained=pretrained, progress=progress, patch_size=16, @@ -322,7 +319,7 @@ def vit_l_32(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> progress (bool, optional): If True, displays a progress bar of the download to stderr. Default: True. """ return _vision_transformer( - version="l_32", + arch="l_32", pretrained=pretrained, progress=progress, patch_size=32, @@ -344,7 +341,7 @@ def vit_h_14(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> progress (bool, optional): If True, displays a progress bar of the download to stderr. Default: True. """ return _vision_transformer( - version="h_14", + arch="h_14", pretrained=pretrained, progress=progress, patch_size=14, From c2f3826995f308f722b342fa7e58ab2b22e466d8 Mon Sep 17 00:00:00 2001 From: sallysyw Date: Sat, 6 Nov 2021 00:32:15 +0000 Subject: [PATCH 05/23] fix failing unittests --- test/test_backbone_utils.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/test/test_backbone_utils.py b/test/test_backbone_utils.py index 9b46bdd5288..8434f16e746 100644 --- a/test/test_backbone_utils.py +++ b/test/test_backbone_utils.py @@ -89,7 +89,17 @@ def _create_feature_extractor(self, *args, **kwargs): def _get_return_nodes(self, model): set_rng_seed(0) - exclude_nodes_filter = ["getitem", "floordiv", "size", "chunk"] + exclude_nodes_filter = [ + "getitem", + "floordiv", + "size", + "chunk", + "_assert", + "eq", + "dim", + "getattr", + "self_attention", + ] train_nodes, eval_nodes = get_graph_node_names( model, tracer_kwargs={"leaf_modules": self.leaf_modules}, suppress_diff_warning=True ) @@ -140,6 +150,8 @@ def test_node_name_conventions(self): def test_forward_backward(self, model_name): model = models.__dict__[model_name](**self.model_defaults).train() train_return_nodes, eval_return_nodes = self._get_return_nodes(model) + print("train_return_nodes = ", train_return_nodes) + print("eval_return_nodes = ", eval_return_nodes) model = self._create_feature_extractor( model, train_return_nodes=train_return_nodes, eval_return_nodes=eval_return_nodes ) From 35c1d22e1318b0f97dceca74b51321bac8f0a5c7 Mon Sep 17 00:00:00 2001 From: sallysyw Date: Sat, 6 Nov 2021 00:33:36 +0000 Subject: [PATCH 06/23] remove useless prints --- test/test_backbone_utils.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/test/test_backbone_utils.py b/test/test_backbone_utils.py index 8434f16e746..04f59242b2b 100644 --- a/test/test_backbone_utils.py +++ b/test/test_backbone_utils.py @@ -150,8 +150,6 @@ def test_node_name_conventions(self): def test_forward_backward(self, model_name): model = models.__dict__[model_name](**self.model_defaults).train() train_return_nodes, eval_return_nodes = self._get_return_nodes(model) - print("train_return_nodes = ", train_return_nodes) - print("eval_return_nodes = ", eval_return_nodes) model = self._create_feature_extractor( model, train_return_nodes=train_return_nodes, eval_return_nodes=eval_return_nodes ) From 568c560e0a97fc7275b12447c6655cc6adcd15b1 Mon Sep 17 00:00:00 2001 From: sallysyw Date: Mon, 15 Nov 2021 21:30:30 +0000 Subject: [PATCH 07/23] reduce input size to fix unittests --- .../expect/ModelTester.test_vit_h_14_expect.pkl | Bin 939 -> 747 bytes test/test_models.py | 8 +++++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/test/expect/ModelTester.test_vit_h_14_expect.pkl b/test/expect/ModelTester.test_vit_h_14_expect.pkl index 
1f846beb6a0bccf8b545f5a67b74482015cc878b..3f51f8a20237451171af108d3cede4fac8213afb 100644 GIT binary patch delta 156 zcmZ3@{+e~dJOS2{Le@ZU25&}h*0w@6Z`O(H40zOiBHCs#Fff4d1gXg^i~#}x-s~K` zv#;`t0EIw!f|NKvgFZuIQF2CRS*m_nYEf}!ex93?HpApVChf@$OiC;uQzkECvf_m6 loBWc=mnnsDvIVoF&?QEoDL@e5&BhL*JDDbTGfS~SgaKDUC0YOg delta 336 zcmaFOx|)5$JOQJULe@ZU25&}hqqageZ=;Fp3^!iY vl~_PNpZtu`3Q4mjlP{A5)8q;!N1;hfK;wWQz?+R7M0+w%KFuV>1`!4T0RdAR diff --git a/test/test_models.py b/test/test_models.py index 150b813b0cb..59025fdb37e 100644 --- a/test/test_models.py +++ b/test/test_models.py @@ -290,6 +290,11 @@ def _check_input_backprop(model, inputs): "rpn_pre_nms_top_n_test": 1000, "rpn_post_nms_top_n_test": 1000, }, + "vit_h_14": { + "num_classes": 5, + "image_size": 28, + "input_shape": (1, 3, 28, 28), + }, } @@ -514,6 +519,7 @@ def test_classification_model(model_fn, dev): } model_name = model_fn.__name__ kwargs = {**defaults, **_model_params.get(model_name, {})} + num_classes = kwargs.get("num_classes") input_shape = kwargs.pop("input_shape") model = model_fn(**kwargs) @@ -522,7 +528,7 @@ def test_classification_model(model_fn, dev): x = torch.rand(input_shape).to(device=dev) out = model(x) _assert_expected(out.cpu(), model_name, prec=0.1) - assert out.shape[-1] == 50 + assert out.shape[-1] == num_classes _check_jit_scriptable(model, (x,), unwrapper=script_model_unwrapper.get(model_name, None)) _check_fx_compatible(model, x) From 8e71e4b7c473bf54d3d89ba04d6d1b3dfd8de0bb Mon Sep 17 00:00:00 2001 From: sallysyw Date: Tue, 16 Nov 2021 18:53:58 +0000 Subject: [PATCH 08/23] Increase windows-cpu executor to 2xlarge --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 3a58071fabf..bab214aed02 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -9,7 +9,7 @@ version: 2.1 executors: windows-cpu: machine: - resource_class: windows.xlarge + resource_class: windows.2xlarge image: windows-server-2019-vs2019:stable shell: bash.exe From f9860ec97440f24b08f5037d7f31b4bf5fbb25ee Mon Sep 17 00:00:00 2001 From: sallysyw Date: Wed, 17 Nov 2021 08:20:15 +0000 Subject: [PATCH 09/23] Use `batch_first=True` and remove classifier --- torchvision/models/vision_transformer.py | 44 +++++++++--------------- 1 file changed, 17 insertions(+), 27 deletions(-) diff --git a/torchvision/models/vision_transformer.py b/torchvision/models/vision_transformer.py index c62592b25d3..332e6034bd0 100644 --- a/torchvision/models/vision_transformer.py +++ b/torchvision/models/vision_transformer.py @@ -57,7 +57,7 @@ def __init__( # Attention block self.ln_1 = norm_layer(hidden_dim) - self.self_attention = nn.MultiheadAttention(hidden_dim, num_heads, dropout=attention_dropout) + self.self_attention = nn.MultiheadAttention(hidden_dim, num_heads, dropout=attention_dropout, batch_first=True) self.dropout = nn.Dropout(dropout) # MLP block @@ -91,9 +91,9 @@ def __init__( norm_layer: Callable[..., torch.nn.Module] = partial(nn.LayerNorm, eps=1e-6), ): super().__init__() - # Note that batch_size is on the second dim because - # we have batch_first=False in nn.MultiAttention() by default - self.pos_embedding = nn.Parameter(torch.empty(seq_length, 1, hidden_dim).normal_(std=0.02)) # from BERT + # Note that batch_size is on the first dim because + # we have batch_first=True in nn.MultiAttention() by default + self.pos_embedding = nn.Parameter(torch.empty(1, seq_length, hidden_dim).normal_(std=0.02)) # from BERT self.dropout = nn.Dropout(dropout) layers: OrderedDict[str, 
nn.Module] = OrderedDict() for i in range(num_layers): @@ -109,7 +109,7 @@ def __init__( self.ln = norm_layer(hidden_dim) def forward(self, input: Tensor): - torch._assert(input.dim() == 3, f"Expected (seq_length, batch_size, hidden_dim) got {input.shape}") + torch._assert(input.dim() == 3, f"Expected (batch_size, seq_length, hidden_dim) got {input.shape}") input = input + self.pos_embedding return self.ln(self.layers(self.dropout(input))) @@ -127,21 +127,18 @@ def __init__( mlp_dim: int, dropout: float = 0.0, attention_dropout: float = 0.0, - classifier: str = "token", num_classes: int = 1000, representation_size: Optional[int] = None, norm_layer: Callable[..., torch.nn.Module] = partial(nn.LayerNorm, eps=1e-6), ): super().__init__() torch._assert(image_size % patch_size == 0, "Input shape indivisible by patch size!") - torch._assert(classifier in ["token", "gap"], "Unexpected classifier mode!") self.image_size = image_size self.patch_size = patch_size self.hidden_dim = hidden_dim self.mlp_dim = mlp_dim self.attention_dropout = attention_dropout self.dropout = dropout - self.classifier = classifier self.num_classes = num_classes self.representation_size = representation_size self.norm_layer = norm_layer @@ -153,10 +150,10 @@ def __init__( self.conv_proj = nn.Conv2d(input_channels, hidden_dim, kernel_size=patch_size, stride=patch_size) seq_length = (image_size // patch_size) ** 2 - if self.classifier == "token": - # Add a class token - self.class_token = nn.Parameter(torch.zeros(1, 1, hidden_dim)) - seq_length += 1 + + # Add a class token + self.class_token = nn.Parameter(torch.zeros(1, 1, hidden_dim)) + seq_length += 1 self.encoder = Encoder( seq_length, @@ -207,27 +204,20 @@ def forward(self, x: torch.Tensor): # (n, hidden_dim, n_h, n_w) -> (n, hidden_dim, (n_h * n_w)) x = x.reshape(n, self.hidden_dim, n_h * n_w) - # (n, hidden_dim, (n_h * n_w)) -> ((n_h * n_w), n, hidden_dim) - # The self attention layer expects inputs in the format (S, N, E) + # (n, hidden_dim, (n_h * n_w)) -> (n, (n_h * n_w), hidden_dim) + # The self attention layer expects inputs in the format (N, S, E) # where S is the source sequence length, N is the batch size, E is the # embedding dimension - x = x.permute(2, 0, 1) + x = x.permute(0, 2, 1) - if self.classifier == "token": - # Expand the class token to the full batch. - batch_class_token = self.class_token.expand(-1, n, -1) - x = torch.cat([batch_class_token, x], dim=0) + # Expand the class token to the full batch. 
+ batch_class_token = self.class_token.expand(n, -1, -1) + x = torch.cat([batch_class_token, x], dim=1) x = self.encoder(x) - if self.classifier == "token": - # Classifier as used by standard language architectures - x = x[0, :, :] - elif self.classifier == "gap": - # Classifier as used by standard vision architectures - x = x.mean(dim=0) - else: - raise ValueError(f"Invalid classifier={self.classifier}") + # Classifier "token" as used by standard language architectures + x = x[:, 0] x = self.heads(x) From b795e85d2d89d2d4164a93f8a89fcb9e9b0c6f71 Mon Sep 17 00:00:00 2001 From: sallysyw Date: Wed, 17 Nov 2021 08:35:12 +0000 Subject: [PATCH 10/23] Change resource_class back to xlarge --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 5712817841c..4c7bf22dc67 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -9,7 +9,7 @@ version: 2.1 executors: windows-cpu: machine: - resource_class: windows.2xlarge + resource_class: windows.xlarge image: windows-server-2019-vs2019:stable shell: bash.exe From ff64591c4956af7e45b06e8a1150d77d35071c3e Mon Sep 17 00:00:00 2001 From: sallysyw Date: Wed, 17 Nov 2021 19:23:17 +0000 Subject: [PATCH 11/23] Remove vit_h_14 --- .../ModelTester.test_vit_h_14_expect.pkl | Bin 747 -> 0 bytes test/test_models.py | 5 ---- torchvision/models/vision_transformer.py | 22 ------------------ 3 files changed, 27 deletions(-) delete mode 100644 test/expect/ModelTester.test_vit_h_14_expect.pkl diff --git a/test/expect/ModelTester.test_vit_h_14_expect.pkl b/test/expect/ModelTester.test_vit_h_14_expect.pkl deleted file mode 100644 index 3f51f8a20237451171af108d3cede4fac8213afb..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 747 zcma)4%}T>S5S}zm8dqqGc=o9EuPOQlMRHleSP_Dk5I0K=hNhcrd-hcH;0q}D8a{`D zFX2--+q5M~DL7#mX7>AL_REe-^*VrM!J4&U2ZEG6L{n~ufegCi$JpsN;jUy)H6{~8 zb^VEq;*9tyzn?^L=u1AzlGLA`5doc!*TDW6B7-3pT23KOoCNYFFA`DnG{p3T?&%oW zcp~c5kixj8Ks9R2gh>sm%|zpwu_TT;lTk9t$Wd~ zr8j%SZpz(_N}F(0Qfl{{{(ePgNpvB>#%rb~zrO#pcJV~{=E*IvSCgka%}|lkKmOLquf u+oh&?vH mlp_dim=4096, **kwargs, ) - - -def vit_h_14(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> VisionTransformer: - """ - Constructs a ViT_h_14 architecture from - `"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale" `_. - - Args: - pretrained (bool, optional): If True, returns a model pre-trained on ImageNet. Default: False. - progress (bool, optional): If True, displays a progress bar of the download to stderr. Default: True. 
- """ - return _vision_transformer( - arch="h_14", - pretrained=pretrained, - progress=progress, - patch_size=14, - num_layers=32, - num_heads=16, - hidden_dim=1280, - mlp_dim=5120, - **kwargs, - ) From bd3a7471a0dea9efecd4d17825be963dcefafa43 Mon Sep 17 00:00:00 2001 From: sallysyw Date: Wed, 17 Nov 2021 19:28:30 +0000 Subject: [PATCH 12/23] Remove vit_h_14 from __all__ --- torchvision/models/vision_transformer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/torchvision/models/vision_transformer.py b/torchvision/models/vision_transformer.py index 5584c1c6909..461cf2c726b 100644 --- a/torchvision/models/vision_transformer.py +++ b/torchvision/models/vision_transformer.py @@ -17,7 +17,6 @@ "vit_b_32", "vit_l_16", "vit_l_32", - "vit_h_14", ] From 8f885922e35cdf3a5e73649342a37482fc0b693b Mon Sep 17 00:00:00 2001 From: sallysyw Date: Fri, 19 Nov 2021 01:26:33 +0000 Subject: [PATCH 13/23] Move vision_transformer.py into prototype --- torchvision/models/__init__.py | 1 - torchvision/prototype/models/__init__.py | 1 + .../models/vision_transformer.py | 66 +++++++++++++++---- 3 files changed, 53 insertions(+), 15 deletions(-) rename torchvision/{ => prototype}/models/vision_transformer.py (82%) diff --git a/torchvision/models/__init__.py b/torchvision/models/__init__.py index b2b3cdf5f3e..516e47feb19 100644 --- a/torchvision/models/__init__.py +++ b/torchvision/models/__init__.py @@ -10,7 +10,6 @@ from .shufflenetv2 import * from .efficientnet import * from .regnet import * -from .vision_transformer import * from . import detection from . import feature_extraction from . import quantization diff --git a/torchvision/prototype/models/__init__.py b/torchvision/prototype/models/__init__.py index 5077b7fd178..f675dc37f25 100644 --- a/torchvision/prototype/models/__init__.py +++ b/torchvision/prototype/models/__init__.py @@ -10,6 +10,7 @@ from .shufflenetv2 import * from .squeezenet import * from .vgg import * +from .vision_transformer import * from . import detection from . import quantization from . 
import segmentation diff --git a/torchvision/models/vision_transformer.py b/torchvision/prototype/models/vision_transformer.py similarity index 82% rename from torchvision/models/vision_transformer.py rename to torchvision/prototype/models/vision_transformer.py index 461cf2c726b..6d962939e83 100644 --- a/torchvision/models/vision_transformer.py +++ b/torchvision/prototype/models/vision_transformer.py @@ -11,8 +11,14 @@ import torch.nn as nn from torch import Tensor +from ._api import Weights, WeightEntry + __all__ = [ "VisionTransformer", + "VisionTransformer_B_16Weights", + "VisionTransformer_B_32Weights", + "VisionTransformer_L_16Weights", + "VisionTransformer_L_32Weights", "vit_b_16", "vit_b_32", "vit_l_16", @@ -223,27 +229,53 @@ def forward(self, x: torch.Tensor): return x -def _vision_transformer(arch: str, pretrained: bool, progress: bool, **kwargs: Any) -> VisionTransformer: +class VisionTransformer_B_16Weights(Weights): + # If a default model is added here the corresponding changes need to be done in vit_b_16 + pass + + +class VisionTransformer_B_32Weights(Weights): + # If a default model is added here the corresponding changes need to be done in vit_b_32 + pass + + +class VisionTransformer_L_16Weights(Weights): + # If a default model is added here the corresponding changes need to be done in vit_l_16 + pass + + +class VisionTransformer_L_32Weights(Weights): + # If a default model is added here the corresponding changes need to be done in vit_l_32 + pass + + +def _vision_transformer(weights: Optional[Weights], progress: bool, **kwargs: Any) -> VisionTransformer: image_size = kwargs.get("image_size", 224) if "image_size" in kwargs: kwargs.pop("image_size") + model = VisionTransformer(image_size=image_size, **kwargs) - # TODO: Adding pre-trained models + + if weights: + model.load_state_dict(weights.get_state_dict(progress=progress)) + return model -def vit_b_16(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> VisionTransformer: +def vit_b_16(weights: bool = False, progress: bool = True, **kwargs: Any) -> VisionTransformer: """ Constructs a ViT_b_16 architecture from `"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale" `_. Args: - pretrained (bool, optional): If True, returns a model pre-trained on ImageNet. Default: False. + weights (bool, optional): If True, returns a model pre-trained on ImageNet. Default: False. progress (bool, optional): If True, displays a progress bar of the download to stderr. Default: True. """ + weights = VisionTransformer_B_16Weights.verify(weights) + return _vision_transformer( arch="b_16", - pretrained=pretrained, + weights=weights, progress=progress, patch_size=16, num_layers=12, @@ -254,18 +286,20 @@ def vit_b_16(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ) -def vit_b_32(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> VisionTransformer: +def vit_b_32(weights: bool = False, progress: bool = True, **kwargs: Any) -> VisionTransformer: """ Constructs a ViT_b_32 architecture from `"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale" `_. Args: - pretrained (bool, optional): If True, returns a model pre-trained on ImageNet. Default: False. + weights (bool, optional): If True, returns a model pre-trained on ImageNet. Default: False. progress (bool, optional): If True, displays a progress bar of the download to stderr. Default: True. 
""" + weights = VisionTransformer_B_32Weights.verify(weights) + return _vision_transformer( arch="b_32", - pretrained=pretrained, + weights=weights, progress=progress, patch_size=32, num_layers=12, @@ -276,18 +310,20 @@ def vit_b_32(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ) -def vit_l_16(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> VisionTransformer: +def vit_l_16(weights: bool = False, progress: bool = True, **kwargs: Any) -> VisionTransformer: """ Constructs a ViT_l_16 architecture from `"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale" `_. Args: - pretrained (bool, optional): If True, returns a model pre-trained on ImageNet. Default: False. + weights (bool, optional): If True, returns a model pre-trained on ImageNet. Default: False. progress (bool, optional): If True, displays a progress bar of the download to stderr. Default: True. """ + weights = VisionTransformer_L_16Weights.verify(weights) + return _vision_transformer( arch="l_16", - pretrained=pretrained, + weights=weights, progress=progress, patch_size=16, num_layers=24, @@ -298,18 +334,20 @@ def vit_l_16(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ) -def vit_l_32(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> VisionTransformer: +def vit_l_32(weights: bool = False, progress: bool = True, **kwargs: Any) -> VisionTransformer: """ Constructs a ViT_l_32 architecture from `"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale" `_. Args: - pretrained (bool, optional): If True, returns a model pre-trained on ImageNet. Default: False. + weights (bool, optional): If True, returns a model pre-trained on ImageNet. Default: False. progress (bool, optional): If True, displays a progress bar of the download to stderr. Default: True. 
""" + weights = VisionTransformer_L_32Weights.verify(weights) + return _vision_transformer( arch="l_32", - pretrained=pretrained, + weights=weights, progress=progress, patch_size=32, num_layers=24, From 22025ac86da9719a9d40776a124dbc47398f6b7b Mon Sep 17 00:00:00 2001 From: sallysyw Date: Fri, 19 Nov 2021 01:29:33 +0000 Subject: [PATCH 14/23] Fix formatting issue --- torchvision/prototype/models/vision_transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchvision/prototype/models/vision_transformer.py b/torchvision/prototype/models/vision_transformer.py index 6d962939e83..a92c0291754 100644 --- a/torchvision/prototype/models/vision_transformer.py +++ b/torchvision/prototype/models/vision_transformer.py @@ -11,7 +11,7 @@ import torch.nn as nn from torch import Tensor -from ._api import Weights, WeightEntry +from ._api import Weights __all__ = [ "VisionTransformer", From 26bc52998148f6266715aff68a73437d48f1ba78 Mon Sep 17 00:00:00 2001 From: sallysyw Date: Fri, 19 Nov 2021 02:05:50 +0000 Subject: [PATCH 15/23] remove arch in builder --- torchvision/prototype/models/vision_transformer.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/torchvision/prototype/models/vision_transformer.py b/torchvision/prototype/models/vision_transformer.py index a92c0291754..3abb1eff829 100644 --- a/torchvision/prototype/models/vision_transformer.py +++ b/torchvision/prototype/models/vision_transformer.py @@ -274,7 +274,6 @@ def vit_b_16(weights: bool = False, progress: bool = True, **kwargs: Any) -> Vis weights = VisionTransformer_B_16Weights.verify(weights) return _vision_transformer( - arch="b_16", weights=weights, progress=progress, patch_size=16, @@ -298,7 +297,6 @@ def vit_b_32(weights: bool = False, progress: bool = True, **kwargs: Any) -> Vis weights = VisionTransformer_B_32Weights.verify(weights) return _vision_transformer( - arch="b_32", weights=weights, progress=progress, patch_size=32, @@ -322,7 +320,6 @@ def vit_l_16(weights: bool = False, progress: bool = True, **kwargs: Any) -> Vis weights = VisionTransformer_L_16Weights.verify(weights) return _vision_transformer( - arch="l_16", weights=weights, progress=progress, patch_size=16, @@ -346,7 +343,6 @@ def vit_l_32(weights: bool = False, progress: bool = True, **kwargs: Any) -> Vis weights = VisionTransformer_L_32Weights.verify(weights) return _vision_transformer( - arch="l_32", weights=weights, progress=progress, patch_size=32, From cc222386ee52c0c62f464bb17e9eba99271f08ee Mon Sep 17 00:00:00 2001 From: sallysyw Date: Fri, 19 Nov 2021 19:23:52 +0000 Subject: [PATCH 16/23] Fix type err in model builder --- .../prototype/models/vision_transformer.py | 53 ++++++++++++++----- 1 file changed, 41 insertions(+), 12 deletions(-) diff --git a/torchvision/prototype/models/vision_transformer.py b/torchvision/prototype/models/vision_transformer.py index 3abb1eff829..ead41f823b2 100644 --- a/torchvision/prototype/models/vision_transformer.py +++ b/torchvision/prototype/models/vision_transformer.py @@ -3,6 +3,7 @@ # https://github.com/facebookresearch/ClassyVision/blob/main/classy_vision/models/vision_transformer.py import math +import warnings from collections import OrderedDict from functools import partial from typing import Any, Callable, Optional @@ -262,15 +263,22 @@ def _vision_transformer(weights: Optional[Weights], progress: bool, **kwargs: An return model -def vit_b_16(weights: bool = False, progress: bool = True, **kwargs: Any) -> VisionTransformer: +def vit_b_16( + weights: 
Optional[VisionTransformer_B_16Weights] = None, progress: bool = True, **kwargs: Any +) -> VisionTransformer: """ - Constructs a ViT_b_16 architecture from + Constructs a vit_b_16 architecture from `"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale" `_. Args: - weights (bool, optional): If True, returns a model pre-trained on ImageNet. Default: False. + weights (VisionTransformer_B_16Weights, optional): If not None, returns a model pre-trained on ImageNet. Default: None. progress (bool, optional): If True, displays a progress bar of the download to stderr. Default: True. """ + if "pretrained" in kwargs: + warnings.warn("The parameter pretrained is deprecated, please use weights instead.") + if kwargs.pop("pretrained"): + raise ValueError("No checkpoint is available for model type vit_b_16") + weights = VisionTransformer_B_16Weights.verify(weights) return _vision_transformer( @@ -285,15 +293,22 @@ def vit_b_16(weights: bool = False, progress: bool = True, **kwargs: Any) -> Vis ) -def vit_b_32(weights: bool = False, progress: bool = True, **kwargs: Any) -> VisionTransformer: +def vit_b_32( + weights: Optional[VisionTransformer_B_32Weights] = None, progress: bool = True, **kwargs: Any +) -> VisionTransformer: """ - Constructs a ViT_b_32 architecture from + Constructs a vit_b_32 architecture from `"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale" `_. Args: - weights (bool, optional): If True, returns a model pre-trained on ImageNet. Default: False. + weights (VisionTransformer_B_32Weights, optional): If not None, returns a model pre-trained on ImageNet. Default: None. progress (bool, optional): If True, displays a progress bar of the download to stderr. Default: True. """ + if "pretrained" in kwargs: + warnings.warn("The parameter pretrained is deprecated, please use weights instead.") + if kwargs.pop("pretrained"): + raise ValueError("No checkpoint is available for model type vit_b_32") + weights = VisionTransformer_B_32Weights.verify(weights) return _vision_transformer( @@ -308,15 +323,22 @@ def vit_b_32(weights: bool = False, progress: bool = True, **kwargs: Any) -> Vis ) -def vit_l_16(weights: bool = False, progress: bool = True, **kwargs: Any) -> VisionTransformer: +def vit_l_16( + weights: Optional[VisionTransformer_L_16Weights] = None, progress: bool = True, **kwargs: Any +) -> VisionTransformer: """ - Constructs a ViT_l_16 architecture from + Constructs a vit_l_16 architecture from `"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale" `_. Args: - weights (bool, optional): If True, returns a model pre-trained on ImageNet. Default: False. + weights (VisionTransformer_L_16Weights, optional): If not None, returns a model pre-trained on ImageNet. Default: None. progress (bool, optional): If True, displays a progress bar of the download to stderr. Default: True. 
""" + if "pretrained" in kwargs: + warnings.warn("The parameter pretrained is deprecated, please use weights instead.") + if kwargs.pop("pretrained"): + raise ValueError("No checkpoint is available for model type vit_l_16") + weights = VisionTransformer_L_16Weights.verify(weights) return _vision_transformer( @@ -331,15 +353,22 @@ def vit_l_16(weights: bool = False, progress: bool = True, **kwargs: Any) -> Vis ) -def vit_l_32(weights: bool = False, progress: bool = True, **kwargs: Any) -> VisionTransformer: +def vit_l_32( + weights: Optional[VisionTransformer_B_32Weights] = None, progress: bool = True, **kwargs: Any +) -> VisionTransformer: """ - Constructs a ViT_l_32 architecture from + Constructs a vit_l_32 architecture from `"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale" `_. Args: - weights (bool, optional): If True, returns a model pre-trained on ImageNet. Default: False. + weights (VisionTransformer_L_16Weights, optional): If not None, returns a model pre-trained on ImageNet. Default: None. progress (bool, optional): If True, displays a progress bar of the download to stderr. Default: True. """ + if "pretrained" in kwargs: + warnings.warn("The parameter pretrained is deprecated, please use weights instead.") + if kwargs.pop("pretrained"): + raise ValueError("No checkpoint is available for model type vit_l_32") + weights = VisionTransformer_L_32Weights.verify(weights) return _vision_transformer( From 41edd15ed30556b8fb671a0244ae17b78149a88c Mon Sep 17 00:00:00 2001 From: sallysyw Date: Wed, 24 Nov 2021 05:40:43 +0000 Subject: [PATCH 17/23] address comments and trigger unittests --- test/test_backbone_utils.py | 13 +++++- torchvision/models/__init__.py | 1 + .../prototype/models/vision_transformer.py | 43 +++++++++++++------ 3 files changed, 41 insertions(+), 16 deletions(-) diff --git a/test/test_backbone_utils.py b/test/test_backbone_utils.py index 04f59242b2b..61c1fc07c0f 100644 --- a/test/test_backbone_utils.py +++ b/test/test_backbone_utils.py @@ -1,5 +1,6 @@ import random from itertools import chain +from typing import Mapping, Sequence import pytest import torch @@ -98,7 +99,6 @@ def _get_return_nodes(self, model): "eq", "dim", "getattr", - "self_attention", ] train_nodes, eval_nodes = get_graph_node_names( model, tracer_kwargs={"leaf_modules": self.leaf_modules}, suppress_diff_warning=True @@ -154,7 +154,16 @@ def test_forward_backward(self, model_name): model, train_return_nodes=train_return_nodes, eval_return_nodes=eval_return_nodes ) out = model(self.inp) - sum(o.mean() for o in out.values()).backward() + out_agg = 0 + for node_out in out.values(): + if isinstance(node_out, Sequence): + out_agg += sum(o.mean() for o in node_out if o is not None) + elif isinstance(node_out, Mapping): + out_agg += sum(o.mean() for o in node_out.values() if o is not None) + else: + # Assume that the only other alternative at this point is a Tensor + out_agg += node_out.mean() + out_agg.backward() def test_feature_extraction_methods_equivalence(self): model = models.resnet18(**self.model_defaults).eval() diff --git a/torchvision/models/__init__.py b/torchvision/models/__init__.py index 516e47feb19..09ce34c868e 100644 --- a/torchvision/models/__init__.py +++ b/torchvision/models/__init__.py @@ -10,6 +10,7 @@ from .shufflenetv2 import * from .efficientnet import * from .regnet import * +from torchvision.prototype.models.vision_transformer import * from . import detection from . import feature_extraction from . 
import quantization diff --git a/torchvision/prototype/models/vision_transformer.py b/torchvision/prototype/models/vision_transformer.py index ead41f823b2..4cb66e166f6 100644 --- a/torchvision/prototype/models/vision_transformer.py +++ b/torchvision/prototype/models/vision_transformer.py @@ -250,12 +250,27 @@ class VisionTransformer_L_32Weights(Weights): pass -def _vision_transformer(weights: Optional[Weights], progress: bool, **kwargs: Any) -> VisionTransformer: - image_size = kwargs.get("image_size", 224) - if "image_size" in kwargs: - kwargs.pop("image_size") - - model = VisionTransformer(image_size=image_size, **kwargs) +def _vision_transformer( + patch_size: int, + num_layers: int, + num_heads: int, + hidden_dim: int, + mlp_dim: int, + weights: Optional[Weights], + progress: bool, + **kwargs: Any, +) -> VisionTransformer: + image_size = kwargs.pop("image_size", 224) + + model = VisionTransformer( + image_size=image_size, + patch_size=patch_size, + num_layers=num_layers, + num_heads=num_heads, + hidden_dim=hidden_dim, + mlp_dim=mlp_dim, + **kwargs, + ) if weights: model.load_state_dict(weights.get_state_dict(progress=progress)) @@ -282,13 +297,13 @@ def vit_b_16( weights = VisionTransformer_B_16Weights.verify(weights) return _vision_transformer( - weights=weights, - progress=progress, patch_size=16, num_layers=12, num_heads=12, hidden_dim=768, mlp_dim=3072, + weights=weights, + progress=progress, **kwargs, ) @@ -312,13 +327,13 @@ def vit_b_32( weights = VisionTransformer_B_32Weights.verify(weights) return _vision_transformer( - weights=weights, - progress=progress, patch_size=32, num_layers=12, num_heads=12, hidden_dim=768, mlp_dim=3072, + weights=weights, + progress=progress, **kwargs, ) @@ -342,13 +357,13 @@ def vit_l_16( weights = VisionTransformer_L_16Weights.verify(weights) return _vision_transformer( - weights=weights, - progress=progress, patch_size=16, num_layers=24, num_heads=16, hidden_dim=1024, mlp_dim=4096, + weights=weights, + progress=progress, **kwargs, ) @@ -372,12 +387,12 @@ def vit_l_32( weights = VisionTransformer_L_32Weights.verify(weights) return _vision_transformer( - weights=weights, - progress=progress, patch_size=32, num_layers=24, num_heads=16, hidden_dim=1024, mlp_dim=4096, + weights=weights, + progress=progress, **kwargs, ) From 48ce69e99d42b16a686cbfeb24a53ec6bfe1753b Mon Sep 17 00:00:00 2001 From: sallysyw Date: Wed, 24 Nov 2021 06:12:06 +0000 Subject: [PATCH 18/23] remove the prototype import in torchvision.models --- torchvision/models/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/torchvision/models/__init__.py b/torchvision/models/__init__.py index 09ce34c868e..516e47feb19 100644 --- a/torchvision/models/__init__.py +++ b/torchvision/models/__init__.py @@ -10,7 +10,6 @@ from .shufflenetv2 import * from .efficientnet import * from .regnet import * -from torchvision.prototype.models.vision_transformer import * from . import detection from . import feature_extraction from . 
import quantization From 3a6b4453db5868792b1b08a0d49991aa89cbf17d Mon Sep 17 00:00:00 2001 From: sallysyw Date: Wed, 24 Nov 2021 19:03:12 +0000 Subject: [PATCH 19/23] Adding vit back to models to trigger CircleCI test --- torchvision/models/__init__.py | 1 + torchvision/models/vision_transformer.py | 333 +++++++++++++++++++++++ 2 files changed, 334 insertions(+) create mode 100644 torchvision/models/vision_transformer.py diff --git a/torchvision/models/__init__.py b/torchvision/models/__init__.py index 516e47feb19..b2b3cdf5f3e 100644 --- a/torchvision/models/__init__.py +++ b/torchvision/models/__init__.py @@ -10,6 +10,7 @@ from .shufflenetv2 import * from .efficientnet import * from .regnet import * +from .vision_transformer import * from . import detection from . import feature_extraction from . import quantization diff --git a/torchvision/models/vision_transformer.py b/torchvision/models/vision_transformer.py new file mode 100644 index 00000000000..6e27d4f3a9a --- /dev/null +++ b/torchvision/models/vision_transformer.py @@ -0,0 +1,333 @@ +# References: +# https://github.com/google-research/vision_transformer +# https://github.com/facebookresearch/ClassyVision/blob/main/classy_vision/models/vision_transformer.py + + +import math +from collections import OrderedDict +from functools import partial +from typing import Any, Callable, Optional + +import torch +import torch.nn as nn +from torch import Tensor + + +__all__ = [ + "VisionTransformer", + "vit_b_16", + "vit_b_32", + "vit_l_16", + "vit_l_32", +] + + +class MLPBlock(nn.Sequential): + """Transformer MLP block.""" + + def __init__(self, in_dim: int, mlp_dim: int, dropout: float): + super().__init__() + self.linear_1 = nn.Linear(in_dim, mlp_dim) + self.act = nn.GELU() + self.dropout_1 = nn.Dropout(dropout) + self.linear_2 = nn.Linear(mlp_dim, in_dim) + self.dropout_2 = nn.Dropout(dropout) + self._init_weights() + + def _init_weights(self): + nn.init.xavier_uniform_(self.linear_1.weight) + nn.init.xavier_uniform_(self.linear_2.weight) + nn.init.normal_(self.linear_1.bias, std=1e-6) + nn.init.normal_(self.linear_2.bias, std=1e-6) + + +class EncoderBlock(nn.Module): + """Transformer encoder block.""" + + def __init__( + self, + num_heads: int, + hidden_dim: int, + mlp_dim: int, + dropout: float, + attention_dropout: float, + norm_layer: Callable[..., torch.nn.Module] = partial(nn.LayerNorm, eps=1e-6), + ): + super().__init__() + self.num_heads = num_heads + + # Attention block + self.ln_1 = norm_layer(hidden_dim) + self.self_attention = nn.MultiheadAttention(hidden_dim, num_heads, dropout=attention_dropout, batch_first=True) + self.dropout = nn.Dropout(dropout) + + # MLP block + self.ln_2 = norm_layer(hidden_dim) + self.mlp = MLPBlock(hidden_dim, mlp_dim, dropout) + + def forward(self, input: Tensor): + torch._assert(input.dim() == 3, f"Expected (seq_length, batch_size, hidden_dim) got {input.shape}") + x = self.ln_1(input) + x, _ = self.self_attention(query=x, key=x, value=x, need_weights=False) + x = self.dropout(x) + x = x + input + + y = self.ln_2(x) + y = self.mlp(y) + return x + y + + +class Encoder(nn.Module): + """Transformer Model Encoder for sequence to sequence translation.""" + + def __init__( + self, + seq_length: int, + num_layers: int, + num_heads: int, + hidden_dim: int, + mlp_dim: int, + dropout: float, + attention_dropout: float, + norm_layer: Callable[..., torch.nn.Module] = partial(nn.LayerNorm, eps=1e-6), + ): + super().__init__() + # Note that batch_size is on the first dim because + # we have 
batch_first=True in nn.MultiAttention() by default + self.pos_embedding = nn.Parameter(torch.empty(1, seq_length, hidden_dim).normal_(std=0.02)) # from BERT + self.dropout = nn.Dropout(dropout) + layers: OrderedDict[str, nn.Module] = OrderedDict() + for i in range(num_layers): + layers[f"encoder_layer_{i}"] = EncoderBlock( + num_heads, + hidden_dim, + mlp_dim, + dropout, + attention_dropout, + norm_layer, + ) + self.layers = nn.Sequential(layers) + self.ln = norm_layer(hidden_dim) + + def forward(self, input: Tensor): + torch._assert(input.dim() == 3, f"Expected (batch_size, seq_length, hidden_dim) got {input.shape}") + input = input + self.pos_embedding + return self.ln(self.layers(self.dropout(input))) + + +class VisionTransformer(nn.Module): + """Vision Transformer as per https://arxiv.org/abs/2010.11929.""" + + def __init__( + self, + image_size: int, + patch_size: int, + num_layers: int, + num_heads: int, + hidden_dim: int, + mlp_dim: int, + dropout: float = 0.0, + attention_dropout: float = 0.0, + num_classes: int = 1000, + representation_size: Optional[int] = None, + norm_layer: Callable[..., torch.nn.Module] = partial(nn.LayerNorm, eps=1e-6), + ): + super().__init__() + torch._assert(image_size % patch_size == 0, "Input shape indivisible by patch size!") + self.image_size = image_size + self.patch_size = patch_size + self.hidden_dim = hidden_dim + self.mlp_dim = mlp_dim + self.attention_dropout = attention_dropout + self.dropout = dropout + self.num_classes = num_classes + self.representation_size = representation_size + self.norm_layer = norm_layer + + input_channels = 3 + + # The conv_proj is a more efficient version of reshaping, permuting + # and projecting the input + self.conv_proj = nn.Conv2d(input_channels, hidden_dim, kernel_size=patch_size, stride=patch_size) + + seq_length = (image_size // patch_size) ** 2 + + # Add a class token + self.class_token = nn.Parameter(torch.zeros(1, 1, hidden_dim)) + seq_length += 1 + + self.encoder = Encoder( + seq_length, + num_layers, + num_heads, + hidden_dim, + mlp_dim, + dropout, + attention_dropout, + norm_layer, + ) + self.seq_length = seq_length + + heads_layers: OrderedDict[str, nn.Module] = OrderedDict() + if representation_size is None: + heads_layers["head"] = nn.Linear(hidden_dim, num_classes) + else: + heads_layers["pre_logits"] = nn.Linear(hidden_dim, representation_size) + heads_layers["act"] = nn.Tanh() + heads_layers["head"] = nn.Linear(representation_size, num_classes) + + self.heads = nn.Sequential(heads_layers) + self._init_weights() + + def _init_weights(self): + fan_in = self.conv_proj.in_channels * self.conv_proj.kernel_size[0] * self.conv_proj.kernel_size[1] + nn.init.trunc_normal_(self.conv_proj.weight, std=math.sqrt(1 / fan_in)) + nn.init.zeros_(self.conv_proj.bias) + + if hasattr(self.heads, "pre_logits"): + fan_in = self.heads.pre_logits.in_features + nn.init.trunc_normal_(self.heads.pre_logits.weight, std=math.sqrt(1 / fan_in)) + nn.init.zeros_(self.heads.pre_logits.bias) + + nn.init.zeros_(self.heads.head.weight) + nn.init.zeros_(self.heads.head.bias) + + def forward(self, x: torch.Tensor): + n, c, h, w = x.shape + p = self.patch_size + torch._assert(h == self.image_size, "Wrong image height!") + torch._assert(w == self.image_size, "Wrong image width!") + n_h = h // p + n_w = w // p + + # (n, c, h, w) -> (n, hidden_dim, n_h, n_w) + x = self.conv_proj(x) + # (n, hidden_dim, n_h, n_w) -> (n, hidden_dim, (n_h * n_w)) + x = x.reshape(n, self.hidden_dim, n_h * n_w) + + # (n, hidden_dim, (n_h * n_w)) -> (n, (n_h * 
n_w), hidden_dim) + # The self attention layer expects inputs in the format (N, S, E) + # where S is the source sequence length, N is the batch size, E is the + # embedding dimension + x = x.permute(0, 2, 1) + + # Expand the class token to the full batch. + batch_class_token = self.class_token.expand(n, -1, -1) + x = torch.cat([batch_class_token, x], dim=1) + + x = self.encoder(x) + + # Classifier "token" as used by standard language architectures + x = x[:, 0] + + x = self.heads(x) + + return x + + +def _vision_transformer( + patch_size: int, + num_layers: int, + num_heads: int, + hidden_dim: int, + mlp_dim: int, + pretrained: bool, + progress: bool, + **kwargs: Any, +) -> VisionTransformer: + image_size = kwargs.pop("image_size", 224) + + model = VisionTransformer( + image_size=image_size, + patch_size=patch_size, + num_layers=num_layers, + num_heads=num_heads, + hidden_dim=hidden_dim, + mlp_dim=mlp_dim, + **kwargs, + ) + + if pretrained: + raise Exception("Weights not available") # TODO: Adding pre-trained models + + return model + + +def vit_b_16(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> VisionTransformer: + """ + Constructs a ViT_b_16 architecture from + `"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale" `_. + Args: + pretrained (bool, optional): If True, returns a model pre-trained on ImageNet. Default: False. + progress (bool, optional): If True, displays a progress bar of the download to stderr. Default: True. + """ + return _vision_transformer( + patch_size=16, + num_layers=12, + num_heads=12, + hidden_dim=768, + mlp_dim=3072, + pretrained=pretrained, + progress=progress, + **kwargs, + ) + + +def vit_b_32(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> VisionTransformer: + """ + Constructs a ViT_b_32 architecture from + `"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale" `_. + Args: + pretrained (bool, optional): If True, returns a model pre-trained on ImageNet. Default: False. + progress (bool, optional): If True, displays a progress bar of the download to stderr. Default: True. + """ + return _vision_transformer( + patch_size=32, + num_layers=12, + num_heads=12, + hidden_dim=768, + mlp_dim=3072, + pretrained=pretrained, + progress=progress, + **kwargs, + ) + + +def vit_l_16(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> VisionTransformer: + """ + Constructs a ViT_l_16 architecture from + `"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale" `_. + Args: + pretrained (bool, optional): If True, returns a model pre-trained on ImageNet. Default: False. + progress (bool, optional): If True, displays a progress bar of the download to stderr. Default: True. + """ + return _vision_transformer( + patch_size=16, + num_layers=24, + num_heads=16, + hidden_dim=1024, + mlp_dim=4096, + pretrained=pretrained, + progress=progress, + **kwargs, + ) + + +def vit_l_32(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> VisionTransformer: + """ + Constructs a ViT_l_32 architecture from + `"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale" `_. + Args: + pretrained (bool, optional): If True, returns a model pre-trained on ImageNet. Default: False. + progress (bool, optional): If True, displays a progress bar of the download to stderr. Default: True. 
+ """ + return _vision_transformer( + patch_size=32, + num_layers=24, + num_heads=16, + hidden_dim=1024, + mlp_dim=4096, + pretrained=pretrained, + progress=progress, + **kwargs, + ) From 72c5af7476fd50b3d9c6dd31387351eb562f80be Mon Sep 17 00:00:00 2001 From: sallysyw Date: Wed, 24 Nov 2021 19:06:04 +0000 Subject: [PATCH 20/23] fix test_jit_forward_backward --- test/test_backbone_utils.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/test/test_backbone_utils.py b/test/test_backbone_utils.py index 61c1fc07c0f..4a375b0036a 100644 --- a/test/test_backbone_utils.py +++ b/test/test_backbone_utils.py @@ -195,7 +195,16 @@ def test_jit_forward_backward(self, model_name): ) model = torch.jit.script(model) fgn_out = model(self.inp) - sum(o.mean() for o in fgn_out.values()).backward() + out_agg = 0 + for node_out in fgn_out.values(): + if isinstance(node_out, Sequence): + out_agg += sum(o.mean() for o in node_out if o is not None) + elif isinstance(node_out, Mapping): + out_agg += sum(o.mean() for o in node_out.values() if o is not None) + else: + # Assume that the only other alternative at this point is a Tensor + out_agg += node_out.mean() + out_agg.backward() def test_train_eval(self): class TestModel(torch.nn.Module): From aae308c9ac4654b367320a36d2880d984b819491 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Thu, 25 Nov 2021 16:01:47 +0000 Subject: [PATCH 21/23] Move all to prototype. --- torchvision/models/__init__.py | 1 - torchvision/models/vision_transformer.py | 333 ----------------------- 2 files changed, 334 deletions(-) delete mode 100644 torchvision/models/vision_transformer.py diff --git a/torchvision/models/__init__.py b/torchvision/models/__init__.py index b2b3cdf5f3e..516e47feb19 100644 --- a/torchvision/models/__init__.py +++ b/torchvision/models/__init__.py @@ -10,7 +10,6 @@ from .shufflenetv2 import * from .efficientnet import * from .regnet import * -from .vision_transformer import * from . import detection from . import feature_extraction from . 
import quantization diff --git a/torchvision/models/vision_transformer.py b/torchvision/models/vision_transformer.py deleted file mode 100644 index 6e27d4f3a9a..00000000000 --- a/torchvision/models/vision_transformer.py +++ /dev/null @@ -1,333 +0,0 @@ -# References: -# https://github.com/google-research/vision_transformer -# https://github.com/facebookresearch/ClassyVision/blob/main/classy_vision/models/vision_transformer.py - - -import math -from collections import OrderedDict -from functools import partial -from typing import Any, Callable, Optional - -import torch -import torch.nn as nn -from torch import Tensor - - -__all__ = [ - "VisionTransformer", - "vit_b_16", - "vit_b_32", - "vit_l_16", - "vit_l_32", -] - - -class MLPBlock(nn.Sequential): - """Transformer MLP block.""" - - def __init__(self, in_dim: int, mlp_dim: int, dropout: float): - super().__init__() - self.linear_1 = nn.Linear(in_dim, mlp_dim) - self.act = nn.GELU() - self.dropout_1 = nn.Dropout(dropout) - self.linear_2 = nn.Linear(mlp_dim, in_dim) - self.dropout_2 = nn.Dropout(dropout) - self._init_weights() - - def _init_weights(self): - nn.init.xavier_uniform_(self.linear_1.weight) - nn.init.xavier_uniform_(self.linear_2.weight) - nn.init.normal_(self.linear_1.bias, std=1e-6) - nn.init.normal_(self.linear_2.bias, std=1e-6) - - -class EncoderBlock(nn.Module): - """Transformer encoder block.""" - - def __init__( - self, - num_heads: int, - hidden_dim: int, - mlp_dim: int, - dropout: float, - attention_dropout: float, - norm_layer: Callable[..., torch.nn.Module] = partial(nn.LayerNorm, eps=1e-6), - ): - super().__init__() - self.num_heads = num_heads - - # Attention block - self.ln_1 = norm_layer(hidden_dim) - self.self_attention = nn.MultiheadAttention(hidden_dim, num_heads, dropout=attention_dropout, batch_first=True) - self.dropout = nn.Dropout(dropout) - - # MLP block - self.ln_2 = norm_layer(hidden_dim) - self.mlp = MLPBlock(hidden_dim, mlp_dim, dropout) - - def forward(self, input: Tensor): - torch._assert(input.dim() == 3, f"Expected (seq_length, batch_size, hidden_dim) got {input.shape}") - x = self.ln_1(input) - x, _ = self.self_attention(query=x, key=x, value=x, need_weights=False) - x = self.dropout(x) - x = x + input - - y = self.ln_2(x) - y = self.mlp(y) - return x + y - - -class Encoder(nn.Module): - """Transformer Model Encoder for sequence to sequence translation.""" - - def __init__( - self, - seq_length: int, - num_layers: int, - num_heads: int, - hidden_dim: int, - mlp_dim: int, - dropout: float, - attention_dropout: float, - norm_layer: Callable[..., torch.nn.Module] = partial(nn.LayerNorm, eps=1e-6), - ): - super().__init__() - # Note that batch_size is on the first dim because - # we have batch_first=True in nn.MultiAttention() by default - self.pos_embedding = nn.Parameter(torch.empty(1, seq_length, hidden_dim).normal_(std=0.02)) # from BERT - self.dropout = nn.Dropout(dropout) - layers: OrderedDict[str, nn.Module] = OrderedDict() - for i in range(num_layers): - layers[f"encoder_layer_{i}"] = EncoderBlock( - num_heads, - hidden_dim, - mlp_dim, - dropout, - attention_dropout, - norm_layer, - ) - self.layers = nn.Sequential(layers) - self.ln = norm_layer(hidden_dim) - - def forward(self, input: Tensor): - torch._assert(input.dim() == 3, f"Expected (batch_size, seq_length, hidden_dim) got {input.shape}") - input = input + self.pos_embedding - return self.ln(self.layers(self.dropout(input))) - - -class VisionTransformer(nn.Module): - """Vision Transformer as per 
https://arxiv.org/abs/2010.11929.""" - - def __init__( - self, - image_size: int, - patch_size: int, - num_layers: int, - num_heads: int, - hidden_dim: int, - mlp_dim: int, - dropout: float = 0.0, - attention_dropout: float = 0.0, - num_classes: int = 1000, - representation_size: Optional[int] = None, - norm_layer: Callable[..., torch.nn.Module] = partial(nn.LayerNorm, eps=1e-6), - ): - super().__init__() - torch._assert(image_size % patch_size == 0, "Input shape indivisible by patch size!") - self.image_size = image_size - self.patch_size = patch_size - self.hidden_dim = hidden_dim - self.mlp_dim = mlp_dim - self.attention_dropout = attention_dropout - self.dropout = dropout - self.num_classes = num_classes - self.representation_size = representation_size - self.norm_layer = norm_layer - - input_channels = 3 - - # The conv_proj is a more efficient version of reshaping, permuting - # and projecting the input - self.conv_proj = nn.Conv2d(input_channels, hidden_dim, kernel_size=patch_size, stride=patch_size) - - seq_length = (image_size // patch_size) ** 2 - - # Add a class token - self.class_token = nn.Parameter(torch.zeros(1, 1, hidden_dim)) - seq_length += 1 - - self.encoder = Encoder( - seq_length, - num_layers, - num_heads, - hidden_dim, - mlp_dim, - dropout, - attention_dropout, - norm_layer, - ) - self.seq_length = seq_length - - heads_layers: OrderedDict[str, nn.Module] = OrderedDict() - if representation_size is None: - heads_layers["head"] = nn.Linear(hidden_dim, num_classes) - else: - heads_layers["pre_logits"] = nn.Linear(hidden_dim, representation_size) - heads_layers["act"] = nn.Tanh() - heads_layers["head"] = nn.Linear(representation_size, num_classes) - - self.heads = nn.Sequential(heads_layers) - self._init_weights() - - def _init_weights(self): - fan_in = self.conv_proj.in_channels * self.conv_proj.kernel_size[0] * self.conv_proj.kernel_size[1] - nn.init.trunc_normal_(self.conv_proj.weight, std=math.sqrt(1 / fan_in)) - nn.init.zeros_(self.conv_proj.bias) - - if hasattr(self.heads, "pre_logits"): - fan_in = self.heads.pre_logits.in_features - nn.init.trunc_normal_(self.heads.pre_logits.weight, std=math.sqrt(1 / fan_in)) - nn.init.zeros_(self.heads.pre_logits.bias) - - nn.init.zeros_(self.heads.head.weight) - nn.init.zeros_(self.heads.head.bias) - - def forward(self, x: torch.Tensor): - n, c, h, w = x.shape - p = self.patch_size - torch._assert(h == self.image_size, "Wrong image height!") - torch._assert(w == self.image_size, "Wrong image width!") - n_h = h // p - n_w = w // p - - # (n, c, h, w) -> (n, hidden_dim, n_h, n_w) - x = self.conv_proj(x) - # (n, hidden_dim, n_h, n_w) -> (n, hidden_dim, (n_h * n_w)) - x = x.reshape(n, self.hidden_dim, n_h * n_w) - - # (n, hidden_dim, (n_h * n_w)) -> (n, (n_h * n_w), hidden_dim) - # The self attention layer expects inputs in the format (N, S, E) - # where S is the source sequence length, N is the batch size, E is the - # embedding dimension - x = x.permute(0, 2, 1) - - # Expand the class token to the full batch. 
- batch_class_token = self.class_token.expand(n, -1, -1) - x = torch.cat([batch_class_token, x], dim=1) - - x = self.encoder(x) - - # Classifier "token" as used by standard language architectures - x = x[:, 0] - - x = self.heads(x) - - return x - - -def _vision_transformer( - patch_size: int, - num_layers: int, - num_heads: int, - hidden_dim: int, - mlp_dim: int, - pretrained: bool, - progress: bool, - **kwargs: Any, -) -> VisionTransformer: - image_size = kwargs.pop("image_size", 224) - - model = VisionTransformer( - image_size=image_size, - patch_size=patch_size, - num_layers=num_layers, - num_heads=num_heads, - hidden_dim=hidden_dim, - mlp_dim=mlp_dim, - **kwargs, - ) - - if pretrained: - raise Exception("Weights not available") # TODO: Adding pre-trained models - - return model - - -def vit_b_16(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> VisionTransformer: - """ - Constructs a ViT_b_16 architecture from - `"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale" `_. - Args: - pretrained (bool, optional): If True, returns a model pre-trained on ImageNet. Default: False. - progress (bool, optional): If True, displays a progress bar of the download to stderr. Default: True. - """ - return _vision_transformer( - patch_size=16, - num_layers=12, - num_heads=12, - hidden_dim=768, - mlp_dim=3072, - pretrained=pretrained, - progress=progress, - **kwargs, - ) - - -def vit_b_32(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> VisionTransformer: - """ - Constructs a ViT_b_32 architecture from - `"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale" `_. - Args: - pretrained (bool, optional): If True, returns a model pre-trained on ImageNet. Default: False. - progress (bool, optional): If True, displays a progress bar of the download to stderr. Default: True. - """ - return _vision_transformer( - patch_size=32, - num_layers=12, - num_heads=12, - hidden_dim=768, - mlp_dim=3072, - pretrained=pretrained, - progress=progress, - **kwargs, - ) - - -def vit_l_16(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> VisionTransformer: - """ - Constructs a ViT_l_16 architecture from - `"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale" `_. - Args: - pretrained (bool, optional): If True, returns a model pre-trained on ImageNet. Default: False. - progress (bool, optional): If True, displays a progress bar of the download to stderr. Default: True. - """ - return _vision_transformer( - patch_size=16, - num_layers=24, - num_heads=16, - hidden_dim=1024, - mlp_dim=4096, - pretrained=pretrained, - progress=progress, - **kwargs, - ) - - -def vit_l_32(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> VisionTransformer: - """ - Constructs a ViT_l_32 architecture from - `"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale" `_. - Args: - pretrained (bool, optional): If True, returns a model pre-trained on ImageNet. Default: False. - progress (bool, optional): If True, displays a progress bar of the download to stderr. Default: True. - """ - return _vision_transformer( - patch_size=32, - num_layers=24, - num_heads=16, - hidden_dim=1024, - mlp_dim=4096, - pretrained=pretrained, - progress=progress, - **kwargs, - ) From f0df7f8b80d43e2b59d1a3c31c08062327b34f4e Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Thu, 25 Nov 2021 17:15:53 +0000 Subject: [PATCH 22/23] Adopt new helper methods and fix prototype tests. 
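The hunks below drop the hand-rolled `pretrained` handling in every builder and route it through two shared helpers imported from `._utils`. Those helpers are not part of this diff, so the sketch below only illustrates the behaviour implied by their call sites (`_deprecated_positional(kwargs, "pretrained", "weights", True)` and `weights = _deprecated_param(kwargs, "pretrained", "weights", None)`); the names, messages and exact raise/return logic here are assumptions, not the real implementation.

import warnings
from typing import Any, Dict


def _deprecated_param(kwargs: Dict[str, Any], deprecated_param: str, new_param: str, default_value: Any) -> Any:
    # Assumed behaviour: warn about the rename, pop the legacy kwarg, and map a
    # truthy value onto the new parameter's default. With no default available
    # (i.e. no checkpoint registered, as for ViT here) a truthy value fails loudly.
    warnings.warn(f"The parameter '{deprecated_param}' is deprecated, please use '{new_param}' instead.")
    if kwargs.pop(deprecated_param):
        if default_value is None:
            raise ValueError(f"No checkpoint is available when '{deprecated_param}=True'.")
        return default_value
    return None


def _deprecated_positional(kwargs: Dict[str, Any], deprecated_param: str, new_param: str, default_value: Any) -> None:
    # Assumed behaviour: the caller passed the legacy flag positionally, so it
    # arrived in the `weights` slot as a bool; warn and reinsert it as a keyword
    # argument so that _deprecated_param above can handle it uniformly.
    warnings.warn(
        f"The positional parameter '{deprecated_param}' is deprecated, please use the keyword '{new_param}' instead."
    )
    kwargs[deprecated_param] = default_value

Under this sketch, `vit_b_16(pretrained=True)` keeps failing loudly until real checkpoints are registered, while `pretrained=False` simply builds a randomly initialised model.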
--- test/test_prototype_models.py | 7 +++- .../prototype/models/vision_transformer.py | 42 ++++++++++--------- 2 files changed, 27 insertions(+), 22 deletions(-) diff --git a/test/test_prototype_models.py b/test/test_prototype_models.py index 24fe7d21a7c..5231676111e 100644 --- a/test/test_prototype_models.py +++ b/test/test_prototype_models.py @@ -118,8 +118,11 @@ def test_old_vs_new_factory(model_fn, module_name, dev): x = [x] # compare with new model builder parameterized in the old fashion way - model_old = _build_model(_get_original_model(model_fn), **kwargs).to(device=dev) - model_new = _build_model(model_fn, **kwargs).to(device=dev) + try: + model_old = _build_model(_get_original_model(model_fn), **kwargs).to(device=dev) + model_new = _build_model(model_fn, **kwargs).to(device=dev) + except ModuleNotFoundError: + pytest.skip(f"Model '{model_name}' not available in both modules.") torch.testing.assert_close(model_new(x), model_old(x), rtol=0.0, atol=0.0, check_dtype=False) diff --git a/torchvision/prototype/models/vision_transformer.py b/torchvision/prototype/models/vision_transformer.py index 4cb66e166f6..08e805e36b0 100644 --- a/torchvision/prototype/models/vision_transformer.py +++ b/torchvision/prototype/models/vision_transformer.py @@ -13,6 +13,8 @@ from torch import Tensor from ._api import Weights +from ._utils import _deprecated_param, _deprecated_positional + __all__ = [ "VisionTransformer", @@ -286,14 +288,14 @@ def vit_b_16( `"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale" `_. Args: - weights (VisionTransformer_B_16Weights, optional): If not None, returns a model pre-trained on ImageNet. Default: None. + weights (VisionTransformer_B_16Weights, optional): If not None, returns a model pre-trained on ImageNet. + Default: None. progress (bool, optional): If True, displays a progress bar of the download to stderr. Default: True. """ + if type(weights) == bool and weights: + _deprecated_positional(kwargs, "pretrained", "weights", True) if "pretrained" in kwargs: - warnings.warn("The parameter pretrained is deprecated, please use weights instead.") - if kwargs.pop("pretrained"): - raise ValueError("No checkpoint is available for model type vit_b_16") - + weights = _deprecated_param(kwargs, "pretrained", "weights", None) weights = VisionTransformer_B_16Weights.verify(weights) return _vision_transformer( @@ -316,14 +318,14 @@ def vit_b_32( `"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale" `_. Args: - weights (VisionTransformer_B_32Weights, optional): If not None, returns a model pre-trained on ImageNet. Default: None. + weights (VisionTransformer_B_32Weights, optional): If not None, returns a model pre-trained on ImageNet. + Default: None. progress (bool, optional): If True, displays a progress bar of the download to stderr. Default: True. """ + if type(weights) == bool and weights: + _deprecated_positional(kwargs, "pretrained", "weights", True) if "pretrained" in kwargs: - warnings.warn("The parameter pretrained is deprecated, please use weights instead.") - if kwargs.pop("pretrained"): - raise ValueError("No checkpoint is available for model type vit_b_32") - + weights = _deprecated_param(kwargs, "pretrained", "weights", None) weights = VisionTransformer_B_32Weights.verify(weights) return _vision_transformer( @@ -346,14 +348,14 @@ def vit_l_16( `"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale" `_. 
Args: - weights (VisionTransformer_L_16Weights, optional): If not None, returns a model pre-trained on ImageNet. Default: None. + weights (VisionTransformer_L_16Weights, optional): If not None, returns a model pre-trained on ImageNet. + Default: None. progress (bool, optional): If True, displays a progress bar of the download to stderr. Default: True. """ + if type(weights) == bool and weights: + _deprecated_positional(kwargs, "pretrained", "weights", True) if "pretrained" in kwargs: - warnings.warn("The parameter pretrained is deprecated, please use weights instead.") - if kwargs.pop("pretrained"): - raise ValueError("No checkpoint is available for model type vit_l_16") - + weights = _deprecated_param(kwargs, "pretrained", "weights", None) weights = VisionTransformer_L_16Weights.verify(weights) return _vision_transformer( @@ -376,14 +378,14 @@ def vit_l_32( `"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale" `_. Args: - weights (VisionTransformer_L_16Weights, optional): If not None, returns a model pre-trained on ImageNet. Default: None. + weights (VisionTransformer_L_16Weights, optional): If not None, returns a model pre-trained on ImageNet. + Default: None. progress (bool, optional): If True, displays a progress bar of the download to stderr. Default: True. """ + if type(weights) == bool and weights: + _deprecated_positional(kwargs, "pretrained", "weights", True) if "pretrained" in kwargs: - warnings.warn("The parameter pretrained is deprecated, please use weights instead.") - if kwargs.pop("pretrained"): - raise ValueError("No checkpoint is available for model type vit_l_32") - + weights = _deprecated_param(kwargs, "pretrained", "weights", None) weights = VisionTransformer_L_32Weights.verify(weights) return _vision_transformer( From 3807b2340ae312af09e6ac63a1e25e72cec6f40d Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Thu, 25 Nov 2021 17:19:18 +0000 Subject: [PATCH 23/23] Remove unused import. --- torchvision/prototype/models/vision_transformer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/torchvision/prototype/models/vision_transformer.py b/torchvision/prototype/models/vision_transformer.py index 08e805e36b0..987f3af1bb4 100644 --- a/torchvision/prototype/models/vision_transformer.py +++ b/torchvision/prototype/models/vision_transformer.py @@ -3,7 +3,6 @@ # https://github.com/facebookresearch/ClassyVision/blob/main/classy_vision/models/vision_transformer.py import math -import warnings from collections import OrderedDict from functools import partial from typing import Any, Callable, Optional
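With the series applied, the ViT builders live only under `torchvision.prototype.models.vision_transformer` and no checkpoints are registered yet, so the models can only be constructed with random weights. As a minimal smoke test (the module path, the architecture defaults and the `image_size` override via `kwargs.pop("image_size", 224)` are taken from the patches above; everything else, such as the tensor sizes, is purely illustrative):

import torch
from torchvision.prototype.models.vision_transformer import vit_b_16

# Default builder: ViT-B/16 at 224x224 with 1000 classes; no weights argument
# is given, so the parameters stay randomly initialised.
model = vit_b_16()
model.eval()
with torch.no_grad():
    out = model(torch.rand(1, 3, 224, 224))
print(out.shape)  # torch.Size([1, 1000])

# image_size is forwarded through **kwargs, so any patch-divisible resolution
# works for quick tests: 32x32 gives 2x2 = 4 patches plus the class token.
tiny = vit_b_16(image_size=32)
tiny.eval()
with torch.no_grad():
    print(tiny(torch.rand(1, 3, 32, 32)).shape)  # torch.Size([1, 1000])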