diff --git a/.github/workflows/codecov.yml b/.github/workflows/codecov.yml
index bf04ce2d..822d98cc 100644
--- a/.github/workflows/codecov.yml
+++ b/.github/workflows/codecov.yml
@@ -9,9 +9,9 @@ on:
 jobs:
   build:
     runs-on: ubuntu-latest
-    
+
     steps:
-    
+
     - uses: actions/cache@v2
       if: startsWith(runner.os, 'Linux')
       with:
@@ -19,14 +19,14 @@ jobs:
         key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }}
         restore-keys: |
           ${{ runner.os }}-pip-
-          
+
     - uses: actions/checkout@v2
     - name: Set up Python 3.7
       uses: actions/setup-python@v2
       with:
         python-version: 3.7
-    
+
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
 
@@ -36,10 +36,11 @@ jobs:
     - name: Code coverage test
       run: pytest --cov-report xml --cov='./vformer/' --cov-config=.coveragerc
 
+
     - name: Generate report using codecov
       uses: codecov/codecov-action@v1
       if: always()
       with:
         fail_ci_if_error: false
         file: coverage.xml
-        env_vars: OS,PYTHON
\ No newline at end of file
+        env_vars: OS,PYTHON
diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml
index 0950b178..3ac621de 100644
--- a/.github/workflows/linting.yml
+++ b/.github/workflows/linting.yml
@@ -32,4 +32,11 @@ jobs:
     - name: Run pre-commit hooks
       run: |
         pre-commit install
-        pre-commit run -a
\ No newline at end of file
+        pre-commit run -a
+
+    - name: .RST Code Format check
+      run: |
+        pip install rstcheck
+        cd docs
+        cd Tutorial
+        rstcheck Tutorial.rst
diff --git a/docs/Tutorial/Tutorial.rst b/docs/Tutorial/Tutorial.rst
new file mode 100644
index 00000000..6bd750b8
--- /dev/null
+++ b/docs/Tutorial/Tutorial.rst
@@ -0,0 +1,150 @@
+============================================
+Building Swin Transformer Model with VFormer
+============================================
+
+In this tutorial, we will guide you through building the Swin Transformer with the building blocks available in the library.
+
+In general, most vision transformers have three main building blocks:
+
+1. Patch Embedding
+2. Encoder
+3. Decoder
+
+Now let's look at the architecture of the Swin Transformer.
+
+.. image:: ./images/SwinTransformerArchitecture.jpg
+
+The first step is to partition the image into patches, which are then projected into an embedding.
+
+These patches can be overlapping or non-overlapping, and there are different methods to extract these embeddings; please see the embedding module for more details.
+
+.. code-block:: python
+
+    import torch.nn as nn
+
+    from vformer.encoder.embedding import PatchEmbedding
+
+    patch_embedding = PatchEmbedding(
+        image_size=224,           # size of the input image
+        patch_size=4,             # size of a single patch; make sure image_size is divisible by patch_size
+        in_channels=3,            # number of input channels: 3 for RGB images, 1 for grayscale
+        embedding_dim=96,         # every patch is projected into an embedding space with `embedding_dim` dimensions
+        norm_layer=nn.LayerNorm,  # normalisation layer
+    )
+
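+To get a feel for what the patch embedding produces, you can pass a dummy image through it. The following is a minimal sanity-check sketch; the ``(batch, num_patches, embedding_dim)`` output shape and the concrete numbers in the comments are assumptions based on how the embedding is used later in this tutorial, not guarantees of the API.
+
+.. code-block:: python
+
+    import torch
+
+    # assumed input convention: (batch, channels, height, width)
+    dummy_image = torch.randn(1, 3, 224, 224)
+    patch_tokens = patch_embedding(dummy_image)
+
+    # with a 224x224 image and 4x4 patches we expect 56 * 56 = 3136 tokens,
+    # each projected into 96 dimensions -> assumed shape (1, 3136, 96)
+    print(patch_tokens.shape)
+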
+These embeddings are then encoded with the Swin Encoder block, which consists of multi-head self-attention (MHSA) followed by a multi-layer perceptron (MLP). We will not go into the details of how the encoder is implemented here.
+
+Importing the Swin Encoder:
+
+.. code-block:: python
+
+    import torch.nn as nn
+
+    from vformer.encoder import SwinEncoder
+    from vformer.functional import PatchMerging
+
+    depths = [2, 2, 6, 2]
+    num_heads = [3, 6, 12, 24]
+    patch_resolution = patch_embedding.patch_resolution
+    num_stages, embedding_dim, window_size = (4, 96, 7)
+
+    swin_encoder = nn.ModuleList()
+    for i in range(num_stages):
+        swin_encoder.append(
+            SwinEncoder(
+                dim=embedding_dim * 2 ** i,  # dimension of the embedding at the i-th stage
+                input_resolution=(
+                    patch_resolution[0] // (2 ** i),
+                    patch_resolution[1] // (2 ** i),
+                ),  # resolution of the patches at the i-th stage
+                depth=depths[i],
+                num_heads=num_heads[i],    # number of attention heads at the i-th stage
+                window_size=window_size,   # refer to window self-attention for more insight
+                norm_layer=nn.LayerNorm,   # normalisation layer
+                downsample=PatchMerging if i < num_stages - 1 else None,  # no downsampling in the last stage
+            )
+        )
+    # this ModuleList now holds all 4 stages together with their patch-merging blocks
+
+These encoded tensors are then passed through the decoder for classification.
+
+.. code-block:: python
+
+    from vformer.decoder import MLPDecoder
+
+    decoder = MLPDecoder(
+        config=[768, 256, 32],  # list of decoding dimensions
+        n_classes=10,           # number of classes
+    )
+
+Now, putting it all together:
+
+.. code-block:: python
+
+    import torch
+    import torch.nn as nn
+
+    from vformer.encoder import SwinEncoder
+    from vformer.encoder.embedding import PatchEmbedding
+    from vformer.decoder import MLPDecoder
+    from vformer.functional import PatchMerging
+
+    class SwinTransformer(nn.Module):
+        def __init__(
+            self,
+            img_size=224,
+            patch_size=4,
+            in_channels=3,
+            n_classes=10,
+            embedding_dim=96,
+            depths=[2, 2, 6, 2],
+            num_heads=[3, 6, 12, 24],
+            window_size=7,
+            mlp_ratio=4.0,
+            norm_layer=nn.LayerNorm,
+            decoder_config=[768, 256, 32, 10],
+            patch_norm=True,
+        ):
+            super().__init__()
+            self.patch_embed = PatchEmbedding(
+                img_size=img_size,
+                patch_size=patch_size,
+                in_channels=in_channels,
+                embedding_dim=embedding_dim,
+                norm_layer=norm_layer,
+            )
+            self.patch_resolution = self.patch_embed.patch_resolution
+
+            self.encoder = nn.ModuleList()
+            for i_layer in range(len(depths)):
+                layer = SwinEncoder(
+                    dim=int(embedding_dim * (2 ** i_layer)),
+                    input_resolution=(
+                        self.patch_resolution[0] // (2 ** i_layer),
+                        self.patch_resolution[1] // (2 ** i_layer),
+                    ),
+                    depth=depths[i_layer],
+                    num_heads=num_heads[i_layer],
+                    window_size=window_size,
+                    mlp_ratio=mlp_ratio,
+                    norm_layer=norm_layer,
+                    downsample=PatchMerging if i_layer < len(depths) - 1 else None,
+                )
+                self.encoder.append(layer)
+
+            self.pool = nn.AdaptiveAvgPool1d(1)
+            self.decoder = MLPDecoder(config=decoder_config, n_classes=n_classes)
+
+        def forward(self, x):
+            # patchify the image, run it through all encoder stages,
+            # pool over the token dimension, and classify
+            x = self.patch_embed(x)
+            for layer in self.encoder:
+                x = layer(x)
+            x = self.pool(x.transpose(1, 2)).flatten(1)
+            return self.decoder(x)
+
+    model = SwinTransformer()
+
+Some popular vision transformer models are already implemented in VFormer; you can use them directly from the ``vformer/models`` directory, for example:
+
+.. code-block:: python
+
+    from vformer.models import SwinTransformer
+
+    model = SwinTransformer(
+        img_size=224,
+        patch_size=4,
+        in_channels=3,
+        window_size=7,
+        n_classes=10,
+    )
diff --git a/docs/Tutorial/images/SwinTransformerArchitecture.jpg b/docs/Tutorial/images/SwinTransformerArchitecture.jpg
new file mode 100644
index 00000000..f1cf0f28
Binary files /dev/null and b/docs/Tutorial/images/SwinTransformerArchitecture.jpg differ