From 759f2246dbfc7cd328c278f3c8c0afde3aedaa03 Mon Sep 17 00:00:00 2001 From: Mo Kweon Date: Wed, 14 Jul 2021 12:56:43 -0700 Subject: [PATCH 1/2] feat: add PR-330 --- server/internal/data/mapping_table.pbtxt | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/server/internal/data/mapping_table.pbtxt b/server/internal/data/mapping_table.pbtxt index 0cef2cd0..9231acbb 100644 --- a/server/internal/data/mapping_table.pbtxt +++ b/server/internal/data/mapping_table.pbtxt @@ -2444,3 +2444,10 @@ rows: { paper_arxiv_ids: "2105.01601" youtube_video_id: "NicKVB-rpc8" } +rows: { + pr_id: 330 + paper_arxiv_ids: "2106.10270" + paper_arxiv_ids: "2106.10270" + paper_arxiv_ids: "2012.12877" + youtube_video_id: "A3RrAIx-KCc" +} From 71ac85397506a812af9b9467ff37b8403467e86f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EC=BD=94=EB=94=A9=EB=83=84=EB=B9=84?= Date: Wed, 14 Jul 2021 20:32:57 +0000 Subject: [PATCH 2/2] chore: update database --- server/internal/data/database.pbtxt | 458 ++++++++++++++++++++-------- 1 file changed, 330 insertions(+), 128 deletions(-) diff --git a/server/internal/data/database.pbtxt b/server/internal/data/database.pbtxt index 1228dfe8..2b5425ce 100644 --- a/server/internal/data/database.pbtxt +++ b/server/internal/data/database.pbtxt @@ -97,7 +97,7 @@ pr_id_to_video: { video_id: "L3hz57whyNw" video_title: "PR-001: Generative adversarial nets by Jaejun Yoo (2017/4/13)" number_of_likes: 256 - number_of_views: 34725 + number_of_views: 34726 published_date: { seconds: 1492839397 } @@ -883,7 +883,7 @@ pr_id_to_video: { video: { video_id: "tOItokBZSfU" video_title: "PR-009: Distilling the Knowledge in a Neural Network (Slide: English, Speaking: Korean)" - number_of_likes: 45 + number_of_likes: 44 number_of_views: 6498 published_date: { seconds: 1495514577 @@ -984,7 +984,7 @@ pr_id_to_video: { video_id: "KYA-GEhObIs" video_title: "PR-010: Auto-Encoding Variational Bayes, ICLR 2014" number_of_likes: 205 - number_of_views: 12218 + number_of_views: 12220 published_date: { seconds: 1495549847 } @@ -1958,7 +1958,7 @@ pr_id_to_video: { video_id: "h2WSVBAC1t4" video_title: "PR-019: Continuous Control with Deep Reinforcement Learning" number_of_likes: 52 - number_of_views: 5368 + number_of_views: 5369 published_date: { seconds: 1498452479 } @@ -2530,7 +2530,7 @@ pr_id_to_video: { video_id: "6fdclSGgeio" video_title: "PR-023: YOLO9000: Better, Faster, Stronger" number_of_likes: 96 - number_of_views: 12623 + number_of_views: 12624 published_date: { seconds: 1500299473 } @@ -5104,7 +5104,7 @@ pr_id_to_video: { video_id: "iCgT8G4PkqI" video_title: "PR-051: Conditional Generative Adversarial Nets" number_of_likes: 25 - number_of_views: 3517 + number_of_views: 3518 published_date: { seconds: 1512310569 } @@ -7425,7 +7425,7 @@ pr_id_to_video: { video_id: "UXVKSSXdwb8" video_title: "PR-079: Synthesizing Audio with Generative Adversarial Networks" number_of_likes: 21 - number_of_views: 1310 + number_of_views: 1311 published_date: { seconds: 1523206394 } @@ -8073,7 +8073,7 @@ pr_id_to_video: { video_id: "iXSYqohGQhM" video_title: "PR-087: Spectral Normalization for Generative Adversarial Networks" number_of_likes: 45 - number_of_views: 4735 + number_of_views: 4736 published_date: { seconds: 1526221916 } @@ -10666,7 +10666,7 @@ pr_id_to_video: { video_id: "8PoewOpK6b4" video_title: "PR-125: ENERGY-BASED GENERATIVE ADVERSARIAL NETWORKS" number_of_likes: 7 - number_of_views: 711 + number_of_views: 712 published_date: { seconds: 1544368518 } @@ -15776,6 +15776,11 @@ pr_id_to_video: { full_name: "Bottleneck 
Residual Block" description: "A **Bottleneck Residual Block** is a variant of the [residual block](https://paperswithcode.com/method/residual-block) that utilises 1x1 convolutions to create a bottleneck. The use of a bottleneck reduces the number of parameters and matrix multiplications. The idea is to make residual blocks as thin as possible to increase depth and have less parameters. They were introduced as part of the [ResNet](https://paperswithcode.com/method/resnet) architecture, and are used as part of deeper ResNets such as ResNet-50 and ResNet-101." } + methods: { + name: "Weight Decay" + full_name: "Weight Decay" + description: "**Weight Decay**, or **$L_{2}$ Regularization**, is a regularization technique applied to the weights of a neural network. We minimize a loss function compromising both the primary loss function and a penalty on the $L\\_{2}$ Norm of the weights:\r\n\r\n$$L\\_{new}\\left(w\\right) = L\\_{original}\\left(w\\right) + \\lambda{w^{T}w}$$\r\n\r\nwhere $\\lambda$ is a value determining the strength of the penalty (encouraging smaller weights). \r\n\r\nWeight decay can be incorporated directly into the weight update rule, rather than just implicitly by defining it through to objective function. Often weight decay refers to the implementation where we specify it directly in the weight update rule (whereas L2 regularization is usually the implementation which is specified in the objective function).\r\n\r\nImage Source: Deep Learning, Goodfellow et al" + } methods: { name: "Attention Dropout" full_name: "Attention Dropout" @@ -15796,11 +15801,6 @@ pr_id_to_video: { full_name: "BERT" description: "**BERT**, or Bidirectional Encoder Representations from Transformers, improves upon standard [Transformers](http://paperswithcode.com/method/transformer) by removing the unidirectionality constraint by using a *masked language model* (MLM) pre-training objective. The masked language model randomly masks some of the tokens from the input, and the objective is to predict the original vocabulary id of the masked word based only on its context. Unlike left-to-right language model pre-training, the MLM objective enables the representation to fuse the left and the right context, which allows us to pre-train a deep bidirectional Transformer. In addition to the masked language model, BERT uses a *next sentence prediction* task that jointly pre-trains text-pair representations. \r\n\r\nThere are two steps in BERT: *pre-training* and *fine-tuning*. During pre-training, the model is trained on unlabeled data over different pre-training tasks. For fine-tuning, the BERT model is first initialized with the pre-trained parameters, and all of the parameters are fine-tuned using labeled data from the downstream tasks. Each downstream task has separate fine-tuned models, even though they\r\nare initialized with the same pre-trained parameters." } - methods: { - name: "Weight Decay" - full_name: "Weight Decay" - description: "**Weight Decay**, or **$L_{2}$ Regularization**, is a regularization technique applied to the weights of a neural network. We minimize a loss function compromising both the primary loss function and a penalty on the $L\\_{2}$ Norm of the weights:\r\n\r\n$$L\\_{new}\\left(w\\right) = L\\_{original}\\left(w\\right) + \\lambda{w^{T}w}$$\r\n\r\nwhere $\\lambda$ is a value determining the strength of the penalty (encouraging smaller weights). 
\r\n\r\nWeight decay can be incorporated directly into the weight update rule, rather than just implicitly by defining it through to objective function. Often weight decay refers to the implementation where we specify it directly in the weight update rule (whereas L2 regularization is usually the implementation which is specified in the objective function).\r\n\r\nImage Source: Deep Learning, Goodfellow et al" - } methods: { name: "Adam" full_name: "Adam" @@ -15826,7 +15826,7 @@ pr_id_to_video: { video_id: "YiKn93Ud4dA" video_title: "PR-189: Unsupervised Data Augmentation for Consistency Training" number_of_likes: 18 - number_of_views: 1201 + number_of_views: 1202 published_date: { seconds: 1566745737 } @@ -15950,7 +15950,7 @@ pr_id_to_video: { video_id: "xaABseUoHAI" video_title: "PR-190: A Baseline For Detecting Misclassified and Out-of-Distribution Examples In Neural Networks" number_of_likes: 10 - number_of_views: 1071 + number_of_views: 1072 published_date: { seconds: 1569764236 } @@ -17288,7 +17288,7 @@ pr_id_to_video: { video_id: "HMgcvgRrDcA" video_title: "PR-207: YOLOv3: An Incremental Improvement" number_of_likes: 117 - number_of_views: 7473 + number_of_views: 7474 published_date: { seconds: 1574001134 } @@ -19985,6 +19985,11 @@ pr_id_to_video: { number_of_stars: 1 description: "Object Detection with YOLO and Streamlit" } + methods: { + name: "Pointwise Convolution" + full_name: "Pointwise Convolution" + description: "**Pointwise Convolution** is a type of convolution that uses a 1x1 kernel: a kernel that iterates through every single point. This kernel has a depth of however many channels the input image has. It can be used in conjunction with [depthwise convolutions](https://paperswithcode.com/method/depthwise-convolution) to produce an efficient class of convolutions known as [depthwise-separable convolutions](https://paperswithcode.com/method/depthwise-separable-convolution).\r\n\r\nImage Credit: [Chi-Feng Wang](https://towardsdatascience.com/a-basic-introduction-to-separable-convolutions-b99ec3102728)" + } methods: { name: "Spatial Attention Module" full_name: "Spatial Attention Module" @@ -20001,39 +20006,34 @@ pr_id_to_video: { description: "**PAFPN** is a feature pyramid module used in Path Aggregation networks ([PANet](https://paperswithcode.com/method/panet)) that combines FPNs with bottom-up path augmentation, which shortens the information path between lower layers and topmost feature." } methods: { - name: "YOLOv3" - full_name: "YOLOv3" - description: "**YOLOv3** is a real-time, single-stage object detection model that builds on [YOLOv2](https://paperswithcode.com/method/yolov2) with several improvements. Improvements include the use of a new backbone network, Darknet-53 that utilises residual connections, or in the words of the author, \"those newfangled residual network stuff\", as well as some improvements to the bounding box prediction step, and use of three different scales from which to extract features (similar to an FPN)." - } - methods: { - name: "Pointwise Convolution" - full_name: "Pointwise Convolution" - description: "**Pointwise Convolution** is a type of convolution that uses a 1x1 kernel: a kernel that iterates through every single point. This kernel has a depth of however many channels the input image has. 
It can be used in conjunction with [depthwise convolutions](https://paperswithcode.com/method/depthwise-convolution) to produce an efficient class of convolutions known as [depthwise-separable convolutions](https://paperswithcode.com/method/depthwise-separable-convolution).\r\n\r\nImage Credit: [Chi-Feng Wang](https://towardsdatascience.com/a-basic-introduction-to-separable-convolutions-b99ec3102728)" + name: "DIoU-NMS" + full_name: "DIoU-NMS" + description: "**DIoU-NMS** is a type of non-maximum suppression where we use Distance IoU rather than regular DIoU, in which the overlap area and the distance between two central points of bounding boxes are simultaneously considered when suppressing redundant boxes.\r\n\r\nIn original NMS, the IoU metric is used to suppress the redundant detection boxes, where the overlap area is the unique factor, often yielding false suppression for the cases with occlusion. With DIoU-NMS, we not only consider the overlap area but also central point distance between two boxes." } methods: { - name: "Depthwise Separable Convolution" - full_name: "Depthwise Separable Convolution" - description: "While [standard convolution](https://paperswithcode.com/method/convolution) performs the channelwise and spatial-wise computation in one step, **Depthwise Separable Convolution** splits the computation into two steps: depthwise convolution applies a single convolutional filter per each input channel and pointwise convolution is used to create a linear combination of the output of the depthwise convolution. The comparison of standard convolution and depthwise separable convolution is shown to the right.\r\n\r\nCredit: [Depthwise Convolution Is All You Need for Learning Multiple Visual Domains](https://paperswithcode.com/paper/depthwise-convolution-is-all-you-need-for)" + name: "Cosine Annealing" + full_name: "Cosine Annealing" + description: "**Cosine Annealing** is a type of learning rate schedule that has the effect of starting with a large learning rate that is relatively rapidly decreased to a minimum value before being increased rapidly again. The resetting of the learning rate acts like a simulated restart of the learning process and the re-use of good weights as the starting point of the restart is referred to as a \"warm restart\" in contrast to a \"cold restart\" where a new set of small random numbers may be used as a starting point.\r\n\r\n$$\\eta\\_{t} = \\eta\\_{min}^{i} + \\frac{1}{2}\\left(\\eta\\_{max}^{i}-\\eta\\_{min}^{i}\\right)\\left(1+\\cos\\left(\\frac{T\\_{cur}}{T\\_{i}}\\pi\\right)\\right)\r\n$$\r\n\r\nWhere where $\\eta\\_{min}^{i}$ and $ \\eta\\_{max}^{i}$ are ranges for the learning rate, and $T\\_{cur}$ account for how many epochs have been performed since the last restart.\r\n\r\nText Source: [Jason Brownlee](https://machinelearningmastery.com/snapshot-ensemble-deep-learning-neural-network/)\r\n\r\nImage Source: [Gao Huang](https://www.researchgate.net/figure/Training-loss-of-100-layer-DenseNet-on-CIFAR10-using-standard-learning-rate-blue-and-M_fig2_315765130)" } methods: { - name: "k-Means Clustering" - full_name: "k-Means Clustering" - description: "**k-Means Clustering** is a clustering algorithm that divides a training set into $k$ different clusters of examples that are near each other. 
It works by initializing $k$ different centroids {$\\mu\\left(1\\right),\\ldots,\\mu\\left(k\\right)$} to different values, then alternating between two steps until convergence:\r\n\r\n(i) each training example is assigned to cluster $i$ where $i$ is the index of the nearest centroid $\\mu^{(i)}$\r\n\r\n(ii) each centroid $\\mu^{(i)}$ is updated to the mean of all training examples $x^{(j)}$ assigned to cluster $i$.\r\n\r\nText Source: Deep Learning, Goodfellow et al\r\n\r\nImage Source: [scikit-learn](https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_digits.html)" + name: "Weight Decay" + full_name: "Weight Decay" + description: "**Weight Decay**, or **$L_{2}$ Regularization**, is a regularization technique applied to the weights of a neural network. We minimize a loss function compromising both the primary loss function and a penalty on the $L\\_{2}$ Norm of the weights:\r\n\r\n$$L\\_{new}\\left(w\\right) = L\\_{original}\\left(w\\right) + \\lambda{w^{T}w}$$\r\n\r\nwhere $\\lambda$ is a value determining the strength of the penalty (encouraging smaller weights). \r\n\r\nWeight decay can be incorporated directly into the weight update rule, rather than just implicitly by defining it through to objective function. Often weight decay refers to the implementation where we specify it directly in the weight update rule (whereas L2 regularization is usually the implementation which is specified in the objective function).\r\n\r\nImage Source: Deep Learning, Goodfellow et al" } methods: { - name: "BiFPN" - full_name: "BiFPN" - description: "A **BiFPN**, or **Weighted Bi-directional Feature Pyramid Network**, is a type of feature pyramid network which allows easy and fast multi-scale feature fusion. It incorporates the multi-level feature fusion idea from [FPN](https://paperswithcode.com/method/fpn), [PANet](https://paperswithcode.com/method/panet) and [NAS-FPN](https://paperswithcode.com/method/nas-fpn) that enables information to flow in both the top-down and bottom-up directions, while using regular and efficient connections. It also utilizes a fast normalized fusion technique. Traditional approaches usually treat all features input to the FPN equally, even those with different resolutions. However, input features at different resolutions often have unequal contributions to the output features. Thus, the BiFPN adds an additional weight for each input feature allowing the network to learn the importance of each. All regular convolutions are also replaced with less expensive depthwise separable convolutions.\r\n\r\nComparing with PANet, PANet added an extra bottom-up path for information flow at the expense of more computational cost. Whereas BiFPN optimizes these cross-scale connections by removing nodes with a single input edge, adding an extra edge from the original input to output node if they are on the same level, and treating each bidirectional path as one feature network layer (repeating it several times for more high-level future fusion)." + name: "YOLOv3" + full_name: "YOLOv3" + description: "**YOLOv3** is a real-time, single-stage object detection model that builds on [YOLOv2](https://paperswithcode.com/method/yolov2) with several improvements. 
Improvements include the use of a new backbone network, Darknet-53 that utilises residual connections, or in the words of the author, \"those newfangled residual network stuff\", as well as some improvements to the bounding box prediction step, and use of three different scales from which to extract features (similar to an FPN)." } methods: { - name: "ResNeXt Block" - full_name: "ResNeXt Block" - description: "A **ResNeXt Block** is a type of residual block used as part of the [ResNeXt](https://paperswithcode.com/method/resnext) CNN architecture. It uses a \"split-transform-merge\" strategy (branched paths within a single module) similar to an [Inception module](https://paperswithcode.com/method/inception-module), i.e. it aggregates a set of transformations. Compared to a Residual Block, it exposes a new dimension, *cardinality* (size of set of transformations) $C$, as an essential factor in addition to depth and width. \r\n\r\nFormally, a set of aggregated transformations can be represented as: $\\mathcal{F}(x)=\\sum_{i=1}^{C}\\mathcal{T}_i(x)$, where $\\mathcal{T}_i(x)$ can be an arbitrary function. Analogous to a simple neuron, $\\mathcal{T}_i$ should project $x$ into an (optionally low-dimensional) embedding and then transform it." + name: "RoIAlign" + full_name: "RoIAlign" + description: "**Region of Interest Align**, or **RoIAlign**, is an operation for extracting a small feature map from each RoI in detection and segmentation based tasks. It removes the harsh quantization of [RoI Pool](https://paperswithcode.com/method/roi-pooling), properly *aligning* the extracted features with the input. To avoid any quantization of the RoI boundaries or bins (using $x/16$ instead of $[x/16]$), RoIAlign uses bilinear interpolation to compute the exact values of the input features at four regularly sampled locations in each RoI bin, and the result is then aggregated (using max or average)." } methods: { - name: "Darknet-53" - full_name: "Darknet-53" - description: "**Darknet-53** is a convolutional neural network that acts as a backbone for the [YOLOv3](https://paperswithcode.com/method/yolov3) object detection approach. The improvements upon its predecessor [Darknet-19](https://paperswithcode.com/method/darknet-19) include the use of residual connections, as well as more layers." + name: "Depthwise Separable Convolution" + full_name: "Depthwise Separable Convolution" + description: "While [standard convolution](https://paperswithcode.com/method/convolution) performs the channelwise and spatial-wise computation in one step, **Depthwise Separable Convolution** splits the computation into two steps: depthwise convolution applies a single convolutional filter per each input channel and pointwise convolution is used to create a linear combination of the output of the depthwise convolution. 
The comparison of standard convolution and depthwise separable convolution is shown to the right.\r\n\r\nCredit: [Depthwise Convolution Is All You Need for Learning Multiple Visual Domains](https://paperswithcode.com/paper/depthwise-convolution-is-all-you-need-for)" } } papers: { @@ -26429,7 +26429,7 @@ pr_id_to_video: { video_id: "d2IaWtBbJjg" video_title: "PR-242: BERT4Rec -Sequential Recommendation with BERT" number_of_likes: 12 - number_of_views: 858 + number_of_views: 859 published_date: { seconds: 1593349958 } @@ -27832,6 +27832,11 @@ pr_id_to_video: { number_of_stars: 1 description: "Object Detection with YOLO and Streamlit" } + methods: { + name: "Pointwise Convolution" + full_name: "Pointwise Convolution" + description: "**Pointwise Convolution** is a type of convolution that uses a 1x1 kernel: a kernel that iterates through every single point. This kernel has a depth of however many channels the input image has. It can be used in conjunction with [depthwise convolutions](https://paperswithcode.com/method/depthwise-convolution) to produce an efficient class of convolutions known as [depthwise-separable convolutions](https://paperswithcode.com/method/depthwise-separable-convolution).\r\n\r\nImage Credit: [Chi-Feng Wang](https://towardsdatascience.com/a-basic-introduction-to-separable-convolutions-b99ec3102728)" + } methods: { name: "Spatial Attention Module" full_name: "Spatial Attention Module" @@ -27848,39 +27853,34 @@ pr_id_to_video: { description: "**PAFPN** is a feature pyramid module used in Path Aggregation networks ([PANet](https://paperswithcode.com/method/panet)) that combines FPNs with bottom-up path augmentation, which shortens the information path between lower layers and topmost feature." } methods: { - name: "YOLOv3" - full_name: "YOLOv3" - description: "**YOLOv3** is a real-time, single-stage object detection model that builds on [YOLOv2](https://paperswithcode.com/method/yolov2) with several improvements. Improvements include the use of a new backbone network, Darknet-53 that utilises residual connections, or in the words of the author, \"those newfangled residual network stuff\", as well as some improvements to the bounding box prediction step, and use of three different scales from which to extract features (similar to an FPN)." + name: "DIoU-NMS" + full_name: "DIoU-NMS" + description: "**DIoU-NMS** is a type of non-maximum suppression where we use Distance IoU rather than regular DIoU, in which the overlap area and the distance between two central points of bounding boxes are simultaneously considered when suppressing redundant boxes.\r\n\r\nIn original NMS, the IoU metric is used to suppress the redundant detection boxes, where the overlap area is the unique factor, often yielding false suppression for the cases with occlusion. With DIoU-NMS, we not only consider the overlap area but also central point distance between two boxes." } methods: { - name: "Pointwise Convolution" - full_name: "Pointwise Convolution" - description: "**Pointwise Convolution** is a type of convolution that uses a 1x1 kernel: a kernel that iterates through every single point. This kernel has a depth of however many channels the input image has. 
It can be used in conjunction with [depthwise convolutions](https://paperswithcode.com/method/depthwise-convolution) to produce an efficient class of convolutions known as [depthwise-separable convolutions](https://paperswithcode.com/method/depthwise-separable-convolution).\r\n\r\nImage Credit: [Chi-Feng Wang](https://towardsdatascience.com/a-basic-introduction-to-separable-convolutions-b99ec3102728)" - } - methods: { - name: "Depthwise Separable Convolution" - full_name: "Depthwise Separable Convolution" - description: "While [standard convolution](https://paperswithcode.com/method/convolution) performs the channelwise and spatial-wise computation in one step, **Depthwise Separable Convolution** splits the computation into two steps: depthwise convolution applies a single convolutional filter per each input channel and pointwise convolution is used to create a linear combination of the output of the depthwise convolution. The comparison of standard convolution and depthwise separable convolution is shown to the right.\r\n\r\nCredit: [Depthwise Convolution Is All You Need for Learning Multiple Visual Domains](https://paperswithcode.com/paper/depthwise-convolution-is-all-you-need-for)" + name: "Cosine Annealing" + full_name: "Cosine Annealing" + description: "**Cosine Annealing** is a type of learning rate schedule that has the effect of starting with a large learning rate that is relatively rapidly decreased to a minimum value before being increased rapidly again. The resetting of the learning rate acts like a simulated restart of the learning process and the re-use of good weights as the starting point of the restart is referred to as a \"warm restart\" in contrast to a \"cold restart\" where a new set of small random numbers may be used as a starting point.\r\n\r\n$$\\eta\\_{t} = \\eta\\_{min}^{i} + \\frac{1}{2}\\left(\\eta\\_{max}^{i}-\\eta\\_{min}^{i}\\right)\\left(1+\\cos\\left(\\frac{T\\_{cur}}{T\\_{i}}\\pi\\right)\\right)\r\n$$\r\n\r\nWhere where $\\eta\\_{min}^{i}$ and $ \\eta\\_{max}^{i}$ are ranges for the learning rate, and $T\\_{cur}$ account for how many epochs have been performed since the last restart.\r\n\r\nText Source: [Jason Brownlee](https://machinelearningmastery.com/snapshot-ensemble-deep-learning-neural-network/)\r\n\r\nImage Source: [Gao Huang](https://www.researchgate.net/figure/Training-loss-of-100-layer-DenseNet-on-CIFAR10-using-standard-learning-rate-blue-and-M_fig2_315765130)" } methods: { - name: "k-Means Clustering" - full_name: "k-Means Clustering" - description: "**k-Means Clustering** is a clustering algorithm that divides a training set into $k$ different clusters of examples that are near each other. It works by initializing $k$ different centroids {$\\mu\\left(1\\right),\\ldots,\\mu\\left(k\\right)$} to different values, then alternating between two steps until convergence:\r\n\r\n(i) each training example is assigned to cluster $i$ where $i$ is the index of the nearest centroid $\\mu^{(i)}$\r\n\r\n(ii) each centroid $\\mu^{(i)}$ is updated to the mean of all training examples $x^{(j)}$ assigned to cluster $i$.\r\n\r\nText Source: Deep Learning, Goodfellow et al\r\n\r\nImage Source: [scikit-learn](https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_digits.html)" + name: "Weight Decay" + full_name: "Weight Decay" + description: "**Weight Decay**, or **$L_{2}$ Regularization**, is a regularization technique applied to the weights of a neural network. 
We minimize a loss function compromising both the primary loss function and a penalty on the $L\\_{2}$ Norm of the weights:\r\n\r\n$$L\\_{new}\\left(w\\right) = L\\_{original}\\left(w\\right) + \\lambda{w^{T}w}$$\r\n\r\nwhere $\\lambda$ is a value determining the strength of the penalty (encouraging smaller weights). \r\n\r\nWeight decay can be incorporated directly into the weight update rule, rather than just implicitly by defining it through to objective function. Often weight decay refers to the implementation where we specify it directly in the weight update rule (whereas L2 regularization is usually the implementation which is specified in the objective function).\r\n\r\nImage Source: Deep Learning, Goodfellow et al" } methods: { - name: "BiFPN" - full_name: "BiFPN" - description: "A **BiFPN**, or **Weighted Bi-directional Feature Pyramid Network**, is a type of feature pyramid network which allows easy and fast multi-scale feature fusion. It incorporates the multi-level feature fusion idea from [FPN](https://paperswithcode.com/method/fpn), [PANet](https://paperswithcode.com/method/panet) and [NAS-FPN](https://paperswithcode.com/method/nas-fpn) that enables information to flow in both the top-down and bottom-up directions, while using regular and efficient connections. It also utilizes a fast normalized fusion technique. Traditional approaches usually treat all features input to the FPN equally, even those with different resolutions. However, input features at different resolutions often have unequal contributions to the output features. Thus, the BiFPN adds an additional weight for each input feature allowing the network to learn the importance of each. All regular convolutions are also replaced with less expensive depthwise separable convolutions.\r\n\r\nComparing with PANet, PANet added an extra bottom-up path for information flow at the expense of more computational cost. Whereas BiFPN optimizes these cross-scale connections by removing nodes with a single input edge, adding an extra edge from the original input to output node if they are on the same level, and treating each bidirectional path as one feature network layer (repeating it several times for more high-level future fusion)." + name: "YOLOv3" + full_name: "YOLOv3" + description: "**YOLOv3** is a real-time, single-stage object detection model that builds on [YOLOv2](https://paperswithcode.com/method/yolov2) with several improvements. Improvements include the use of a new backbone network, Darknet-53 that utilises residual connections, or in the words of the author, \"those newfangled residual network stuff\", as well as some improvements to the bounding box prediction step, and use of three different scales from which to extract features (similar to an FPN)." } methods: { - name: "ResNeXt Block" - full_name: "ResNeXt Block" - description: "A **ResNeXt Block** is a type of residual block used as part of the [ResNeXt](https://paperswithcode.com/method/resnext) CNN architecture. It uses a \"split-transform-merge\" strategy (branched paths within a single module) similar to an [Inception module](https://paperswithcode.com/method/inception-module), i.e. it aggregates a set of transformations. Compared to a Residual Block, it exposes a new dimension, *cardinality* (size of set of transformations) $C$, as an essential factor in addition to depth and width. 
\r\n\r\nFormally, a set of aggregated transformations can be represented as: $\\mathcal{F}(x)=\\sum_{i=1}^{C}\\mathcal{T}_i(x)$, where $\\mathcal{T}_i(x)$ can be an arbitrary function. Analogous to a simple neuron, $\\mathcal{T}_i$ should project $x$ into an (optionally low-dimensional) embedding and then transform it." + name: "RoIAlign" + full_name: "RoIAlign" + description: "**Region of Interest Align**, or **RoIAlign**, is an operation for extracting a small feature map from each RoI in detection and segmentation based tasks. It removes the harsh quantization of [RoI Pool](https://paperswithcode.com/method/roi-pooling), properly *aligning* the extracted features with the input. To avoid any quantization of the RoI boundaries or bins (using $x/16$ instead of $[x/16]$), RoIAlign uses bilinear interpolation to compute the exact values of the input features at four regularly sampled locations in each RoI bin, and the result is then aggregated (using max or average)." } methods: { - name: "Darknet-53" - full_name: "Darknet-53" - description: "**Darknet-53** is a convolutional neural network that acts as a backbone for the [YOLOv3](https://paperswithcode.com/method/yolov3) object detection approach. The improvements upon its predecessor [Darknet-19](https://paperswithcode.com/method/darknet-19) include the use of residual connections, as well as more layers." + name: "Depthwise Separable Convolution" + full_name: "Depthwise Separable Convolution" + description: "While [standard convolution](https://paperswithcode.com/method/convolution) performs the channelwise and spatial-wise computation in one step, **Depthwise Separable Convolution** splits the computation into two steps: depthwise convolution applies a single convolutional filter per each input channel and pointwise convolution is used to create a linear combination of the output of the depthwise convolution. The comparison of standard convolution and depthwise separable convolution is shown to the right.\r\n\r\nCredit: [Depthwise Convolution Is All You Need for Learning Multiple Visual Domains](https://paperswithcode.com/paper/depthwise-convolution-is-all-you-need-for)" } } papers: { @@ -29518,31 +29518,11 @@ pr_id_to_video: { number_of_stars: 2233 description: "Sandbox for training deep learning networks" } - methods: { - name: "Random Resized Crop" - full_name: "Random Resized Crop" - description: "**RandomResizedCrop** is a type of image data augmentation where a crop of random size of the original size and a random aspect ratio of the original aspect ratio is made. This crop is finally resized to given size.\r\n\r\nImage Credit: [Apache MXNet](https://mxnet.apache.org/versions/1.5.0/tutorials/gluon/data_augmentation.html)" - } methods: { name: "Pointwise Convolution" full_name: "Pointwise Convolution" description: "**Pointwise Convolution** is a type of convolution that uses a 1x1 kernel: a kernel that iterates through every single point. This kernel has a depth of however many channels the input image has. 
It can be used in conjunction with [depthwise convolutions](https://paperswithcode.com/method/depthwise-convolution) to produce an efficient class of convolutions known as [depthwise-separable convolutions](https://paperswithcode.com/method/depthwise-separable-convolution).\r\n\r\nImage Credit: [Chi-Feng Wang](https://towardsdatascience.com/a-basic-introduction-to-separable-convolutions-b99ec3102728)" } - methods: { - name: "Depthwise Separable Convolution" - full_name: "Depthwise Separable Convolution" - description: "While [standard convolution](https://paperswithcode.com/method/convolution) performs the channelwise and spatial-wise computation in one step, **Depthwise Separable Convolution** splits the computation into two steps: depthwise convolution applies a single convolutional filter per each input channel and pointwise convolution is used to create a linear combination of the output of the depthwise convolution. The comparison of standard convolution and depthwise separable convolution is shown to the right.\r\n\r\nCredit: [Depthwise Convolution Is All You Need for Learning Multiple Visual Domains](https://paperswithcode.com/paper/depthwise-convolution-is-all-you-need-for)" - } - methods: { - name: "Image Scale Augmentation" - full_name: "Image Scale Augmentation" - description: "Image Scale Augmentation is an augmentation technique where we randomly pick the short size of a image within a dimension range. One use case of this augmentation technique is in object detectiont asks." - } - methods: { - name: "EfficientNet" - full_name: "EfficientNet" - description: "**EfficientNet** is a convolutional neural network architecture and scaling method that uniformly scales all dimensions of depth/width/resolution using a *compound coefficient*. Unlike conventional practice that arbitrary scales these factors, the EfficientNet scaling method uniformly scales network width, depth, and resolution with a set of fixed scaling coefficients. For example, if we want to use $2^N$ times more computational resources, then we can simply increase the network depth by $\\alpha ^ N$, width by $\\beta ^ N$, and image size by $\\gamma ^ N$, where $\\alpha, \\beta, \\gamma$ are constant coefficients determined by a small grid search on the original small model. EfficientNet uses a compound coefficient $\\phi$ to uniformly scales network width, depth, and resolution in a principled way.\r\n\r\nThe compound scaling method is justified by the intuition that if the input image is bigger, then the network needs more layers to increase the receptive field and more channels to capture more fine-grained patterns on the bigger image.\r\n\r\nThe base EfficientNet-B0 network is based on the inverted bottleneck residual blocks of [MobileNetV2](https://paperswithcode.com/method/mobilenetv2), in addition to squeeze-and-excitation blocks.\r\n\r\n EfficientNets also transfer well and achieve state-of-the-art accuracy on CIFAR-100 (91.7%), Flowers (98.8%), and 3 other transfer learning datasets, with an order of magnitude fewer parameters." - } methods: { name: "DeepLabv3" full_name: "DeepLabv3" @@ -29564,9 +29544,29 @@ pr_id_to_video: { description: "**Weight Decay**, or **$L_{2}$ Regularization**, is a regularization technique applied to the weights of a neural network. 
We minimize a loss function compromising both the primary loss function and a penalty on the $L\\_{2}$ Norm of the weights:\r\n\r\n$$L\\_{new}\\left(w\\right) = L\\_{original}\\left(w\\right) + \\lambda{w^{T}w}$$\r\n\r\nwhere $\\lambda$ is a value determining the strength of the penalty (encouraging smaller weights). \r\n\r\nWeight decay can be incorporated directly into the weight update rule, rather than just implicitly by defining it through to objective function. Often weight decay refers to the implementation where we specify it directly in the weight update rule (whereas L2 regularization is usually the implementation which is specified in the objective function).\r\n\r\nImage Source: Deep Learning, Goodfellow et al" } methods: { - name: "Cutout" - full_name: "Cutout" - description: "**Cutout** is an image augmentation and regularization technique that randomly masks out square regions of input during training. and can be used to improve the robustness and overall performance of convolutional neural networks. The main motivation for cutout comes from the problem of object occlusion, which is commonly encountered in many computer vision tasks, such as object recognition,\r\ntracking, or human pose estimation. By generating new images which simulate occluded examples, we not only better prepare the model for encounters with occlusions in the real world, but the model also learns to take more of the image context into consideration when making decisions" + name: "Random Resized Crop" + full_name: "Random Resized Crop" + description: "**RandomResizedCrop** is a type of image data augmentation where a crop of random size of the original size and a random aspect ratio of the original aspect ratio is made. This crop is finally resized to given size.\r\n\r\nImage Credit: [Apache MXNet](https://mxnet.apache.org/versions/1.5.0/tutorials/gluon/data_augmentation.html)" + } + methods: { + name: "Deformable Convolution" + full_name: "Deformable Convolution" + description: "**Deformable convolutions** add 2D offsets to the regular grid sampling locations in the standard convolution. It enables free form deformation of the sampling grid. The offsets are learned from the preceding feature maps, via additional convolutional layers. Thus, the deformation is conditioned on the input features in a local, dense, and adaptive manner." + } + methods: { + name: "RoIAlign" + full_name: "RoIAlign" + description: "**Region of Interest Align**, or **RoIAlign**, is an operation for extracting a small feature map from each RoI in detection and segmentation based tasks. It removes the harsh quantization of [RoI Pool](https://paperswithcode.com/method/roi-pooling), properly *aligning* the extracted features with the input. To avoid any quantization of the RoI boundaries or bins (using $x/16$ instead of $[x/16]$), RoIAlign uses bilinear interpolation to compute the exact values of the input features at four regularly sampled locations in each RoI bin, and the result is then aggregated (using max or average)." 
+ } + methods: { + name: "Depthwise Separable Convolution" + full_name: "Depthwise Separable Convolution" + description: "While [standard convolution](https://paperswithcode.com/method/convolution) performs the channelwise and spatial-wise computation in one step, **Depthwise Separable Convolution** splits the computation into two steps: depthwise convolution applies a single convolutional filter per each input channel and pointwise convolution is used to create a linear combination of the output of the depthwise convolution. The comparison of standard convolution and depthwise separable convolution is shown to the right.\r\n\r\nCredit: [Depthwise Convolution Is All You Need for Learning Multiple Visual Domains](https://paperswithcode.com/paper/depthwise-convolution-is-all-you-need-for)" + } + methods: { + name: "Image Scale Augmentation" + full_name: "Image Scale Augmentation" + description: "Image Scale Augmentation is an augmentation technique where we randomly pick the short size of a image within a dimension range. One use case of this augmentation technique is in object detectiont asks." } } papers: { @@ -31444,6 +31444,16 @@ pr_id_to_video: { full_name: "Mask R-CNN" description: "**Mask R-CNN** extends [Faster R-CNN](http://paperswithcode.com/method/faster-r-cnn) to solve instance segmentation tasks. It achieves this by adding a branch for predicting an object mask in parallel with the existing branch for bounding box recognition. In principle, Mask R-CNN is an intuitive extension of Faster R-CNN, but constructing the mask branch properly is critical for good results. \r\n\r\nMost importantly, Faster R-CNN was not designed for pixel-to-pixel alignment between network inputs and outputs. This is evident in how [RoIPool](http://paperswithcode.com/method/roi-pooling), the *de facto* core operation for attending to instances, performs coarse spatial quantization for feature extraction. To fix the misalignment, Mask R-CNN utilises a simple, quantization-free layer, called [RoIAlign](http://paperswithcode.com/method/roi-align), that faithfully preserves exact spatial locations. \r\n\r\nSecondly, Mask R-CNN *decouples* mask and class prediction: it predicts a binary mask for each class independently, without competition among classes, and relies on the network's RoI classification branch to predict the category. In contrast, an [FCN](http://paperswithcode.com/method/fcn) usually perform per-pixel multi-class categorization, which couples segmentation and classification." } + methods: { + name: "Random Grayscale" + full_name: "Random Grayscale" + description: "**Random Grayscale** is an image data augmentation that converts an image to grayscale with probability $p$." + } + methods: { + name: "RoIAlign" + full_name: "RoIAlign" + description: "**Region of Interest Align**, or **RoIAlign**, is an operation for extracting a small feature map from each RoI in detection and segmentation based tasks. It removes the harsh quantization of [RoI Pool](https://paperswithcode.com/method/roi-pooling), properly *aligning* the extracted features with the input. To avoid any quantization of the RoI boundaries or bins (using $x/16$ instead of $[x/16]$), RoIAlign uses bilinear interpolation to compute the exact values of the input features at four regularly sampled locations in each RoI bin, and the result is then aggregated (using max or average)." 
+ } methods: { name: "Residual Block" full_name: "Residual Block" @@ -31464,16 +31474,6 @@ pr_id_to_video: { full_name: "Kaiming Initialization" description: "**Kaiming Initialization**, or **He Initialization**, is an initialization method for neural networks that takes into account the non-linearity of activation functions, such as ReLU activations.\r\n\r\nA proper initialization method should avoid reducing or magnifying the magnitudes of input signals exponentially. Using a derivation they work out that the condition to stop this happening is:\r\n\r\n$$\\frac{1}{2}n\\_{l}\\text{Var}\\left[w\\_{l}\\right] = 1 $$\r\n\r\nThis implies an initialization scheme of:\r\n\r\n$$ w\\_{l} \\sim \\mathcal{N}\\left(0, 2/n\\_{l}\\right)$$\r\n\r\nThat is, a zero-centered Gaussian with standard deviation of $\\sqrt{2/{n}\\_{l}}$ (variance shown in equation above). Biases are initialized at $0$." } - methods: { - name: "Random Grayscale" - full_name: "Random Grayscale" - description: "**Random Grayscale** is an image data augmentation that converts an image to grayscale with probability $p$." - } - methods: { - name: "RoIAlign" - full_name: "RoIAlign" - description: "**Region of Interest Align**, or **RoIAlign**, is an operation for extracting a small feature map from each RoI in detection and segmentation based tasks. It removes the harsh quantization of [RoI Pool](https://paperswithcode.com/method/roi-pooling), properly *aligning* the extracted features with the input. To avoid any quantization of the RoI boundaries or bins (using $x/16$ instead of $[x/16]$), RoIAlign uses bilinear interpolation to compute the exact values of the input features at four regularly sampled locations in each RoI bin, and the result is then aggregated (using max or average)." - } } papers: { paper_id: "co2-consistent-contrast-for-unsupervised-1" @@ -34527,7 +34527,7 @@ pr_id_to_video: { video_id: "zxXRGhSQ1f4" video_title: "PR-271: DeepFM: A Factorization-Machine based Neural Network for CTR Prediction" number_of_likes: 12 - number_of_views: 1103 + number_of_views: 1104 published_date: { seconds: 1598797388 } @@ -35600,7 +35600,7 @@ pr_id_to_video: { video_id: "yFIMPxdQTe0" video_title: "PR-274: On mutual information maximization for representation learning" number_of_likes: 17 - number_of_views: 891 + number_of_views: 892 published_date: { seconds: 1600609781 } @@ -39675,6 +39675,11 @@ pr_id_to_video: { number_of_stars: 1 description: "Object Detection with YOLO and Streamlit" } + methods: { + name: "Pointwise Convolution" + full_name: "Pointwise Convolution" + description: "**Pointwise Convolution** is a type of convolution that uses a 1x1 kernel: a kernel that iterates through every single point. This kernel has a depth of however many channels the input image has. 
It can be used in conjunction with [depthwise convolutions](https://paperswithcode.com/method/depthwise-convolution) to produce an efficient class of convolutions known as [depthwise-separable convolutions](https://paperswithcode.com/method/depthwise-separable-convolution).\r\n\r\nImage Credit: [Chi-Feng Wang](https://towardsdatascience.com/a-basic-introduction-to-separable-convolutions-b99ec3102728)" + } methods: { name: "Spatial Attention Module" full_name: "Spatial Attention Module" @@ -39691,39 +39696,34 @@ pr_id_to_video: { description: "**PAFPN** is a feature pyramid module used in Path Aggregation networks ([PANet](https://paperswithcode.com/method/panet)) that combines FPNs with bottom-up path augmentation, which shortens the information path between lower layers and topmost feature." } methods: { - name: "YOLOv3" - full_name: "YOLOv3" - description: "**YOLOv3** is a real-time, single-stage object detection model that builds on [YOLOv2](https://paperswithcode.com/method/yolov2) with several improvements. Improvements include the use of a new backbone network, Darknet-53 that utilises residual connections, or in the words of the author, \"those newfangled residual network stuff\", as well as some improvements to the bounding box prediction step, and use of three different scales from which to extract features (similar to an FPN)." - } - methods: { - name: "Pointwise Convolution" - full_name: "Pointwise Convolution" - description: "**Pointwise Convolution** is a type of convolution that uses a 1x1 kernel: a kernel that iterates through every single point. This kernel has a depth of however many channels the input image has. It can be used in conjunction with [depthwise convolutions](https://paperswithcode.com/method/depthwise-convolution) to produce an efficient class of convolutions known as [depthwise-separable convolutions](https://paperswithcode.com/method/depthwise-separable-convolution).\r\n\r\nImage Credit: [Chi-Feng Wang](https://towardsdatascience.com/a-basic-introduction-to-separable-convolutions-b99ec3102728)" + name: "DIoU-NMS" + full_name: "DIoU-NMS" + description: "**DIoU-NMS** is a type of non-maximum suppression where we use Distance IoU rather than regular DIoU, in which the overlap area and the distance between two central points of bounding boxes are simultaneously considered when suppressing redundant boxes.\r\n\r\nIn original NMS, the IoU metric is used to suppress the redundant detection boxes, where the overlap area is the unique factor, often yielding false suppression for the cases with occlusion. With DIoU-NMS, we not only consider the overlap area but also central point distance between two boxes." } methods: { - name: "Depthwise Separable Convolution" - full_name: "Depthwise Separable Convolution" - description: "While [standard convolution](https://paperswithcode.com/method/convolution) performs the channelwise and spatial-wise computation in one step, **Depthwise Separable Convolution** splits the computation into two steps: depthwise convolution applies a single convolutional filter per each input channel and pointwise convolution is used to create a linear combination of the output of the depthwise convolution. 
The comparison of standard convolution and depthwise separable convolution is shown to the right.\r\n\r\nCredit: [Depthwise Convolution Is All You Need for Learning Multiple Visual Domains](https://paperswithcode.com/paper/depthwise-convolution-is-all-you-need-for)" + name: "Cosine Annealing" + full_name: "Cosine Annealing" + description: "**Cosine Annealing** is a type of learning rate schedule that has the effect of starting with a large learning rate that is relatively rapidly decreased to a minimum value before being increased rapidly again. The resetting of the learning rate acts like a simulated restart of the learning process and the re-use of good weights as the starting point of the restart is referred to as a \"warm restart\" in contrast to a \"cold restart\" where a new set of small random numbers may be used as a starting point.\r\n\r\n$$\\eta\\_{t} = \\eta\\_{min}^{i} + \\frac{1}{2}\\left(\\eta\\_{max}^{i}-\\eta\\_{min}^{i}\\right)\\left(1+\\cos\\left(\\frac{T\\_{cur}}{T\\_{i}}\\pi\\right)\\right)\r\n$$\r\n\r\nWhere where $\\eta\\_{min}^{i}$ and $ \\eta\\_{max}^{i}$ are ranges for the learning rate, and $T\\_{cur}$ account for how many epochs have been performed since the last restart.\r\n\r\nText Source: [Jason Brownlee](https://machinelearningmastery.com/snapshot-ensemble-deep-learning-neural-network/)\r\n\r\nImage Source: [Gao Huang](https://www.researchgate.net/figure/Training-loss-of-100-layer-DenseNet-on-CIFAR10-using-standard-learning-rate-blue-and-M_fig2_315765130)" } methods: { - name: "k-Means Clustering" - full_name: "k-Means Clustering" - description: "**k-Means Clustering** is a clustering algorithm that divides a training set into $k$ different clusters of examples that are near each other. It works by initializing $k$ different centroids {$\\mu\\left(1\\right),\\ldots,\\mu\\left(k\\right)$} to different values, then alternating between two steps until convergence:\r\n\r\n(i) each training example is assigned to cluster $i$ where $i$ is the index of the nearest centroid $\\mu^{(i)}$\r\n\r\n(ii) each centroid $\\mu^{(i)}$ is updated to the mean of all training examples $x^{(j)}$ assigned to cluster $i$.\r\n\r\nText Source: Deep Learning, Goodfellow et al\r\n\r\nImage Source: [scikit-learn](https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_digits.html)" + name: "Weight Decay" + full_name: "Weight Decay" + description: "**Weight Decay**, or **$L_{2}$ Regularization**, is a regularization technique applied to the weights of a neural network. We minimize a loss function compromising both the primary loss function and a penalty on the $L\\_{2}$ Norm of the weights:\r\n\r\n$$L\\_{new}\\left(w\\right) = L\\_{original}\\left(w\\right) + \\lambda{w^{T}w}$$\r\n\r\nwhere $\\lambda$ is a value determining the strength of the penalty (encouraging smaller weights). \r\n\r\nWeight decay can be incorporated directly into the weight update rule, rather than just implicitly by defining it through to objective function. Often weight decay refers to the implementation where we specify it directly in the weight update rule (whereas L2 regularization is usually the implementation which is specified in the objective function).\r\n\r\nImage Source: Deep Learning, Goodfellow et al" } methods: { - name: "BiFPN" - full_name: "BiFPN" - description: "A **BiFPN**, or **Weighted Bi-directional Feature Pyramid Network**, is a type of feature pyramid network which allows easy and fast multi-scale feature fusion. 
It incorporates the multi-level feature fusion idea from [FPN](https://paperswithcode.com/method/fpn), [PANet](https://paperswithcode.com/method/panet) and [NAS-FPN](https://paperswithcode.com/method/nas-fpn) that enables information to flow in both the top-down and bottom-up directions, while using regular and efficient connections. It also utilizes a fast normalized fusion technique. Traditional approaches usually treat all features input to the FPN equally, even those with different resolutions. However, input features at different resolutions often have unequal contributions to the output features. Thus, the BiFPN adds an additional weight for each input feature allowing the network to learn the importance of each. All regular convolutions are also replaced with less expensive depthwise separable convolutions.\r\n\r\nComparing with PANet, PANet added an extra bottom-up path for information flow at the expense of more computational cost. Whereas BiFPN optimizes these cross-scale connections by removing nodes with a single input edge, adding an extra edge from the original input to output node if they are on the same level, and treating each bidirectional path as one feature network layer (repeating it several times for more high-level future fusion)." + name: "YOLOv3" + full_name: "YOLOv3" + description: "**YOLOv3** is a real-time, single-stage object detection model that builds on [YOLOv2](https://paperswithcode.com/method/yolov2) with several improvements. Improvements include the use of a new backbone network, Darknet-53 that utilises residual connections, or in the words of the author, \"those newfangled residual network stuff\", as well as some improvements to the bounding box prediction step, and use of three different scales from which to extract features (similar to an FPN)." } methods: { - name: "ResNeXt Block" - full_name: "ResNeXt Block" - description: "A **ResNeXt Block** is a type of residual block used as part of the [ResNeXt](https://paperswithcode.com/method/resnext) CNN architecture. It uses a \"split-transform-merge\" strategy (branched paths within a single module) similar to an [Inception module](https://paperswithcode.com/method/inception-module), i.e. it aggregates a set of transformations. Compared to a Residual Block, it exposes a new dimension, *cardinality* (size of set of transformations) $C$, as an essential factor in addition to depth and width. \r\n\r\nFormally, a set of aggregated transformations can be represented as: $\\mathcal{F}(x)=\\sum_{i=1}^{C}\\mathcal{T}_i(x)$, where $\\mathcal{T}_i(x)$ can be an arbitrary function. Analogous to a simple neuron, $\\mathcal{T}_i$ should project $x$ into an (optionally low-dimensional) embedding and then transform it." + name: "RoIAlign" + full_name: "RoIAlign" + description: "**Region of Interest Align**, or **RoIAlign**, is an operation for extracting a small feature map from each RoI in detection and segmentation based tasks. It removes the harsh quantization of [RoI Pool](https://paperswithcode.com/method/roi-pooling), properly *aligning* the extracted features with the input. To avoid any quantization of the RoI boundaries or bins (using $x/16$ instead of $[x/16]$), RoIAlign uses bilinear interpolation to compute the exact values of the input features at four regularly sampled locations in each RoI bin, and the result is then aggregated (using max or average)." 
} methods: { - name: "Darknet-53" - full_name: "Darknet-53" - description: "**Darknet-53** is a convolutional neural network that acts as a backbone for the [YOLOv3](https://paperswithcode.com/method/yolov3) object detection approach. The improvements upon its predecessor [Darknet-19](https://paperswithcode.com/method/darknet-19) include the use of residual connections, as well as more layers." + name: "Depthwise Separable Convolution" + full_name: "Depthwise Separable Convolution" + description: "While [standard convolution](https://paperswithcode.com/method/convolution) performs the channelwise and spatial-wise computation in one step, **Depthwise Separable Convolution** splits the computation into two steps: depthwise convolution applies a single convolutional filter per each input channel and pointwise convolution is used to create a linear combination of the output of the depthwise convolution. The comparison of standard convolution and depthwise separable convolution is shown to the right.\r\n\r\nCredit: [Depthwise Convolution Is All You Need for Learning Multiple Visual Domains](https://paperswithcode.com/paper/depthwise-convolution-is-all-you-need-for)" } } papers: { @@ -43394,7 +43394,7 @@ pr_id_to_video: { video: { video_id: "zkeh7Tt9tYQ" video_title: "PR-302: NeRF: Representing Scenes as Neural Radiance Fields for View Synthesis" - number_of_likes: 28 + number_of_likes: 27 number_of_views: 1168 published_date: { seconds: 1615203796 @@ -44857,6 +44857,11 @@ pr_id_to_video: { authors: "Kurt Shuster" authors: "Y-Lan Boureau" authors: "Jason Weston" + methods: { + name: "BPE" + full_name: "Byte Pair Encoding" + description: "**Byte Pair Encoding**, or **BPE**, is a subword segmentation algorithm that encodes rare and unknown words as sequences of subword units. The intuition is that various word classes are translatable via smaller units than words, for instance names (via character copying or transliteration), compounds (via compositional translation), and cognates and loanwords (via phonological and morphological transformations).\r\n\r\n[Lei Mao](https://leimao.github.io/blog/Byte-Pair-Encoding/) has a detailed blog post that explains how this works." + } methods: { name: "Multi-Head Attention" full_name: "Multi-Head Attention" @@ -44877,11 +44882,6 @@ pr_id_to_video: { full_name: "Adam" description: "**Adam** is an adaptive learning rate optimization algorithm that utilises both momentum and scaling, combining the benefits of [RMSProp](https://paperswithcode.com/method/rmsprop) and [SGD w/th Momentum](https://paperswithcode.com/method/sgd-with-momentum). The optimizer is designed to be appropriate for non-stationary objectives and problems with very noisy and/or sparse gradients. \r\n\r\nThe weight updates are performed as:\r\n\r\n$$ w_{t} = w_{t-1} - \\eta\\frac{\\hat{m}\\_{t}}{\\sqrt{\\hat{v}\\_{t}} + \\epsilon} $$\r\n\r\nwith\r\n\r\n$$ \\hat{m}\\_{t} = \\frac{m_{t}}{1-\\beta^{t}_{1}} $$\r\n\r\n$$ \\hat{v}\\_{t} = \\frac{v_{t}}{1-\\beta^{t}_{2}} $$\r\n\r\n$$ m_{t} = \\beta_{1}m_{t-1} + (1-\\beta_{1})g_{t} $$\r\n\r\n$$ v_{t} = \\beta_{2}v_{t-1} + (1-\\beta_{2})g_{t}^{2} $$\r\n\r\n\r\n$ \\eta $ is the step size/learning rate, around 1e-3 in the original paper. $ \\epsilon $ is a small number, typically 1e-8 or 1e-10, to prevent dividing by zero. $ \\beta_{1} $ and $ \\beta_{2} $ are forgetting parameters, with typical values 0.9 and 0.999, respectively." 
} - methods: { - name: "BPE" - full_name: "Byte Pair Encoding" - description: "**Byte Pair Encoding**, or **BPE**, is a subword segmentation algorithm that encodes rare and unknown words as sequences of subword units. The intuition is that various word classes are translatable via smaller units than words, for instance names (via character copying or transliteration), compounds (via compositional translation), and cognates and loanwords (via phonological and morphological transformations).\r\n\r\n[Lei Mao](https://leimao.github.io/blog/Byte-Pair-Encoding/) has a detailed blog post that explains how this works." - } methods: { name: "Dropout" full_name: "Dropout" @@ -50508,3 +50508,205 @@ pr_id_to_video: { } } } +pr_id_to_video: { + key: 330 + value: { + pr_id: 330 + papers: { + paper_id: "how-to-train-your-vit-data-augmentation-and" + title: "How to train your ViT? Data, Augmentation, and Regularization in Vision Transformers" + arxiv_id: "2106.10270" + abstract: "Vision Transformers (ViT) have been shown to attain highly competitive performance for a wide range of vision applications, such as image classification, object detection and semantic image segmentation. In comparison to convolutional neural networks, the Vision Transformer's weaker inductive bias is generally found to cause an increased reliance on model regularization or data augmentation (``AugReg'' for short) when training on smaller training datasets. We conduct a systematic empirical study in order to better understand the interplay between the amount of training data, AugReg, model size and compute budget. As one result of this study we find that the combination of increased compute and AugReg can yield models with the same performance as models trained on an order of magnitude more training data: we train ViT models of various sizes on the public ImageNet-21k dataset which either match or outperform their counterparts trained on the larger, but not publicly available JFT-300M dataset." + published_date: { + seconds: 1623974400 + } + authors: "Andreas Steiner" + authors: "Alexander Kolesnikov" + authors: "Xiaohua Zhai" + authors: "Ross Wightman" + authors: "Jakob Uszkoreit" + authors: "Lucas Beyer" + repositories: { + url: "https://github.com/rstrudel/segmenter" + owner: "rstrudel" + framework: FRAMEWORK_PYTORCH + number_of_stars: 172 + description: "Official PyTorch implementation of Segmenter: Transformer for Semantic Segmentation" + } + repositories: { + is_official: true + url: "https://github.com/google-research/vision_transformer" + owner: "google-research" + framework: FRAMEWORK_OTHERS + number_of_stars: 3099 + } + repositories: { + is_official: true + url: "https://github.com/rwightman/pytorch-image-models" + owner: "rwightman" + framework: FRAMEWORK_PYTORCH + number_of_stars: 11589 + description: "PyTorch image models, scripts, pretrained weights -- ResNet, ResNeXT, EfficientNet, EfficientNetV2, NFNet, Vision Transformer, MixNet, MobileNet-V3/V2, RegNet, DPN, CSPNet, and more" + } + } + papers: { + paper_id: "how-to-train-your-vit-data-augmentation-and" + title: "How to train your ViT? Data, Augmentation, and Regularization in Vision Transformers" + arxiv_id: "2106.10270" + abstract: "Vision Transformers (ViT) have been shown to attain highly competitive performance for a wide range of vision applications, such as image classification, object detection and semantic image segmentation. 
In comparison to convolutional neural networks, the Vision Transformer's weaker inductive bias is generally found to cause an increased reliance on model regularization or data augmentation (``AugReg'' for short) when training on smaller training datasets. We conduct a systematic empirical study in order to better understand the interplay between the amount of training data, AugReg, model size and compute budget. As one result of this study we find that the combination of increased compute and AugReg can yield models with the same performance as models trained on an order of magnitude more training data: we train ViT models of various sizes on the public ImageNet-21k dataset which either match or outperform their counterparts trained on the larger, but not publicly available JFT-300M dataset." + published_date: { + seconds: 1623974400 + } + authors: "Andreas Steiner" + authors: "Alexander Kolesnikov" + authors: "Xiaohua Zhai" + authors: "Ross Wightman" + authors: "Jakob Uszkoreit" + authors: "Lucas Beyer" + repositories: { + url: "https://github.com/rstrudel/segmenter" + owner: "rstrudel" + framework: FRAMEWORK_PYTORCH + number_of_stars: 172 + description: "Official PyTorch implementation of Segmenter: Transformer for Semantic Segmentation" + } + repositories: { + is_official: true + url: "https://github.com/google-research/vision_transformer" + owner: "google-research" + framework: FRAMEWORK_OTHERS + number_of_stars: 3099 + } + repositories: { + is_official: true + url: "https://github.com/rwightman/pytorch-image-models" + owner: "rwightman" + framework: FRAMEWORK_PYTORCH + number_of_stars: 11589 + description: "PyTorch image models, scripts, pretrained weights -- ResNet, ResNeXT, EfficientNet, EfficientNetV2, NFNet, Vision Transformer, MixNet, MobileNet-V3/V2, RegNet, DPN, CSPNet, and more" + } + } + papers: { + paper_id: "training-data-efficient-image-transformers" + title: "Training data-efficient image transformers & distillation through attention" + arxiv_id: "2012.12877" + abstract: "Recently, neural networks purely based on attention were shown to address image understanding tasks such as image classification. However, these visual transformers are pre-trained with hundreds of millions of images using an expensive infrastructure, thereby limiting their adoption. In this work, we produce a competitive convolution-free transformer by training on Imagenet only. We train them on a single computer in less than 3 days. Our reference vision transformer (86M parameters) achieves top-1 accuracy of 83.1% (single-crop evaluation) on ImageNet with no external data. More importantly, we introduce a teacher-student strategy specific to transformers. It relies on a distillation token ensuring that the student learns from the teacher through attention. We show the interest of this token-based distillation, especially when using a convnet as a teacher. This leads us to report results competitive with convnets for both Imagenet (where we obtain up to 85.2% accuracy) and when transferring to other tasks. We share our code and models." 
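The DeiT abstract above describes distillation through attention via an extra distillation token. As a rough sketch of the kind of objective involved (this simplifies the paper's hard-distillation loss and is not the official facebookresearch/deit code; all names and shapes are assumptions), the class token's output is supervised by the ground-truth label while the distillation token's output is supervised by the teacher's hard prediction:

```python
import numpy as np

def softmax(z):
    z = z - z.max(axis=-1, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=-1, keepdims=True)

def cross_entropy(logits, labels):
    p = softmax(logits)
    return -np.log(p[np.arange(len(labels)), labels] + 1e-12).mean()

def hard_distillation_loss(cls_logits, dist_logits, targets, teacher_logits):
    """Sketch of a DeiT-style hard-distillation objective.

    The class-token logits are trained against the ground-truth labels, while
    the distillation-token logits are trained against the teacher's hard
    predictions; the two terms are averaged.
    """
    teacher_labels = teacher_logits.argmax(axis=-1)
    return 0.5 * cross_entropy(cls_logits, targets) + 0.5 * cross_entropy(dist_logits, teacher_labels)

# Illustrative usage with random logits for a batch of 4 and 10 classes.
rng = np.random.default_rng(0)
loss = hard_distillation_loss(rng.normal(size=(4, 10)), rng.normal(size=(4, 10)),
                              np.array([1, 3, 5, 7]), rng.normal(size=(4, 10)))
print(float(loss))
```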
+ published_date: { + seconds: 1608681600 + } + authors: "Hugo Touvron" + authors: "Matthieu Cord" + authors: "Matthijs Douze" + authors: "Francisco Massa" + authors: "Alexandre Sablayrolles" + authors: "Hervé Jégou" + repositories: { + url: "https://github.com/PaddlePaddle/PaddleClas" + owner: "PaddlePaddle" + framework: FRAMEWORK_OTHERS + number_of_stars: 2085 + description: "A treasure chest for visual recognition powered by PaddlePaddle" + } + repositories: { + url: "https://github.com/bshantam97/Attention_Based_Networks" + owner: "bshantam97" + framework: FRAMEWORK_PYTORCH + } + repositories: { + url: "https://github.com/tianhai123/vit-pytorch" + owner: "tianhai123" + framework: FRAMEWORK_PYTORCH + number_of_stars: 2 + } + repositories: { + url: "https://github.com/lucidrains/vit-pytorch" + owner: "lucidrains" + framework: FRAMEWORK_PYTORCH + number_of_stars: 5023 + description: "Implementation of Vision Transformer, a simple way to achieve SOTA in vision classification with only a single transformer encoder, in Pytorch" + } + repositories: { + url: "https://github.com/TACJu/TransFG" + owner: "TACJu" + framework: FRAMEWORK_PYTORCH + number_of_stars: 117 + description: "This is the official PyTorch implementation of the paper \"TransFG: A Transformer Architecture for Fine-grained Recognition\" (Ju He, Jie-Neng Chen, Shuai Liu, Adam Kortylewski, Cheng Yang, Yutong Bai, Changhu Wang, Alan Yuille)." + } + repositories: { + is_official: true + url: "https://github.com/facebookresearch/deit" + owner: "facebookresearch" + framework: FRAMEWORK_PYTORCH + number_of_stars: 1967 + description: "Official DeiT repository" + } + repositories: { + url: "https://github.com/UdbhavPrasad072300/Transformer-Implementations" + owner: "UdbhavPrasad072300" + framework: FRAMEWORK_PYTORCH + number_of_stars: 17 + description: "Library - Vanilla, ViT, DeiT, BERT, GPT" + } + methods: { + name: "Attention Dropout" + full_name: "Attention Dropout" + description: "**Attention Dropout** is a type of dropout used in attention-based architectures, where elements are randomly dropped out of the softmax in the attention equation. For example, for scaled-dot product attention, we would drop elements from the first term:\r\n\r\n$$ {\\text{Attention}}(Q, K, V) = \\text{softmax}\\left(\\frac{QK^{T}}{\\sqrt{d_k}}\\right)V $$" + } + methods: { + name: "CutMix" + full_name: "CutMix" + description: "**CutMix** is an image data augmentation strategy. Instead of simply removing pixels as in Cutout, we replace the removed regions with a patch from another image. The ground truth labels are also mixed proportionally to the number of pixels of combined images. The added patches further enhance localization ability by requiring the model to identify the object from a partial view." + } + methods: { + name: "RandAugment" + full_name: "RandAugment" + description: "**RandAugment** is an automated data augmentation method. The search space for data augmentation has 2 interpretable hyperparameter $N$ and $M$. $N$ is the number of augmentation transformations to apply sequentially, and $M$ is the magnitude for all the transformations. To reduce the parameter space but still maintain image diversity, learned policies and probabilities for applying each transformation are replaced with a parameter-free procedure of always selecting a transformation with uniform probability $\\frac{1}{K}$. Here $K$ is the number of transformation options. 
So given $N$ transformations for a training image, RandAugment may thus express $K^{N}$ potential policies.\r\n\r\nTransformations applied include identity transformation, autoContrast, equalize, rotation, solarization, color jittering, posterizing, changing contrast, changing brightness, changing sharpness, shear-x, shear-y, translate-x, translate-y." + } + methods: { + name: "Mixup" + full_name: "Mixup" + description: "**Mixup** is a data augmentation technique that generates weighted combinations of random image pairs from the training data. Given two images and their ground truth labels: $\\left(x\\_{i}, y\\_{i}\\right), \\left(x\\_{j}, y\\_{j}\\right)$, a synthetic training example $\\left(\\hat{x}, \\hat{y}\\right)$ is generated as:\r\n\r\n$$ \\hat{x} = \\lambda{x\\_{i}} + \\left(1 − \\lambda\\right){x\\_{j}} $$\r\n$$ \\hat{y} = \\lambda{y\\_{i}} + \\left(1 − \\lambda\\right){y\\_{j}} $$\r\n\r\nwhere $\\lambda \\sim \\text{Beta}\\left(\\alpha = 0.2\\right)$ is independently sampled for each augmented example." + } + methods: { + name: "Dense Connections" + full_name: "Dense Connections" + description: "**Dense Connections**, or **Fully Connected Connections**, are a type of layer in a deep neural network that use a linear operation where every input is connected to every output by a weight. This means there are $n\\_{\\text{inputs}}*n\\_{\\text{outputs}}$ parameters, which can lead to a lot of parameters for a sizeable network.\r\n\r\n$$h\\_{l} = g\\left(\\textbf{W}^{T}h\\_{l-1}\\right)$$\r\n\r\nwhere $g$ is an activation function.\r\n\r\nImage Source: Deep Learning by Goodfellow, Bengio and Courville" + } + methods: { + name: "Label Smoothing" + full_name: "Label Smoothing" + description: "**Label Smoothing** is a regularization technique that introduces noise for the labels. This accounts for the fact that datasets may have mistakes in them, so maximizing the likelihood of $\\log{p}\\left(y\\mid{x}\\right)$ directly can be harmful. Assume that for a small constant $\\epsilon$, the training set label $y$ is correct with probability $1-\\epsilon$ and incorrect otherwise. Label Smoothing regularizes a model based on a softmax with $k$ output values by replacing the hard $0$ and $1$ classification targets with targets of $\\frac{\\epsilon}{k-1}$ and $1-\\epsilon$ respectively.\r\n\r\nSource: Deep Learning, Goodfellow et al\r\n\r\nImage Source: [When Does Label Smoothing Help?](https://arxiv.org/abs/1906.02629)" + } + methods: { + name: "Multi-Head Attention" + full_name: "Multi-Head Attention" + description: "**Multi-head Attention** is a module for attention mechanisms which runs through an attention mechanism several times in parallel. The independent attention outputs are then concatenated and linearly transformed into the expected dimension. Intuitively, multiple attention heads allow for attending to parts of the sequence differently (e.g. longer-term dependencies versus shorter-term dependencies).
\r\n\r\n$$ \\text{MultiHead}\\left(\\textbf{Q}, \\textbf{K}, \\textbf{V}\\right) = \\left[\\text{head}\\_{1},\\dots,\\text{head}\\_{h}\\right]\\textbf{W}_{0}$$\r\n\r\n$$\\text{where} \\text{ head}\\_{i} = \\text{Attention} \\left(\\textbf{Q}\\textbf{W}\\_{i}^{Q}, \\textbf{K}\\textbf{W}\\_{i}^{K}, \\textbf{V}\\textbf{W}\\_{i}^{V} \\right) $$\r\n\r\nAbove $\\textbf{W}$ are all learnable parameter matrices.\r\n\r\nNote that [scaled dot-product attention](https://paperswithcode.com/method/scaled) is most commonly used in this module, although in principle it can be swapped out for other types of attention mechanism.\r\n\r\nSource: [Lilian Weng](https://lilianweng.github.io/lil-log/2018/06/24/attention-attention.html#a-family-of-attention-mechanisms)" + } + methods: { + name: "Feedforward Network" + full_name: "Feedforward Network" + description: "A **Feedforward Network**, or a **Multilayer Perceptron (MLP)**, is a neural network with solely densely connected layers. This is the classic neural network architecture of the literature. It consists of inputs $x$ passed through units $h$ (of which there can be many layers) to predict a target $y$. Activation functions are generally chosen to be non-linear to allow for flexible functional approximation.\r\n\r\nImage Source: Deep Learning, Goodfellow et al" + } + methods: { + name: "FixRes" + full_name: "FixRes" + description: "**FixRes** is an image scaling strategy that seeks to optimize classifier performance. It is motivated by the observation that data augmentations induce a significant discrepancy between the size of the objects seen by the classifier at train and test time: in fact, a lower train resolution improves the classification at test time! FixRes is a simple strategy to optimize the classifier performance, that employs different train and test resolutions. The calibrations are: (a) calibrating the object sizes by adjusting the crop size and (b) adjusting statistics before spatial pooling." + } + methods: { + name: "Scaled Dot-Product Attention" + full_name: "Scaled Dot-Product Attention" + description: "**Scaled dot-product attention** is an attention mechanism where the dot products are scaled down by $\\sqrt{d_k}$. Formally we have a query $Q$, a key $K$ and a value $V$ and calculate the attention as:\r\n\r\n$$ {\\text{Attention}}(Q, K, V) = \\text{softmax}(\\frac{QK^{T}}{\\sqrt{d_k}})V $$\r\n\r\nIf we assume that $q$ and $k$ are $d_k$-dimensional vectors whose components are independent random variables with mean $0$ and variance $1$, then their dot product, $q \\cdot k = \\sum_{i=1}^{d_k} u_iv_i$, has mean $0$ and variance $d_k$. Since we would prefer these values to have variance $1$, we divide by $\\sqrt{d_k}$." + } + } + video: { + video_id: "A3RrAIx-KCc" + video_title: "PR-330: How To Train Your ViT? Data, Augmentation, and Regularization in Vision Transformers" + number_of_likes: 20 + number_of_views: 404 + published_date: { + seconds: 1626015401 + } + uploader: "JinWon Lee" + } + } +}
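The scaled dot-product attention formula listed among the methods above can be written out directly; the multi-head variant simply runs several such heads in parallel on projected inputs and concatenates the results. A minimal NumPy sketch (single head, no masking; shapes are illustrative assumptions):

```python
import numpy as np

def scaled_dot_product_attention(Q, K, V):
    """softmax(Q K^T / sqrt(d_k)) V, as in the description above."""
    d_k = Q.shape[-1]
    scores = Q @ K.T / np.sqrt(d_k)                           # (n_q, n_k) similarity scores
    weights = np.exp(scores - scores.max(axis=-1, keepdims=True))
    weights = weights / weights.sum(axis=-1, keepdims=True)   # row-wise softmax
    return weights @ V

# Illustrative usage: 4 queries attending over 6 key/value pairs of width 8.
rng = np.random.default_rng(0)
out = scaled_dot_product_attention(rng.normal(size=(4, 8)),
                                   rng.normal(size=(6, 8)),
                                   rng.normal(size=(6, 8)))
print(out.shape)  # (4, 8)
```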
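Similarly, the Mixup equations quoted above translate almost one-to-one into code. A minimal NumPy sketch (the one-hot labels and the Beta(alpha, alpha) sampling convention from the Mixup paper are assumptions on top of the shorthand in the description):

```python
import numpy as np

def mixup(x1, y1, x2, y2, alpha=0.2, rng=None):
    """Mix two training examples as in the Mixup description above.

    x1, x2: images of the same shape; y1, y2: one-hot label vectors.
    lambda is drawn from Beta(alpha, alpha), independently per call.
    """
    rng = rng or np.random.default_rng()
    lam = rng.beta(alpha, alpha)
    x_hat = lam * x1 + (1.0 - lam) * x2   # pixel-wise interpolation
    y_hat = lam * y1 + (1.0 - lam) * y2   # label interpolation with the same lambda
    return x_hat, y_hat

# Illustrative usage with two fake 32x32 RGB images and 10-class one-hot labels.
x1, x2 = np.random.rand(32, 32, 3), np.random.rand(32, 32, 3)
y1, y2 = np.eye(10)[3], np.eye(10)[7]
x_mix, y_mix = mixup(x1, y1, x2, y2)
print(x_mix.shape, y_mix.sum())  # (32, 32, 3) 1.0
```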