From db93f4609adc2b2de5a2da3fff94c8ce5f8dfd87 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Mon, 15 Feb 2021 17:10:13 +0000 Subject: [PATCH 01/92] Early skeleton of API. --- torchvision/models/detection/ssd.py | 67 +++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100644 torchvision/models/detection/ssd.py diff --git a/torchvision/models/detection/ssd.py b/torchvision/models/detection/ssd.py new file mode 100644 index 00000000000..12c7d244c2d --- /dev/null +++ b/torchvision/models/detection/ssd.py @@ -0,0 +1,67 @@ +from torch import nn, Tensor + +from typing import Dict, List, Optional, Tuple + +__all__ = ['SSD'] + + +class SSDHead(nn.Module): + # TODO: Similar to RetinaNetHead. Perhaps abstract and reuse for one-shot detectors. + def __init__(self, in_channels, num_anchors, num_classes): + super().__init__() + self.classification_head = SSDClassificationHead(in_channels, num_anchors, num_classes) + self.regression_head = SSDRegressionHead(in_channels, num_anchors) + + def compute_loss(self, targets: List[Dict[str, Tensor]], head_outputs: Dict[str, Tensor], anchors: List[Tensor], + matched_idxs: List[Tensor]) -> Dict[str, Tensor]: + return { + 'classification': self.classification_head.compute_loss(targets, head_outputs, matched_idxs), + 'bbox_regression': self.regression_head.compute_loss(targets, head_outputs, anchors, matched_idxs), + } + + def forward(self, x: List[Tensor]) -> Dict[str, Tensor]: + return { + 'cls_logits': self.classification_head(x), + 'bbox_regression': self.regression_head(x) + } + + +class SSDClassificationHead(nn.Module): + def __init__(self, in_channels, num_anchors, num_classes): + super().__init__() + + def compute_loss(self, targets: List[Dict[str, Tensor]], head_outputs: Dict[str, Tensor], + matched_idxs: List[Tensor]) -> Tensor: + pass + + def forward(self, x: List[Tensor]) -> Tensor: + pass + + +class SSDRegressionHead(nn.Module): + def __init__(self, in_channels, num_anchors): + super().__init__() + + def compute_loss(self, targets: List[Dict[str, Tensor]], head_outputs: Dict[str, Tensor], anchors: List[Tensor], + matched_idxs: List[Tensor]) -> Tensor: + pass + + def forward(self, x: List[Tensor]) -> Tensor: + pass + + +class SSD(nn.Module): + def __init__(self, backbone, num_classes): + super().__init__() + + def compute_loss(self, targets: List[Dict[str, Tensor]], head_outputs: Dict[str, Tensor], + anchors: List[Tensor]) -> Dict[str, Tensor]: + pass + + def postprocess_detections(self, head_outputs: Dict[str, List[Tensor]], anchors: List[List[Tensor]], + image_shapes: List[Tuple[int, int]]) -> List[Dict[str, Tensor]]: + pass + + def forward(self, images: List[Tensor], + targets: Optional[List[Dict[str, Tensor]]] = None) -> Tuple[Dict[str, Tensor], List[Dict[str, Tensor]]]: + pass From b2e42bbe403c5ac9e9b4358072a6260bd70e92f1 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Sat, 6 Mar 2021 17:22:35 +0000 Subject: [PATCH 02/92] Adding MultiFeatureMap and vgg16 backbone. --- torchvision/models/detection/ssd.py | 103 +++++++++++++++++++++++++++- 1 file changed, 101 insertions(+), 2 deletions(-) diff --git a/torchvision/models/detection/ssd.py b/torchvision/models/detection/ssd.py index 12c7d244c2d..959b4ee7625 100644 --- a/torchvision/models/detection/ssd.py +++ b/torchvision/models/detection/ssd.py @@ -1,7 +1,10 @@ from torch import nn, Tensor - from typing import Dict, List, Optional, Tuple +from .backbone_utils import _validate_trainable_layers +from .. 
import vgg + + __all__ = ['SSD'] @@ -51,8 +54,11 @@ def forward(self, x: List[Tensor]) -> Tensor: class SSD(nn.Module): - def __init__(self, backbone, num_classes): + def __init__(self, backbone, num_classes, num_anchors=(4, 6, 6, 6, 4, 4)): super().__init__() + self.backbone = backbone + self.num_classes = num_classes + self.num_anchors = num_anchors def compute_loss(self, targets: List[Dict[str, Tensor]], head_outputs: Dict[str, Tensor], anchors: List[Tensor]) -> Dict[str, Tensor]: @@ -65,3 +71,96 @@ def postprocess_detections(self, head_outputs: Dict[str, List[Tensor]], anchors: def forward(self, images: List[Tensor], targets: Optional[List[Dict[str, Tensor]]] = None) -> Tuple[Dict[str, Tensor], List[Dict[str, Tensor]]]: pass + + +class MultiFeatureMap(nn.Module): + + def __init__(self, feature_maps: nn.ModuleList): + super().__init__() + self.feature_maps = feature_maps + + def forward(self, x): + output = [] + for block in self.feature_maps: + x = block(x) + output.append(x) + return output + + +def vgg16_mfm_backbone(pretrained, trainable_layers=3): + backbone = vgg.vgg16(pretrained=pretrained).features + + # Gather the indices of maxpools. These are the locations of output blocks. + stage_indices = [i for i, b in enumerate(backbone) if isinstance(b, nn.MaxPool2d)] + num_stages = len(stage_indices) + + # find the index of the layer from which we wont freeze + assert 0 <= trainable_layers <= num_stages + freeze_before = num_stages if trainable_layers == 0 else stage_indices[num_stages - trainable_layers] + + for b in backbone[:freeze_before]: + for parameter in b.parameters(): + parameter.requires_grad_(False) + + # Multiple Feature map definition - page 4, Fig 2 of SSD paper + feature_maps = nn.ModuleList([ + # Conv4_3 map + nn.Sequential( + *backbone[:23], # until conv4_3 + ), + # FC7 map + nn.Sequential( + *backbone[23:], # until maxpool5 # TODO: replace maxpool 5 as in the paper? + nn.Conv2d(in_channels=512, out_channels=1024, kernel_size=3, padding=1), # FC6 + nn.ReLU(inplace=True), + nn.Conv2d(in_channels=1024, out_channels=1024, kernel_size=1), # FC7 + nn.ReLU(inplace=True) + ), + # Conv8_2 map + nn.Sequential( + nn.Conv2d(1024, 256, kernel_size=1), + nn.ReLU(inplace=True), + nn.Conv2d(256, 512, kernel_size=3, padding=1, stride=2), + nn.ReLU(inplace=True), + ), + # Conv9_2 map + nn.Sequential( + nn.Conv2d(512, 128, kernel_size=1), + nn.ReLU(inplace=True), + nn.Conv2d(128, 256, kernel_size=3, padding=1, stride=2), + nn.ReLU(inplace=True), + ), + # Conv10_2 map + nn.Sequential( + nn.Conv2d(256, 128, kernel_size=1), + nn.ReLU(inplace=True), + nn.Conv2d(128, 256, kernel_size=3, padding=1), + nn.ReLU(inplace=True), + ), + # Conv11_2 map + nn.Sequential( + nn.Conv2d(256, 128, kernel_size=1), + nn.ReLU(inplace=True), + nn.Conv2d(128, 256, kernel_size=3, padding=1), + nn.ReLU(inplace=True), + ), + ]) + # TODO: keep track of block output sizes in a variable. Perhaps define a new block class that has it as attribute? 
+ + return MultiFeatureMap(feature_maps) + + +def ssd_vgg16(pretrained=False, progress=True, + num_classes=91, pretrained_backbone=True, trainable_backbone_layers=None, **kwargs): + trainable_backbone_layers = _validate_trainable_layers( + pretrained or pretrained_backbone, trainable_backbone_layers, 5, 3) + + if pretrained: + # no need to download the backbone if pretrained is set + pretrained_backbone = False + + backbone = vgg16_mfm_backbone(pretrained_backbone, trainable_layers=trainable_backbone_layers) + model = SSD(backbone, num_classes, **kwargs) + if pretrained: + pass # TODO: load pre-trained COCO weights + return model From 977932474c21f61db847771d4b4909cbad9c926e Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Sun, 7 Mar 2021 15:53:25 +0000 Subject: [PATCH 03/92] Making vgg16 backbone same as paper. --- torchvision/models/detection/ssd.py | 89 +++++++++++++++++++---------- 1 file changed, 58 insertions(+), 31 deletions(-) diff --git a/torchvision/models/detection/ssd.py b/torchvision/models/detection/ssd.py index 959b4ee7625..4b8e36cc21f 100644 --- a/torchvision/models/detection/ssd.py +++ b/torchvision/models/detection/ssd.py @@ -87,7 +87,7 @@ def forward(self, x): return output -def vgg16_mfm_backbone(pretrained, trainable_layers=3): +def _vgg16_mfm_backbone(pretrained, trainable_layers=3): backbone = vgg.vgg16(pretrained=pretrained).features # Gather the indices of maxpools. These are the locations of output blocks. @@ -102,50 +102,77 @@ def vgg16_mfm_backbone(pretrained, trainable_layers=3): for parameter in b.parameters(): parameter.requires_grad_(False) + # Patch ceil_mode for all maxpool layers of backbone to get the same outputs as Fig2 of SSD papers + for layer in backbone: + if isinstance(layer, nn.MaxPool2d): + layer.ceil_mode = True + # Multiple Feature map definition - page 4, Fig 2 of SSD paper + def build_feature_map_block(layers, out_channels): + block = nn.Sequential(*layers) + block.out_channels = out_channels + return block + feature_maps = nn.ModuleList([ # Conv4_3 map - nn.Sequential( - *backbone[:23], # until conv4_3 + build_feature_map_block( + backbone[:23], # until conv4_3 + # TODO: add L2 nomarlization + scaling? + 512 ), # FC7 map - nn.Sequential( - *backbone[23:], # until maxpool5 # TODO: replace maxpool 5 as in the paper? 
- nn.Conv2d(in_channels=512, out_channels=1024, kernel_size=3, padding=1), # FC6 - nn.ReLU(inplace=True), - nn.Conv2d(in_channels=1024, out_channels=1024, kernel_size=1), # FC7 - nn.ReLU(inplace=True) + build_feature_map_block( + ( + *backbone[23:-1], # until conv5_3 + nn.MaxPool2d(kernel_size=3, stride=1, padding=1, ceil_mode=True), # modified maxpool5 + nn.Conv2d(in_channels=512, out_channels=1024, kernel_size=3, padding=6, dilation=6), # FC6 with atrous + nn.ReLU(inplace=True), + nn.Conv2d(in_channels=1024, out_channels=1024, kernel_size=1), # FC7 + nn.ReLU(inplace=True) + ), + 1024 ), # Conv8_2 map - nn.Sequential( - nn.Conv2d(1024, 256, kernel_size=1), - nn.ReLU(inplace=True), - nn.Conv2d(256, 512, kernel_size=3, padding=1, stride=2), - nn.ReLU(inplace=True), + build_feature_map_block( + ( + nn.Conv2d(1024, 256, kernel_size=1), + nn.ReLU(inplace=True), + nn.Conv2d(256, 512, kernel_size=3, padding=1, stride=2), + nn.ReLU(inplace=True), + ), + 512, ), # Conv9_2 map - nn.Sequential( - nn.Conv2d(512, 128, kernel_size=1), - nn.ReLU(inplace=True), - nn.Conv2d(128, 256, kernel_size=3, padding=1, stride=2), - nn.ReLU(inplace=True), + build_feature_map_block( + ( + nn.Conv2d(512, 128, kernel_size=1), + nn.ReLU(inplace=True), + nn.Conv2d(128, 256, kernel_size=3, padding=1, stride=2), + nn.ReLU(inplace=True), + ), + 256, ), # Conv10_2 map - nn.Sequential( - nn.Conv2d(256, 128, kernel_size=1), - nn.ReLU(inplace=True), - nn.Conv2d(128, 256, kernel_size=3, padding=1), - nn.ReLU(inplace=True), + build_feature_map_block( + ( + nn.Conv2d(256, 128, kernel_size=1), + nn.ReLU(inplace=True), + nn.Conv2d(128, 256, kernel_size=3), + nn.ReLU(inplace=True), + ), + 256, ), # Conv11_2 map - nn.Sequential( - nn.Conv2d(256, 128, kernel_size=1), - nn.ReLU(inplace=True), - nn.Conv2d(128, 256, kernel_size=3, padding=1), - nn.ReLU(inplace=True), + build_feature_map_block( + ( + nn.Conv2d(256, 128, kernel_size=1), + nn.ReLU(inplace=True), + nn.Conv2d(128, 256, kernel_size=3), + nn.ReLU(inplace=True), + ), + 256, ), ]) - # TODO: keep track of block output sizes in a variable. Perhaps define a new block class that has it as attribute? return MultiFeatureMap(feature_maps) @@ -159,7 +186,7 @@ def ssd_vgg16(pretrained=False, progress=True, # no need to download the backbone if pretrained is set pretrained_backbone = False - backbone = vgg16_mfm_backbone(pretrained_backbone, trainable_layers=trainable_backbone_layers) + backbone = _vgg16_mfm_backbone(pretrained_backbone, trainable_layers=trainable_backbone_layers) model = SSD(backbone, num_classes, **kwargs) if pretrained: pass # TODO: load pre-trained COCO weights From bffe4bc896fdcf42418a05e262eba20d840139d8 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Mon, 8 Mar 2021 13:30:07 +0000 Subject: [PATCH 04/92] Making code generic to support all vggs. --- torchvision/models/detection/ssd.py | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/torchvision/models/detection/ssd.py b/torchvision/models/detection/ssd.py index 4b8e36cc21f..a82333bf03e 100644 --- a/torchvision/models/detection/ssd.py +++ b/torchvision/models/detection/ssd.py @@ -87,8 +87,8 @@ def forward(self, x): return output -def _vgg16_mfm_backbone(pretrained, trainable_layers=3): - backbone = vgg.vgg16(pretrained=pretrained).features +def _vgg_mfm_backbone(backbone_name, pretrained, trainable_layers=3): + backbone = vgg.__dict__[backbone_name](pretrained=pretrained).features # Gather the indices of maxpools. These are the locations of output blocks. 
stage_indices = [i for i, b in enumerate(backbone) if isinstance(b, nn.MaxPool2d)] @@ -113,18 +113,17 @@ def build_feature_map_block(layers, out_channels): block.out_channels = out_channels return block + penultimate_block_index = stage_indices[-2] feature_maps = nn.ModuleList([ - # Conv4_3 map build_feature_map_block( - backbone[:23], # until conv4_3 + backbone[:penultimate_block_index], # until conv4_3 # TODO: add L2 nomarlization + scaling? 512 ), - # FC7 map build_feature_map_block( ( - *backbone[23:-1], # until conv5_3 - nn.MaxPool2d(kernel_size=3, stride=1, padding=1, ceil_mode=True), # modified maxpool5 + *backbone[penultimate_block_index:-1], # until conv5_3, skip last maxpool + nn.MaxPool2d(kernel_size=3, stride=1, padding=1, ceil_mode=True), # add modified maxpool5 nn.Conv2d(in_channels=512, out_channels=1024, kernel_size=3, padding=6, dilation=6), # FC6 with atrous nn.ReLU(inplace=True), nn.Conv2d(in_channels=1024, out_channels=1024, kernel_size=1), # FC7 @@ -132,42 +131,38 @@ def build_feature_map_block(layers, out_channels): ), 1024 ), - # Conv8_2 map build_feature_map_block( ( nn.Conv2d(1024, 256, kernel_size=1), nn.ReLU(inplace=True), - nn.Conv2d(256, 512, kernel_size=3, padding=1, stride=2), + nn.Conv2d(256, 512, kernel_size=3, padding=1, stride=2), # conv8_2 nn.ReLU(inplace=True), ), 512, ), - # Conv9_2 map build_feature_map_block( ( nn.Conv2d(512, 128, kernel_size=1), nn.ReLU(inplace=True), - nn.Conv2d(128, 256, kernel_size=3, padding=1, stride=2), + nn.Conv2d(128, 256, kernel_size=3, padding=1, stride=2), # conv9_2 nn.ReLU(inplace=True), ), 256, ), - # Conv10_2 map build_feature_map_block( ( nn.Conv2d(256, 128, kernel_size=1), nn.ReLU(inplace=True), - nn.Conv2d(128, 256, kernel_size=3), + nn.Conv2d(128, 256, kernel_size=3), # conv10_2 nn.ReLU(inplace=True), ), 256, ), - # Conv11_2 map build_feature_map_block( ( nn.Conv2d(256, 128, kernel_size=1), nn.ReLU(inplace=True), - nn.Conv2d(128, 256, kernel_size=3), + nn.Conv2d(128, 256, kernel_size=3), # conv11_2 nn.ReLU(inplace=True), ), 256, @@ -186,7 +181,7 @@ def ssd_vgg16(pretrained=False, progress=True, # no need to download the backbone if pretrained is set pretrained_backbone = False - backbone = _vgg16_mfm_backbone(pretrained_backbone, trainable_layers=trainable_backbone_layers) + backbone = _vgg_mfm_backbone("vgg16", pretrained_backbone, trainable_layers=trainable_backbone_layers) model = SSD(backbone, num_classes, **kwargs) if pretrained: pass # TODO: load pre-trained COCO weights From eced9f022f2021e75eae6ca24b6e311ebf4adc54 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Mon, 8 Mar 2021 15:56:08 +0000 Subject: [PATCH 05/92] Moving vgg's extra layers a separate class + L2 scaling. 
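For reference, the L2 scaling applied to the conv4_3 feature map follows the normalize-then-rescale pattern sketched below; the tensor shapes here are illustrative placeholders, not taken from the patch.

    import torch
    import torch.nn.functional as F

    # Toy conv4_3-style feature map: batch 1, 512 channels, 38x38 grid.
    x = torch.randn(1, 512, 38, 38)

    # L2-normalize each spatial location across channels, then rescale with a
    # learnable per-channel weight initialized to 20, as in the SSD paper.
    scale_weight = torch.nn.Parameter(torch.ones(512) * 20)
    rescaled = scale_weight.view(1, -1, 1, 1) * F.normalize(x)

The rescaling keeps the conv4_3 activations on a magnitude comparable to the deeper feature maps before they reach the detection heads.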
--- torchvision/models/detection/ssd.py | 138 +++++++++++++--------------- 1 file changed, 65 insertions(+), 73 deletions(-) diff --git a/torchvision/models/detection/ssd.py b/torchvision/models/detection/ssd.py index a82333bf03e..3d3dd3d7fba 100644 --- a/torchvision/models/detection/ssd.py +++ b/torchvision/models/detection/ssd.py @@ -1,3 +1,6 @@ +import torch +import torch.nn.functional as F + from torch import nn, Tensor from typing import Dict, List, Optional, Tuple @@ -73,17 +76,73 @@ def forward(self, images: List[Tensor], pass -class MultiFeatureMap(nn.Module): +class SSDFeatureExtractorVGG(nn.Module): + + OUT_CHANNELS = (512, 1024, 512, 256, 256, 256) - def __init__(self, feature_maps: nn.ModuleList): + def __init__(self, backbone: nn.Module): super().__init__() - self.feature_maps = feature_maps + + # Patch ceil_mode for all maxpool layers of backbone to get the same WxH output sizes as the paper + penultimate_block_pos = ultimate_block_pos = None + for i, layer in enumerate(backbone): + if isinstance(layer, nn.MaxPool2d): + layer.ceil_mode = True + penultimate_block_pos = ultimate_block_pos + ultimate_block_pos = i + + # parameters used for L2 regularization + rescaling + self.scale_weight = nn.Parameter(torch.ones(self.OUT_CHANNELS[0]) * 20) + + # Multiple Feature maps - page 4, Fig 2 of SSD paper + self.block1 = nn.Sequential( + *backbone[:penultimate_block_pos] # until conv4_3 + ) + self.block2 = nn.Sequential( + *backbone[penultimate_block_pos:-1], # until conv5_3, skip maxpool5 + nn.MaxPool2d(kernel_size=3, stride=1, padding=1, ceil_mode=True), # add modified maxpool5 + nn.Conv2d(in_channels=self.OUT_CHANNELS[0], + out_channels=1024, kernel_size=3, padding=6, dilation=6), # FC6 with atrous + nn.ReLU(inplace=True), + nn.Conv2d(in_channels=1024, out_channels=self.OUT_CHANNELS[1], kernel_size=1), # FC7 + nn.ReLU(inplace=True) + ) + self.block3 = nn.Sequential( + nn.Conv2d(self.OUT_CHANNELS[1], 256, kernel_size=1), + nn.ReLU(inplace=True), + nn.Conv2d(256, self.OUT_CHANNELS[2], kernel_size=3, padding=1, stride=2), # conv8_2 + nn.ReLU(inplace=True), + ) + self.block4 = nn.Sequential( + nn.Conv2d(self.OUT_CHANNELS[2], 128, kernel_size=1), + nn.ReLU(inplace=True), + nn.Conv2d(128, self.OUT_CHANNELS[3], kernel_size=3, padding=1, stride=2), # conv9_2 + nn.ReLU(inplace=True), + ) + self.block5 = nn.Sequential( + nn.Conv2d(self.OUT_CHANNELS[3], 128, kernel_size=1), + nn.ReLU(inplace=True), + nn.Conv2d(128, self.OUT_CHANNELS[4], kernel_size=3), # conv10_2 + nn.ReLU(inplace=True), + ) + self.block6 = nn.Sequential( + nn.Conv2d(self.OUT_CHANNELS[4], 128, kernel_size=1), + nn.ReLU(inplace=True), + nn.Conv2d(128, self.OUT_CHANNELS[5], kernel_size=3), # conv11_2 + nn.ReLU(inplace=True), + ) def forward(self, x): - output = [] - for block in self.feature_maps: + # L2 regularization + Rescaling of 1st block's feature map + x = self.block1(x) + rescaled = self.scale_weight.view(1, -1, 1, 1) * F.normalize(x) + output = [rescaled] + + # Calculating Feature maps for the rest blocks + for block in (self.block2, self.block3, self.block4, self.block5, self.block6): x = block(x) output.append(x) + return output @@ -102,74 +161,7 @@ def _vgg_mfm_backbone(backbone_name, pretrained, trainable_layers=3): for parameter in b.parameters(): parameter.requires_grad_(False) - # Patch ceil_mode for all maxpool layers of backbone to get the same outputs as Fig2 of SSD papers - for layer in backbone: - if isinstance(layer, nn.MaxPool2d): - layer.ceil_mode = True - - # Multiple Feature map definition - page 4, Fig 
2 of SSD paper - def build_feature_map_block(layers, out_channels): - block = nn.Sequential(*layers) - block.out_channels = out_channels - return block - - penultimate_block_index = stage_indices[-2] - feature_maps = nn.ModuleList([ - build_feature_map_block( - backbone[:penultimate_block_index], # until conv4_3 - # TODO: add L2 nomarlization + scaling? - 512 - ), - build_feature_map_block( - ( - *backbone[penultimate_block_index:-1], # until conv5_3, skip last maxpool - nn.MaxPool2d(kernel_size=3, stride=1, padding=1, ceil_mode=True), # add modified maxpool5 - nn.Conv2d(in_channels=512, out_channels=1024, kernel_size=3, padding=6, dilation=6), # FC6 with atrous - nn.ReLU(inplace=True), - nn.Conv2d(in_channels=1024, out_channels=1024, kernel_size=1), # FC7 - nn.ReLU(inplace=True) - ), - 1024 - ), - build_feature_map_block( - ( - nn.Conv2d(1024, 256, kernel_size=1), - nn.ReLU(inplace=True), - nn.Conv2d(256, 512, kernel_size=3, padding=1, stride=2), # conv8_2 - nn.ReLU(inplace=True), - ), - 512, - ), - build_feature_map_block( - ( - nn.Conv2d(512, 128, kernel_size=1), - nn.ReLU(inplace=True), - nn.Conv2d(128, 256, kernel_size=3, padding=1, stride=2), # conv9_2 - nn.ReLU(inplace=True), - ), - 256, - ), - build_feature_map_block( - ( - nn.Conv2d(256, 128, kernel_size=1), - nn.ReLU(inplace=True), - nn.Conv2d(128, 256, kernel_size=3), # conv10_2 - nn.ReLU(inplace=True), - ), - 256, - ), - build_feature_map_block( - ( - nn.Conv2d(256, 128, kernel_size=1), - nn.ReLU(inplace=True), - nn.Conv2d(128, 256, kernel_size=3), # conv11_2 - nn.ReLU(inplace=True), - ), - 256, - ), - ]) - - return MultiFeatureMap(feature_maps) + return SSDFeatureExtractorVGG(backbone) def ssd_vgg16(pretrained=False, progress=True, From 869ede47395a6e081b77943cef50371527e7b758 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Mon, 8 Mar 2021 17:54:11 +0000 Subject: [PATCH 06/92] Adding header vgg layers. 
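The prediction layers added in this patch are plain 3x3 convolutions, one per feature map, whose output channels pack per-anchor class scores and box offsets. A minimal shape check with made-up channel and anchor counts:

    import torch
    from torch import nn

    num_classes, anchors = 91, 4             # illustrative values only
    features = torch.randn(2, 512, 38, 38)   # one backbone feature map

    cls_layer = nn.Conv2d(512, num_classes * anchors, kernel_size=3, padding=1)
    reg_layer = nn.Conv2d(512, 4 * anchors, kernel_size=3, padding=1)

    print(cls_layer(features).shape)  # torch.Size([2, 364, 38, 38])
    print(reg_layer(features).shape)  # torch.Size([2, 16, 38, 38])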
--- torchvision/models/detection/ssd.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/torchvision/models/detection/ssd.py b/torchvision/models/detection/ssd.py index 3d3dd3d7fba..2da28cee0ad 100644 --- a/torchvision/models/detection/ssd.py +++ b/torchvision/models/detection/ssd.py @@ -35,6 +35,9 @@ def forward(self, x: List[Tensor]) -> Dict[str, Tensor]: class SSDClassificationHead(nn.Module): def __init__(self, in_channels, num_anchors, num_classes): super().__init__() + self.cls_logits = nn.ModuleList() + for channels, anchors in zip(in_channels, num_anchors): + self.cls_logits.append(nn.Conv2d(channels, num_classes * anchors, kernel_size=3, padding=1)) def compute_loss(self, targets: List[Dict[str, Tensor]], head_outputs: Dict[str, Tensor], matched_idxs: List[Tensor]) -> Tensor: @@ -47,6 +50,9 @@ def forward(self, x: List[Tensor]) -> Tensor: class SSDRegressionHead(nn.Module): def __init__(self, in_channels, num_anchors): super().__init__() + self.bbox_reg = nn.ModuleList() + for channels, anchors in zip(in_channels, num_anchors): + self.bbox_reg.append(nn.Conv2d(channels, 4 * anchors, kernel_size=3, padding=1)) def compute_loss(self, targets: List[Dict[str, Tensor]], head_outputs: Dict[str, Tensor], anchors: List[Tensor], matched_idxs: List[Tensor]) -> Tensor: @@ -59,10 +65,23 @@ def forward(self, x: List[Tensor]) -> Tensor: class SSD(nn.Module): def __init__(self, backbone, num_classes, num_anchors=(4, 6, 6, 6, 4, 4)): super().__init__() + + assert len(backbone.OUT_CHANNELS) == len(num_anchors) + self.backbone = backbone self.num_classes = num_classes self.num_anchors = num_anchors + self.head = SSDHead(backbone.OUT_CHANNELS, num_anchors, num_classes) + + @torch.jit.unused + def eager_outputs(self, losses, detections): + # type: (Dict[str, Tensor], List[Dict[str, Tensor]]) -> Tuple[Dict[str, Tensor], List[Dict[str, Tensor]]] + if self.training: + return losses + + return detections + def compute_loss(self, targets: List[Dict[str, Tensor]], head_outputs: Dict[str, Tensor], anchors: List[Tensor]) -> Dict[str, Tensor]: pass @@ -146,7 +165,7 @@ def forward(self, x): return output -def _vgg_mfm_backbone(backbone_name, pretrained, trainable_layers=3): +def _vgg_backbone(backbone_name, pretrained, trainable_layers=3): backbone = vgg.__dict__[backbone_name](pretrained=pretrained).features # Gather the indices of maxpools. These are the locations of output blocks. @@ -173,7 +192,7 @@ def ssd_vgg16(pretrained=False, progress=True, # no need to download the backbone if pretrained is set pretrained_backbone = False - backbone = _vgg_mfm_backbone("vgg16", pretrained_backbone, trainable_layers=trainable_backbone_layers) + backbone = _vgg_backbone("vgg16", pretrained_backbone, trainable_layers=trainable_backbone_layers) model = SSD(backbone, num_classes, **kwargs) if pretrained: pass # TODO: load pre-trained COCO weights From c5ba9c1ea5492068613e5aa2dcaf2b88cadd9450 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Wed, 10 Mar 2021 20:48:46 +0000 Subject: [PATCH 07/92] Fix maxpool patching. 
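Only maxpool3 needs the ceil_mode patch: for a 300x300 input it is the pooling layer that sees an odd spatial size (75x75), and ceiling rounding is what produces the 38x38 conv4_3 map from the paper. A standalone check, assuming the 300x300 setup:

    import torch
    from torch import nn

    x = torch.randn(1, 256, 75, 75)  # activation entering maxpool3 for a 300x300 image

    floor_pool = nn.MaxPool2d(kernel_size=2, stride=2)                # default ceil_mode=False
    ceil_pool = nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)

    print(floor_pool(x).shape[-2:])  # torch.Size([37, 37])
    print(ceil_pool(x).shape[-2:])   # torch.Size([38, 38])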
--- torchvision/models/detection/ssd.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/torchvision/models/detection/ssd.py b/torchvision/models/detection/ssd.py index 2da28cee0ad..79ad4892d25 100644 --- a/torchvision/models/detection/ssd.py +++ b/torchvision/models/detection/ssd.py @@ -102,23 +102,20 @@ class SSDFeatureExtractorVGG(nn.Module): def __init__(self, backbone: nn.Module): super().__init__() - # Patch ceil_mode for all maxpool layers of backbone to get the same WxH output sizes as the paper - penultimate_block_pos = ultimate_block_pos = None - for i, layer in enumerate(backbone): - if isinstance(layer, nn.MaxPool2d): - layer.ceil_mode = True - penultimate_block_pos = ultimate_block_pos - ultimate_block_pos = i + _, _, maxpool3_pos, maxpool4_pos, _ = (i for i, layer in enumerate(backbone) if isinstance(layer, nn.MaxPool2d)) + + # Patch ceil_mode for maxpool3 to get the same WxH output sizes as the paper + backbone[maxpool3_pos].ceil_mode = True # parameters used for L2 regularization + rescaling self.scale_weight = nn.Parameter(torch.ones(self.OUT_CHANNELS[0]) * 20) # Multiple Feature maps - page 4, Fig 2 of SSD paper self.block1 = nn.Sequential( - *backbone[:penultimate_block_pos] # until conv4_3 + *backbone[:maxpool4_pos] # until conv4_3 ) self.block2 = nn.Sequential( - *backbone[penultimate_block_pos:-1], # until conv5_3, skip maxpool5 + *backbone[maxpool4_pos:-1], # until conv5_3, skip maxpool5 nn.MaxPool2d(kernel_size=3, stride=1, padding=1, ceil_mode=True), # add modified maxpool5 nn.Conv2d(in_channels=self.OUT_CHANNELS[0], out_channels=1024, kernel_size=3, padding=6, dilation=6), # FC6 with atrous From c91bfae57b56d512a027d59c44315d0addfbcf99 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Thu, 11 Mar 2021 01:08:22 +0000 Subject: [PATCH 08/92] Refactoring code to allow for support of different backbones & sizes: - Skeleton for Default Boxes generator class - Dynamic estimation of configuration when possible - Addition of types --- torchvision/models/detection/anchor_utils.py | 29 +++++++++ torchvision/models/detection/ssd.py | 67 ++++++++++++-------- 2 files changed, 68 insertions(+), 28 deletions(-) diff --git a/torchvision/models/detection/anchor_utils.py b/torchvision/models/detection/anchor_utils.py index a40a9aad699..816f9ddecab 100644 --- a/torchvision/models/detection/anchor_utils.py +++ b/torchvision/models/detection/anchor_utils.py @@ -156,3 +156,32 @@ def forward(self, image_list: ImageList, feature_maps: List[Tensor]) -> List[Ten # Clear the cache in case that memory leaks. 
self._cache.clear() return anchors + + +class DBoxGenerator(nn.Module): + + def __init__(self, size: int, feature_map_sizes: List[int], aspect_ratios: List[List[int]], + min_ratio: float = 0.15, max_ratio: float = 0.9): + super().__init__() + self.size = size + self.feature_map_sizes = feature_map_sizes + self.aspect_ratios = aspect_ratios + + # Inspired from https://github.com/weiliu89/caffe/blob/ssd/examples/ssd/ssd_pascal.py#L311-L317 + min_centile = int(100 * min_ratio) + max_centile = int(100 * max_ratio) + conv4_centile = min_centile // 2 # assume half of min_ratio as in paper + step = (max_centile - min_centile) // (len(feature_map_sizes) - 2) + box_centiles = [conv4_centile, min_centile] + for centile in range(min_centile, max_centile + 1, step): + box_centiles.append(centile + step) + self.box_sizes = [size * c // 100 for c in box_centiles] + + def __repr__(self) -> str: + s = self.__class__.__name__ + '(' + s += 'size={size}' + s += ', feature_map_sizes={feature_map_sizes}' + s += ', aspect_ratios={aspect_ratios}' + s += ', box_sizes={box_sizes}' + s += ')' + return s.format(**self.__dict__) diff --git a/torchvision/models/detection/ssd.py b/torchvision/models/detection/ssd.py index 79ad4892d25..94f8da0079b 100644 --- a/torchvision/models/detection/ssd.py +++ b/torchvision/models/detection/ssd.py @@ -2,8 +2,9 @@ import torch.nn.functional as F from torch import nn, Tensor -from typing import Dict, List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple +from .anchor_utils import DBoxGenerator from .backbone_utils import _validate_trainable_layers from .. import vgg @@ -13,7 +14,7 @@ class SSDHead(nn.Module): # TODO: Similar to RetinaNetHead. Perhaps abstract and reuse for one-shot detectors. - def __init__(self, in_channels, num_anchors, num_classes): + def __init__(self, in_channels: List[int], num_anchors: List[int], num_classes: int): super().__init__() self.classification_head = SSDClassificationHead(in_channels, num_anchors, num_classes) self.regression_head = SSDRegressionHead(in_channels, num_anchors) @@ -33,7 +34,7 @@ def forward(self, x: List[Tensor]) -> Dict[str, Tensor]: class SSDClassificationHead(nn.Module): - def __init__(self, in_channels, num_anchors, num_classes): + def __init__(self, in_channels: List[int], num_anchors: List[int], num_classes: int): super().__init__() self.cls_logits = nn.ModuleList() for channels, anchors in zip(in_channels, num_anchors): @@ -48,7 +49,7 @@ def forward(self, x: List[Tensor]) -> Tensor: class SSDRegressionHead(nn.Module): - def __init__(self, in_channels, num_anchors): + def __init__(self, in_channels: List[int], num_anchors: List[int]): super().__init__() self.bbox_reg = nn.ModuleList() for channels, anchors in zip(in_channels, num_anchors): @@ -63,16 +64,31 @@ def forward(self, x: List[Tensor]) -> Tensor: class SSD(nn.Module): - def __init__(self, backbone, num_classes, num_anchors=(4, 6, 6, 6, 4, 4)): + def __init__(self, backbone: nn.Module, num_classes: int, size: int = 300, + aspect_ratios: Optional[List[List[int]]] = None): super().__init__() - assert len(backbone.OUT_CHANNELS) == len(num_anchors) + if aspect_ratios is None: + aspect_ratios = [[2], [2, 3], [2, 3], [2, 3], [2], [2]] + + # Use dummy data to retrieve the feature map sizes to avoid hard-coding their values + device = next(backbone.parameters()).device + tmp_img = torch.randn((1, 3, size, size), device=device) + tmp_sizes = [x.size() for x in backbone(tmp_img)] + out_channels = [x[1] for x in tmp_sizes] + feature_map_sizes = [x[2] for 
x in tmp_sizes] + + assert len(feature_map_sizes) == len(aspect_ratios) self.backbone = backbone self.num_classes = num_classes - self.num_anchors = num_anchors + self.aspect_ratios = aspect_ratios + + # Estimate num of anchors based on aspect ratios: 2 default boxes + 2 * ratios of feaure map. + num_anchors = [2 + 2 * len(r) for r in aspect_ratios] + self.head = SSDHead(out_channels, num_anchors, num_classes) - self.head = SSDHead(backbone.OUT_CHANNELS, num_anchors, num_classes) + self.dbox_generator = DBoxGenerator(size, feature_map_sizes, aspect_ratios) @torch.jit.unused def eager_outputs(self, losses, detections): @@ -96,19 +112,15 @@ def forward(self, images: List[Tensor], class SSDFeatureExtractorVGG(nn.Module): - - OUT_CHANNELS = (512, 1024, 512, 256, 256, 256) - def __init__(self, backbone: nn.Module): super().__init__() - _, _, maxpool3_pos, maxpool4_pos, _ = (i for i, layer in enumerate(backbone) if isinstance(layer, nn.MaxPool2d)) # Patch ceil_mode for maxpool3 to get the same WxH output sizes as the paper backbone[maxpool3_pos].ceil_mode = True # parameters used for L2 regularization + rescaling - self.scale_weight = nn.Parameter(torch.ones(self.OUT_CHANNELS[0]) * 20) + self.scale_weight = nn.Parameter(torch.ones(512) * 20) # Multiple Feature maps - page 4, Fig 2 of SSD paper self.block1 = nn.Sequential( @@ -117,38 +129,37 @@ def __init__(self, backbone: nn.Module): self.block2 = nn.Sequential( *backbone[maxpool4_pos:-1], # until conv5_3, skip maxpool5 nn.MaxPool2d(kernel_size=3, stride=1, padding=1, ceil_mode=True), # add modified maxpool5 - nn.Conv2d(in_channels=self.OUT_CHANNELS[0], - out_channels=1024, kernel_size=3, padding=6, dilation=6), # FC6 with atrous + nn.Conv2d(in_channels=512, out_channels=1024, kernel_size=3, padding=6, dilation=6), # FC6 with atrous nn.ReLU(inplace=True), - nn.Conv2d(in_channels=1024, out_channels=self.OUT_CHANNELS[1], kernel_size=1), # FC7 + nn.Conv2d(in_channels=1024, out_channels=1024, kernel_size=1), # FC7 nn.ReLU(inplace=True) ) self.block3 = nn.Sequential( - nn.Conv2d(self.OUT_CHANNELS[1], 256, kernel_size=1), + nn.Conv2d(1024, 256, kernel_size=1), nn.ReLU(inplace=True), - nn.Conv2d(256, self.OUT_CHANNELS[2], kernel_size=3, padding=1, stride=2), # conv8_2 + nn.Conv2d(256, 512, kernel_size=3, padding=1, stride=2), # conv8_2 nn.ReLU(inplace=True), ) self.block4 = nn.Sequential( - nn.Conv2d(self.OUT_CHANNELS[2], 128, kernel_size=1), + nn.Conv2d(512, 128, kernel_size=1), nn.ReLU(inplace=True), - nn.Conv2d(128, self.OUT_CHANNELS[3], kernel_size=3, padding=1, stride=2), # conv9_2 + nn.Conv2d(128, 256, kernel_size=3, padding=1, stride=2), # conv9_2 nn.ReLU(inplace=True), ) self.block5 = nn.Sequential( - nn.Conv2d(self.OUT_CHANNELS[3], 128, kernel_size=1), + nn.Conv2d(256, 128, kernel_size=1), nn.ReLU(inplace=True), - nn.Conv2d(128, self.OUT_CHANNELS[4], kernel_size=3), # conv10_2 + nn.Conv2d(128, 256, kernel_size=3), # conv10_2 nn.ReLU(inplace=True), ) self.block6 = nn.Sequential( - nn.Conv2d(self.OUT_CHANNELS[4], 128, kernel_size=1), + nn.Conv2d(256, 128, kernel_size=1), nn.ReLU(inplace=True), - nn.Conv2d(128, self.OUT_CHANNELS[5], kernel_size=3), # conv11_2 + nn.Conv2d(128, 256, kernel_size=3), # conv11_2 nn.ReLU(inplace=True), ) - def forward(self, x): + def forward(self, x: Tensor) -> List[Tensor]: # L2 regularization + Rescaling of 1st block's feature map x = self.block1(x) rescaled = self.scale_weight.view(1, -1, 1, 1) * F.normalize(x) @@ -162,7 +173,7 @@ def forward(self, x): return output -def _vgg_backbone(backbone_name, pretrained, 
trainable_layers=3): +def _vgg_backbone(backbone_name: str, pretrained: bool, trainable_layers: int = 3): backbone = vgg.__dict__[backbone_name](pretrained=pretrained).features # Gather the indices of maxpools. These are the locations of output blocks. @@ -180,8 +191,8 @@ def _vgg_backbone(backbone_name, pretrained, trainable_layers=3): return SSDFeatureExtractorVGG(backbone) -def ssd_vgg16(pretrained=False, progress=True, - num_classes=91, pretrained_backbone=True, trainable_backbone_layers=None, **kwargs): +def ssd_vgg16(pretrained: bool = False, progress: bool = True, num_classes: int = 91, pretrained_backbone: bool = True, + trainable_backbone_layers: Optional[int] = None, **kwargs: Any): trainable_backbone_layers = _validate_trainable_layers( pretrained or pretrained_backbone, trainable_backbone_layers, 5, 3) From 3820e097664f805b237741b3c4dda91fceed28cb Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Thu, 11 Mar 2021 14:36:54 +0000 Subject: [PATCH 09/92] Complete the implementation of DefaultBox generator. --- torchvision/models/detection/anchor_utils.py | 42 +++++++++++++++++--- 1 file changed, 37 insertions(+), 5 deletions(-) diff --git a/torchvision/models/detection/anchor_utils.py b/torchvision/models/detection/anchor_utils.py index 816f9ddecab..1582b1d113c 100644 --- a/torchvision/models/detection/anchor_utils.py +++ b/torchvision/models/detection/anchor_utils.py @@ -1,4 +1,6 @@ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import itertools +import math import torch from torch import nn, Tensor @@ -167,21 +169,51 @@ def __init__(self, size: int, feature_map_sizes: List[int], aspect_ratios: List[ self.feature_map_sizes = feature_map_sizes self.aspect_ratios = aspect_ratios + # Estimation of default boxes scales # Inspired from https://github.com/weiliu89/caffe/blob/ssd/examples/ssd/ssd_pascal.py#L311-L317 min_centile = int(100 * min_ratio) max_centile = int(100 * max_ratio) conv4_centile = min_centile // 2 # assume half of min_ratio as in paper step = (max_centile - min_centile) // (len(feature_map_sizes) - 2) - box_centiles = [conv4_centile, min_centile] - for centile in range(min_centile, max_centile + 1, step): - box_centiles.append(centile + step) - self.box_sizes = [size * c // 100 for c in box_centiles] + centiles = [conv4_centile, min_centile] + for c in range(min_centile, max_centile + 1, step): + centiles.append(c + step) + self.scales = [c / 100 for c in centiles] + + # Default Boxes pre-calculation based on page 6 of SSD paper + self._dboxes = [] + for k, f_k in enumerate(self.feature_map_sizes): + # Adding the 2 default width-height pairs for aspect ratio 1 and scale s'k + s_prime_k = math.sqrt(self.scales[k] * self.scales[k + 1]) + wh_pairs = [(self.scales[k], self.scales[k]), (s_prime_k, s_prime_k)] + + # Adding 2 pairs for each aspect ratio of the feature map k + for ar in self.aspect_ratios[k]: + sq_ar = math.sqrt(ar) + w = self.scales[k] * sq_ar + h = self.scales[k] / sq_ar + wh_pairs.extend([(w, h), (h, w)]) + + # Now add the default boxes for each width-height pair + for i, j in itertools.product(range(f_k), repeat=2): + cx = (i + 0.5) / f_k + cy = (j + 0.5) / f_k + self._dboxes.extend((cx, cy, w, h) for w, h in wh_pairs) def __repr__(self) -> str: s = self.__class__.__name__ + '(' s += 'size={size}' s += ', feature_map_sizes={feature_map_sizes}' s += ', aspect_ratios={aspect_ratios}' - s += ', box_sizes={box_sizes}' + s += ', scales={scales}' s += ')' return s.format(**self.__dict__) + + def forward(self, image_list: 
ImageList, feature_maps: List[Tensor]) -> List[Tensor]: + dtype, device = feature_maps[0].dtype, feature_maps[0].device + dboxes = [] + for i in range(len(image_list.image_sizes)): + dboxes_in_image = torch.tensor(self._dboxes, dtype=dtype, device=device) + dboxes_in_image.clamp_(min=0, max=1) + dboxes.append(dboxes_in_image) + return dboxes From 044d17843458a0046e1a502baa76af454188e588 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Thu, 11 Mar 2021 14:40:02 +0000 Subject: [PATCH 10/92] Replace randn with empty. --- torchvision/models/detection/ssd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchvision/models/detection/ssd.py b/torchvision/models/detection/ssd.py index 94f8da0079b..184234f8a30 100644 --- a/torchvision/models/detection/ssd.py +++ b/torchvision/models/detection/ssd.py @@ -73,7 +73,7 @@ def __init__(self, backbone: nn.Module, num_classes: int, size: int = 300, # Use dummy data to retrieve the feature map sizes to avoid hard-coding their values device = next(backbone.parameters()).device - tmp_img = torch.randn((1, 3, size, size), device=device) + tmp_img = torch.empty((1, 3, size, size), device=device) tmp_sizes = [x.size() for x in backbone(tmp_img)] out_channels = [x[1] for x in tmp_sizes] feature_map_sizes = [x[2] for x in tmp_sizes] From 6a7b9b42f817e0d7c22fc785fc92d6423aea8f0d Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Thu, 11 Mar 2021 18:01:44 +0000 Subject: [PATCH 11/92] Minor refactoring --- torchvision/models/detection/anchor_utils.py | 12 +++++++----- torchvision/models/detection/ssd.py | 1 + 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/torchvision/models/detection/anchor_utils.py b/torchvision/models/detection/anchor_utils.py index 1582b1d113c..b1f1cb7d77d 100644 --- a/torchvision/models/detection/anchor_utils.py +++ b/torchvision/models/detection/anchor_utils.py @@ -181,17 +181,19 @@ def __init__(self, size: int, feature_map_sizes: List[int], aspect_ratios: List[ self.scales = [c / 100 for c in centiles] # Default Boxes pre-calculation based on page 6 of SSD paper + clip01 = lambda x: max(min(x, 1.0), 0.0) self._dboxes = [] for k, f_k in enumerate(self.feature_map_sizes): # Adding the 2 default width-height pairs for aspect ratio 1 and scale s'k - s_prime_k = math.sqrt(self.scales[k] * self.scales[k + 1]) - wh_pairs = [(self.scales[k], self.scales[k]), (s_prime_k, s_prime_k)] + s_k = clip01(self.scales[k]) + s_prime_k = clip01(math.sqrt(self.scales[k] * self.scales[k + 1])) + wh_pairs = [(s_k, s_k), (s_prime_k, s_prime_k)] # Adding 2 pairs for each aspect ratio of the feature map k for ar in self.aspect_ratios[k]: sq_ar = math.sqrt(ar) - w = self.scales[k] * sq_ar - h = self.scales[k] / sq_ar + w = clip01(self.scales[k] * sq_ar) + h = clip01(self.scales[k] / sq_ar) wh_pairs.extend([(w, h), (h, w)]) # Now add the default boxes for each width-height pair @@ -199,6 +201,7 @@ def __init__(self, size: int, feature_map_sizes: List[int], aspect_ratios: List[ cx = (i + 0.5) / f_k cy = (j + 0.5) / f_k self._dboxes.extend((cx, cy, w, h) for w, h in wh_pairs) + # self._dboxes.extend((cx - 0.5 * w, cy - 0.5 * h, cx + 0.5 * w, cy + 0.5 * h) for w, h in wh_pairs) def __repr__(self) -> str: s = self.__class__.__name__ + '(' @@ -214,6 +217,5 @@ def forward(self, image_list: ImageList, feature_maps: List[Tensor]) -> List[Ten dboxes = [] for i in range(len(image_list.image_sizes)): dboxes_in_image = torch.tensor(self._dboxes, dtype=dtype, device=device) - dboxes_in_image.clamp_(min=0, max=1) 
dboxes.append(dboxes_in_image) return dboxes diff --git a/torchvision/models/detection/ssd.py b/torchvision/models/detection/ssd.py index 184234f8a30..ab09816726d 100644 --- a/torchvision/models/detection/ssd.py +++ b/torchvision/models/detection/ssd.py @@ -112,6 +112,7 @@ def forward(self, images: List[Tensor], class SSDFeatureExtractorVGG(nn.Module): + # TODO: That's the SSD300 extractor. handle the SDD500 case as well. See page 11, footernote 5. def __init__(self, backbone: nn.Module): super().__init__() _, _, maxpool3_pos, maxpool4_pos, _ = (i for i, layer in enumerate(backbone) if isinstance(layer, nn.MaxPool2d)) From e85e631fefc8def30a20624f215dd3c89080cd2f Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Thu, 11 Mar 2021 18:25:09 +0000 Subject: [PATCH 12/92] Making clamping between 0 and 1 optional. --- torchvision/models/detection/anchor_utils.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/torchvision/models/detection/anchor_utils.py b/torchvision/models/detection/anchor_utils.py index b1f1cb7d77d..c170aa0e222 100644 --- a/torchvision/models/detection/anchor_utils.py +++ b/torchvision/models/detection/anchor_utils.py @@ -163,7 +163,7 @@ def forward(self, image_list: ImageList, feature_maps: List[Tensor]) -> List[Ten class DBoxGenerator(nn.Module): def __init__(self, size: int, feature_map_sizes: List[int], aspect_ratios: List[List[int]], - min_ratio: float = 0.15, max_ratio: float = 0.9): + min_ratio: float = 0.15, max_ratio: float = 0.9, clip: bool = False): super().__init__() self.size = size self.feature_map_sizes = feature_map_sizes @@ -181,19 +181,19 @@ def __init__(self, size: int, feature_map_sizes: List[int], aspect_ratios: List[ self.scales = [c / 100 for c in centiles] # Default Boxes pre-calculation based on page 6 of SSD paper - clip01 = lambda x: max(min(x, 1.0), 0.0) + clamp01 = (lambda x: max(min(x, 1.0), 0.0)) if clip else (lambda x: x) self._dboxes = [] for k, f_k in enumerate(self.feature_map_sizes): # Adding the 2 default width-height pairs for aspect ratio 1 and scale s'k - s_k = clip01(self.scales[k]) - s_prime_k = clip01(math.sqrt(self.scales[k] * self.scales[k + 1])) + s_k = clamp01(self.scales[k]) + s_prime_k = clamp01(math.sqrt(self.scales[k] * self.scales[k + 1])) wh_pairs = [(s_k, s_k), (s_prime_k, s_prime_k)] # Adding 2 pairs for each aspect ratio of the feature map k for ar in self.aspect_ratios[k]: sq_ar = math.sqrt(ar) - w = clip01(self.scales[k] * sq_ar) - h = clip01(self.scales[k] / sq_ar) + w = clamp01(self.scales[k] * sq_ar) + h = clamp01(self.scales[k] / sq_ar) wh_pairs.extend([(w, h), (h, w)]) # Now add the default boxes for each width-height pair From 327e004850a3d0b56e8dfe0820e22eca70402309 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Fri, 12 Mar 2021 18:26:55 +0000 Subject: [PATCH 13/92] Change xywh to xyxy encoding. 
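The default boxes are now emitted directly in corner (x1, y1, x2, y2) form instead of (cx, cy, w, h), using the usual half-width/half-height shift. A tiny example with made-up box values:

    import torch

    # Two toy default boxes in (cx, cy, w, h) form, normalized to [0, 1].
    cxcywh = torch.tensor([[0.5, 0.5, 0.2, 0.2],
                           [0.1, 0.1, 0.3, 0.1]])

    cx, cy, w, h = cxcywh.unbind(-1)
    xyxy = torch.stack((cx - 0.5 * w, cy - 0.5 * h,
                        cx + 0.5 * w, cy + 0.5 * h), dim=-1)
    # -> [[0.40, 0.40, 0.60, 0.60],
    #     [-0.05, 0.05, 0.25, 0.15]]

Corner form is what the box coder and the IoU matching utilities work with, which is presumably why the generator produces it directly.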
--- torchvision/models/detection/anchor_utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/torchvision/models/detection/anchor_utils.py b/torchvision/models/detection/anchor_utils.py index c170aa0e222..8fcbf873d8b 100644 --- a/torchvision/models/detection/anchor_utils.py +++ b/torchvision/models/detection/anchor_utils.py @@ -200,8 +200,7 @@ def __init__(self, size: int, feature_map_sizes: List[int], aspect_ratios: List[ for i, j in itertools.product(range(f_k), repeat=2): cx = (i + 0.5) / f_k cy = (j + 0.5) / f_k - self._dboxes.extend((cx, cy, w, h) for w, h in wh_pairs) - # self._dboxes.extend((cx - 0.5 * w, cy - 0.5 * h, cx + 0.5 * w, cy + 0.5 * h) for w, h in wh_pairs) + self._dboxes.extend((cx - 0.5 * w, cy - 0.5 * h, cx + 0.5 * w, cy + 0.5 * h) for w, h in wh_pairs) def __repr__(self) -> str: s = self.__class__.__name__ + '(' From 11c98391dfadbd51bf8a58a77f461ed0ffd5bf54 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Fri, 12 Mar 2021 19:13:03 +0000 Subject: [PATCH 14/92] Adding parameters and reusing objects in constructor. --- torchvision/models/detection/ssd.py | 38 +++++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 5 deletions(-) diff --git a/torchvision/models/detection/ssd.py b/torchvision/models/detection/ssd.py index ab09816726d..05cd21102d3 100644 --- a/torchvision/models/detection/ssd.py +++ b/torchvision/models/detection/ssd.py @@ -4,7 +4,9 @@ from torch import nn, Tensor from typing import Any, Dict, List, Optional, Tuple +from . import _utils as det_utils from .anchor_utils import DBoxGenerator +from .transform import GeneralizedRCNNTransform from .backbone_utils import _validate_trainable_layers from .. import vgg @@ -64,8 +66,14 @@ def forward(self, x: List[Tensor]) -> Tensor: class SSD(nn.Module): - def __init__(self, backbone: nn.Module, num_classes: int, size: int = 300, - aspect_ratios: Optional[List[List[int]]] = None): + def __init__(self, backbone: nn.Module, num_classes: int, + size: int = 300, image_mean: Optional[List[float]] = None, image_std: Optional[List[float]] = None, + aspect_ratios: Optional[List[List[int]]] = None, + score_thresh: float = 0.01, + nms_thresh: float = 0.45, + detections_per_img: int = 200, + iou_thresh: float = 0.5, + topk_candidates: int = 400): super().__init__() if aspect_ratios is None: @@ -81,14 +89,34 @@ def __init__(self, backbone: nn.Module, num_classes: int, size: int = 300, assert len(feature_map_sizes) == len(aspect_ratios) self.backbone = backbone - self.num_classes = num_classes - self.aspect_ratios = aspect_ratios # Estimate num of anchors based on aspect ratios: 2 default boxes + 2 * ratios of feaure map. 
num_anchors = [2 + 2 * len(r) for r in aspect_ratios] self.head = SSDHead(out_channels, num_anchors, num_classes) - self.dbox_generator = DBoxGenerator(size, feature_map_sizes, aspect_ratios) + self.anchor_generator = DBoxGenerator(size, feature_map_sizes, aspect_ratios) + + self.proposal_matcher = det_utils.Matcher( + iou_thresh, + iou_thresh, + allow_low_quality_matches=True, + ) + + self.box_coder = det_utils.BoxCoder(weights=(10., 10., 5., 5.)) + + if image_mean is None: + image_mean = [0.485, 0.456, 0.406] + if image_std is None: + image_std = [0.229, 0.224, 0.225] + self.transform = GeneralizedRCNNTransform(size, size, image_mean, image_std) + + self.score_thresh = score_thresh + self.nms_thresh = nms_thresh + self.detections_per_img = detections_per_img + self.topk_candidates = topk_candidates + + # used only on torchscript mode + self._has_warned = False @torch.jit.unused def eager_outputs(self, losses, detections): From 34237e4a250a1eeca2b188ecb5555c00d36486b0 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Fri, 12 Mar 2021 19:59:38 +0000 Subject: [PATCH 15/92] Temporarily inherit from Retina to avoid dup code. --- torchvision/models/detection/ssd.py | 49 +++++++---------------------- 1 file changed, 11 insertions(+), 38 deletions(-) diff --git a/torchvision/models/detection/ssd.py b/torchvision/models/detection/ssd.py index 05cd21102d3..daac4f7a02f 100644 --- a/torchvision/models/detection/ssd.py +++ b/torchvision/models/detection/ssd.py @@ -1,39 +1,28 @@ import torch import torch.nn.functional as F +from collections import OrderedDict from torch import nn, Tensor from typing import Any, Dict, List, Optional, Tuple from . import _utils as det_utils from .anchor_utils import DBoxGenerator -from .transform import GeneralizedRCNNTransform from .backbone_utils import _validate_trainable_layers +from .transform import GeneralizedRCNNTransform from .. import vgg +from .retinanet import RetinaNet, RetinaNetHead # TODO: Refactor both to inherit properly + __all__ = ['SSD'] -class SSDHead(nn.Module): - # TODO: Similar to RetinaNetHead. Perhaps abstract and reuse for one-shot detectors. 
+class SSDHead(RetinaNetHead): def __init__(self, in_channels: List[int], num_anchors: List[int], num_classes: int): - super().__init__() + nn.Module.__init__(self) self.classification_head = SSDClassificationHead(in_channels, num_anchors, num_classes) self.regression_head = SSDRegressionHead(in_channels, num_anchors) - def compute_loss(self, targets: List[Dict[str, Tensor]], head_outputs: Dict[str, Tensor], anchors: List[Tensor], - matched_idxs: List[Tensor]) -> Dict[str, Tensor]: - return { - 'classification': self.classification_head.compute_loss(targets, head_outputs, matched_idxs), - 'bbox_regression': self.regression_head.compute_loss(targets, head_outputs, anchors, matched_idxs), - } - - def forward(self, x: List[Tensor]) -> Dict[str, Tensor]: - return { - 'cls_logits': self.classification_head(x), - 'bbox_regression': self.regression_head(x) - } - class SSDClassificationHead(nn.Module): def __init__(self, in_channels: List[int], num_anchors: List[int], num_classes: int): @@ -65,7 +54,7 @@ def forward(self, x: List[Tensor]) -> Tensor: pass -class SSD(nn.Module): +class SSD(RetinaNet): def __init__(self, backbone: nn.Module, num_classes: int, size: int = 300, image_mean: Optional[List[float]] = None, image_std: Optional[List[float]] = None, aspect_ratios: Optional[List[List[int]]] = None, @@ -74,7 +63,7 @@ def __init__(self, backbone: nn.Module, num_classes: int, detections_per_img: int = 200, iou_thresh: float = 0.5, topk_candidates: int = 400): - super().__init__() + nn.Module.__init__(self) if aspect_ratios is None: aspect_ratios = [[2], [2, 3], [2, 3], [2, 3], [2], [2]] @@ -82,7 +71,7 @@ def __init__(self, backbone: nn.Module, num_classes: int, # Use dummy data to retrieve the feature map sizes to avoid hard-coding their values device = next(backbone.parameters()).device tmp_img = torch.empty((1, 3, size, size), device=device) - tmp_sizes = [x.size() for x in backbone(tmp_img)] + tmp_sizes = [x.size() for x in backbone(tmp_img).values()] out_channels = [x[1] for x in tmp_sizes] feature_map_sizes = [x[2] for x in tmp_sizes] @@ -118,26 +107,10 @@ def __init__(self, backbone: nn.Module, num_classes: int, # used only on torchscript mode self._has_warned = False - @torch.jit.unused - def eager_outputs(self, losses, detections): - # type: (Dict[str, Tensor], List[Dict[str, Tensor]]) -> Tuple[Dict[str, Tensor], List[Dict[str, Tensor]]] - if self.training: - return losses - - return detections - def compute_loss(self, targets: List[Dict[str, Tensor]], head_outputs: Dict[str, Tensor], anchors: List[Tensor]) -> Dict[str, Tensor]: pass - def postprocess_detections(self, head_outputs: Dict[str, List[Tensor]], anchors: List[List[Tensor]], - image_shapes: List[Tuple[int, int]]) -> List[Dict[str, Tensor]]: - pass - - def forward(self, images: List[Tensor], - targets: Optional[List[Dict[str, Tensor]]] = None) -> Tuple[Dict[str, Tensor], List[Dict[str, Tensor]]]: - pass - class SSDFeatureExtractorVGG(nn.Module): # TODO: That's the SSD300 extractor. handle the SDD500 case as well. See page 11, footernote 5. 
@@ -188,7 +161,7 @@ def __init__(self, backbone: nn.Module): nn.ReLU(inplace=True), ) - def forward(self, x: Tensor) -> List[Tensor]: + def forward(self, x: Tensor) -> Dict[str, Tensor]: # L2 regularization + Rescaling of 1st block's feature map x = self.block1(x) rescaled = self.scale_weight.view(1, -1, 1, 1) * F.normalize(x) @@ -199,7 +172,7 @@ def forward(self, x: Tensor) -> List[Tensor]: x = block(x) output.append(x) - return output + return OrderedDict(((str(i), v) for i, v in enumerate(output))) def _vgg_backbone(backbone_name: str, pretrained: bool, trainable_layers: int = 3): From d3f345e62efe817d3cbae26d75db299d58c3b5f4 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Fri, 12 Mar 2021 21:37:05 +0000 Subject: [PATCH 16/92] Implement forward methods + temp workarounds to inherit from retina. --- torchvision/models/detection/retinanet.py | 17 +++-- torchvision/models/detection/ssd.py | 79 +++++++++++++++++------ torchvision/models/detection/transform.py | 5 +- 3 files changed, 74 insertions(+), 27 deletions(-) diff --git a/torchvision/models/detection/retinanet.py b/torchvision/models/detection/retinanet.py index f34db4ce970..446489db989 100644 --- a/torchvision/models/detection/retinanet.py +++ b/torchvision/models/detection/retinanet.py @@ -454,6 +454,15 @@ def postprocess_detections(self, head_outputs, anchors, image_shapes): return detections + def _anchors_per_level(self, features, HWA): + # recover level sizes + num_anchors_per_level = [x.size(2) * x.size(3) for x in features] + HW = 0 + for v in num_anchors_per_level: + HW += v + A = HWA // HW + return [hw * A for hw in num_anchors_per_level] + def forward(self, images, targets=None): # type: (List[Tensor], Optional[List[Dict[str, Tensor]]]) -> Tuple[Dict[str, Tensor], List[Dict[str, Tensor]]] """ @@ -531,13 +540,7 @@ def forward(self, images, targets=None): losses = self.compute_loss(targets, head_outputs, anchors) else: # recover level sizes - num_anchors_per_level = [x.size(2) * x.size(3) for x in features] - HW = 0 - for v in num_anchors_per_level: - HW += v - HWA = head_outputs['cls_logits'].size(1) - A = HWA // HW - num_anchors_per_level = [hw * A for hw in num_anchors_per_level] + num_anchors_per_level = self._anchors_per_level(features, head_outputs['cls_logits'].size(1)) # split outputs per level split_head_outputs: Dict[str, List[Tensor]] = {} diff --git a/torchvision/models/detection/ssd.py b/torchvision/models/detection/ssd.py index daac4f7a02f..058e581479e 100644 --- a/torchvision/models/detection/ssd.py +++ b/torchvision/models/detection/ssd.py @@ -24,35 +24,68 @@ def __init__(self, in_channels: List[int], num_anchors: List[int], num_classes: self.regression_head = SSDRegressionHead(in_channels, num_anchors) -class SSDClassificationHead(nn.Module): - def __init__(self, in_channels: List[int], num_anchors: List[int], num_classes: int): +class SSDScoringHead(nn.Module): + def __init__(self, module_list: nn.ModuleList, num_columns: int): super().__init__() - self.cls_logits = nn.ModuleList() + self.module_list = module_list + self.num_columns = num_columns + + def get_result_from_module_list(self, x: Tensor, idx: int) -> Tensor: + """ + This is equivalent to self.module_list[idx](x), + but torchscript doesn't support this yet + """ + num_blocks = len(self.module_list) + if idx < 0: + idx += num_blocks + i = 0 + out = x + for module in self.module_list: + if i == idx: + out = module(x) + i += 1 + return out + + def forward(self, x: List[Tensor]) -> Tensor: + all_results = [] + + for i, features in enumerate(x): 
+ results = self.get_result_from_module_list(features, i) + + # Permute output from (N, A * K, H, W) to (N, HWA, K). + N, _, H, W = results.shape + results = results.view(N, -1, self.num_columns, H, W) + results = results.permute(0, 3, 4, 1, 2) + results = results.reshape(N, -1, self.num_columns) # Size=(N, HWA, K) + + all_results.append(results) + + return torch.cat(all_results, dim=1) + + +class SSDClassificationHead(SSDScoringHead): + def __init__(self, in_channels: List[int], num_anchors: List[int], num_classes: int): + cls_logits = nn.ModuleList() for channels, anchors in zip(in_channels, num_anchors): - self.cls_logits.append(nn.Conv2d(channels, num_classes * anchors, kernel_size=3, padding=1)) + cls_logits.append(nn.Conv2d(channels, num_classes * anchors, kernel_size=3, padding=1)) + super().__init__(cls_logits, num_classes) def compute_loss(self, targets: List[Dict[str, Tensor]], head_outputs: Dict[str, Tensor], matched_idxs: List[Tensor]) -> Tensor: pass - def forward(self, x: List[Tensor]) -> Tensor: - pass - -class SSDRegressionHead(nn.Module): +class SSDRegressionHead(SSDScoringHead): def __init__(self, in_channels: List[int], num_anchors: List[int]): - super().__init__() - self.bbox_reg = nn.ModuleList() + bbox_reg = nn.ModuleList() for channels, anchors in zip(in_channels, num_anchors): - self.bbox_reg.append(nn.Conv2d(channels, 4 * anchors, kernel_size=3, padding=1)) + bbox_reg.append(nn.Conv2d(channels, 4 * anchors, kernel_size=3, padding=1)) + super().__init__(bbox_reg, 4) def compute_loss(self, targets: List[Dict[str, Tensor]], head_outputs: Dict[str, Tensor], anchors: List[Tensor], matched_idxs: List[Tensor]) -> Tensor: pass - def forward(self, x: List[Tensor]) -> Tensor: - pass - class SSD(RetinaNet): def __init__(self, backbone: nn.Module, num_classes: int, @@ -80,8 +113,8 @@ def __init__(self, backbone: nn.Module, num_classes: int, self.backbone = backbone # Estimate num of anchors based on aspect ratios: 2 default boxes + 2 * ratios of feaure map. 
- num_anchors = [2 + 2 * len(r) for r in aspect_ratios] - self.head = SSDHead(out_channels, num_anchors, num_classes) + self.num_anchors = [2 + 2 * len(r) for r in aspect_ratios] + self.head = SSDHead(out_channels, self.num_anchors, num_classes) self.anchor_generator = DBoxGenerator(size, feature_map_sizes, aspect_ratios) @@ -97,7 +130,8 @@ def __init__(self, backbone: nn.Module, num_classes: int, image_mean = [0.485, 0.456, 0.406] if image_std is None: image_std = [0.229, 0.224, 0.225] - self.transform = GeneralizedRCNNTransform(size, size, image_mean, image_std) + self.transform = GeneralizedRCNNTransform(size, size, image_mean, image_std, + size_divisible=1) # TODO: Discuss/refactor this workaround self.score_thresh = score_thresh self.nms_thresh = nms_thresh @@ -107,6 +141,15 @@ def __init__(self, backbone: nn.Module, num_classes: int, # used only on torchscript mode self._has_warned = False + def _anchors_per_level(self, features, HWA): + # TODO: Discuss/refactor this workaround + num_anchors_per_level = [x.size(2) * x.size(3) * anchors for x, anchors in zip(features, self.num_anchors)] + HW = 0 + for v in num_anchors_per_level: + HW += v + A = HWA // HW + return [hw * A for hw in num_anchors_per_level] + def compute_loss(self, targets: List[Dict[str, Tensor]], head_outputs: Dict[str, Tensor], anchors: List[Tensor]) -> Dict[str, Tensor]: pass @@ -203,7 +246,7 @@ def ssd_vgg16(pretrained: bool = False, progress: bool = True, num_classes: int pretrained_backbone = False backbone = _vgg_backbone("vgg16", pretrained_backbone, trainable_layers=trainable_backbone_layers) - model = SSD(backbone, num_classes, **kwargs) + model = SSD(backbone, num_classes, **kwargs) # TODO: fix initializations in all new layers if pretrained: pass # TODO: load pre-trained COCO weights return model diff --git a/torchvision/models/detection/transform.py b/torchvision/models/detection/transform.py index 215005637eb..2043fbd4ce3 100644 --- a/torchvision/models/detection/transform.py +++ b/torchvision/models/detection/transform.py @@ -66,7 +66,7 @@ class GeneralizedRCNNTransform(nn.Module): It returns a ImageList for the inputs, and a List[Dict[Tensor]] for the targets """ - def __init__(self, min_size, max_size, image_mean, image_std): + def __init__(self, min_size, max_size, image_mean, image_std, size_divisible=32): super(GeneralizedRCNNTransform, self).__init__() if not isinstance(min_size, (list, tuple)): min_size = (min_size,) @@ -74,6 +74,7 @@ def __init__(self, min_size, max_size, image_mean, image_std): self.max_size = max_size self.image_mean = image_mean self.image_std = image_std + self.size_divisible = size_divisible def forward(self, images, # type: List[Tensor] @@ -107,7 +108,7 @@ def forward(self, targets[i] = target_index image_sizes = [img.shape[-2:] for img in images] - images = self.batch_images(images) + images = self.batch_images(images, size_divisible=self.size_divisible) image_sizes_list: List[Tuple[int, int]] = [] for image_size in image_sizes: assert len(image_size) == 2 From ac251588ae2cdf11e4c878e150dc8813fd0d297f Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Fri, 12 Mar 2021 21:40:26 +0000 Subject: [PATCH 17/92] Inherit more methods from retinanet. 
--- torchvision/models/detection/ssd.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/torchvision/models/detection/ssd.py b/torchvision/models/detection/ssd.py index 058e581479e..0e70b840a50 100644 --- a/torchvision/models/detection/ssd.py +++ b/torchvision/models/detection/ssd.py @@ -150,10 +150,6 @@ def _anchors_per_level(self, features, HWA): A = HWA // HW return [hw * A for hw in num_anchors_per_level] - def compute_loss(self, targets: List[Dict[str, Tensor]], head_outputs: Dict[str, Tensor], - anchors: List[Tensor]) -> Dict[str, Tensor]: - pass - class SSDFeatureExtractorVGG(nn.Module): # TODO: That's the SSD300 extractor. handle the SDD500 case as well. See page 11, footernote 5. From eed06f4a2c2d56dedebe0cc274d438feb44325ba Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Wed, 31 Mar 2021 09:40:01 +0100 Subject: [PATCH 18/92] Fix type error. --- torchvision/models/detection/retinanet.py | 2 +- torchvision/models/detection/ssd.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/torchvision/models/detection/retinanet.py b/torchvision/models/detection/retinanet.py index 446489db989..8bbade08663 100644 --- a/torchvision/models/detection/retinanet.py +++ b/torchvision/models/detection/retinanet.py @@ -454,7 +454,7 @@ def postprocess_detections(self, head_outputs, anchors, image_shapes): return detections - def _anchors_per_level(self, features, HWA): + def _anchors_per_level(self, features: List[Tensor], HWA: int): # recover level sizes num_anchors_per_level = [x.size(2) * x.size(3) for x in features] HW = 0 diff --git a/torchvision/models/detection/ssd.py b/torchvision/models/detection/ssd.py index 0e70b840a50..12186447796 100644 --- a/torchvision/models/detection/ssd.py +++ b/torchvision/models/detection/ssd.py @@ -141,7 +141,7 @@ def __init__(self, backbone: nn.Module, num_classes: int, # used only on torchscript mode self._has_warned = False - def _anchors_per_level(self, features, HWA): + def _anchors_per_level(self, features: List[Tensor], HWA: int): # TODO: Discuss/refactor this workaround num_anchors_per_level = [x.size(2) * x.size(3) * anchors for x, anchors in zip(features, self.num_anchors)] HW = 0 From b185e91e6abdb5c2a0665d019270cd4b8b21441b Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Wed, 7 Apr 2021 12:28:41 +0100 Subject: [PATCH 19/92] Add Regression loss. 
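Note: the localization term follows the SSD paper. The regression head predicts offsets relative to each matched default box, the targets are the encoded (dx, dy, dw, dh) of the ground-truth box, and Smooth L1 is summed over positives and divided by their count. A standalone sketch on plain tensors; boxes are in (x1, y1, x2, y2) form and the (10, 10, 5, 5) weights mirror the BoxCoder set up in this patch.

import torch
import torch.nn.functional as F

def encode_sketch(matched_gt, anchors, weights=(10., 10., 5., 5.)):
    wx, wy, ww, wh = weights
    aw, ah = anchors[:, 2] - anchors[:, 0], anchors[:, 3] - anchors[:, 1]
    acx, acy = anchors[:, 0] + 0.5 * aw, anchors[:, 1] + 0.5 * ah
    gw, gh = matched_gt[:, 2] - matched_gt[:, 0], matched_gt[:, 3] - matched_gt[:, 1]
    gcx, gcy = matched_gt[:, 0] + 0.5 * gw, matched_gt[:, 1] + 0.5 * gh
    return torch.stack([wx * (gcx - acx) / aw,
                        wy * (gcy - acy) / ah,
                        ww * torch.log(gw / aw),
                        wh * torch.log(gh / ah)], dim=1)

def localization_loss_sketch(bbox_regression, matched_gt, anchors, num_foreground):
    # bbox_regression: (num_positives, 4) raw head outputs for the matched boxes
    target = encode_sketch(matched_gt, anchors)
    return F.smooth_l1_loss(bbox_regression, target, reduction='sum') / max(1, num_foreground)
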
--- torchvision/models/detection/retinanet.py | 4 ++- torchvision/models/detection/ssd.py | 33 ++++++++++++----------- 2 files changed, 20 insertions(+), 17 deletions(-) diff --git a/torchvision/models/detection/retinanet.py b/torchvision/models/detection/retinanet.py index 8bbade08663..a3c84b70811 100644 --- a/torchvision/models/detection/retinanet.py +++ b/torchvision/models/detection/retinanet.py @@ -176,12 +176,14 @@ def __init__(self, in_channels, num_anchors): torch.nn.init.zeros_(layer.bias) self.box_coder = det_utils.BoxCoder(weights=(1.0, 1.0, 1.0, 1.0)) + self._use_smooth_l1 = False def compute_loss(self, targets, head_outputs, anchors, matched_idxs): # type: (List[Dict[str, Tensor]], Dict[str, Tensor], List[Tensor], List[Tensor]) -> Tensor losses = [] bbox_regression = head_outputs['bbox_regression'] + l1_loss = torch.nn.functional.smooth_l1_loss if self._use_smooth_l1 else torch.nn.functional.l1_loss for targets_per_image, bbox_regression_per_image, anchors_per_image, matched_idxs_per_image in \ zip(targets, bbox_regression, anchors, matched_idxs): @@ -198,7 +200,7 @@ def compute_loss(self, targets, head_outputs, anchors, matched_idxs): target_regression = self.box_coder.encode_single(matched_gt_boxes_per_image, anchors_per_image) # compute the loss - losses.append(torch.nn.functional.l1_loss( + losses.append(l1_loss( bbox_regression_per_image, target_regression, reduction='sum' diff --git a/torchvision/models/detection/ssd.py b/torchvision/models/detection/ssd.py index 12186447796..ddee069b189 100644 --- a/torchvision/models/detection/ssd.py +++ b/torchvision/models/detection/ssd.py @@ -11,17 +11,18 @@ from .transform import GeneralizedRCNNTransform from .. import vgg -from .retinanet import RetinaNet, RetinaNetHead # TODO: Refactor both to inherit properly +from .retinanet import RetinaNet, RetinaNetHead, RetinaNetRegressionHead # TODO: Refactor to inherit properly __all__ = ['SSD'] class SSDHead(RetinaNetHead): - def __init__(self, in_channels: List[int], num_anchors: List[int], num_classes: int): + def __init__(self, in_channels: List[int], num_anchors: List[int], num_classes: int, positive_fraction: float, + box_coder: det_utils.BoxCoder): nn.Module.__init__(self) - self.classification_head = SSDClassificationHead(in_channels, num_anchors, num_classes) - self.regression_head = SSDRegressionHead(in_channels, num_anchors) + self.classification_head = SSDClassificationHead(in_channels, num_anchors, num_classes, positive_fraction) + self.regression_head = SSDRegressionHead(in_channels, num_anchors, box_coder) class SSDScoringHead(nn.Module): @@ -64,27 +65,26 @@ def forward(self, x: List[Tensor]) -> Tensor: class SSDClassificationHead(SSDScoringHead): - def __init__(self, in_channels: List[int], num_anchors: List[int], num_classes: int): + def __init__(self, in_channels: List[int], num_anchors: List[int], num_classes: int, positive_fraction: float): cls_logits = nn.ModuleList() for channels, anchors in zip(in_channels, num_anchors): cls_logits.append(nn.Conv2d(channels, num_classes * anchors, kernel_size=3, padding=1)) super().__init__(cls_logits, num_classes) + self.neg_to_pos_ratio = (1.0 - positive_fraction) / positive_fraction def compute_loss(self, targets: List[Dict[str, Tensor]], head_outputs: Dict[str, Tensor], matched_idxs: List[Tensor]) -> Tensor: pass -class SSDRegressionHead(SSDScoringHead): - def __init__(self, in_channels: List[int], num_anchors: List[int]): +class SSDRegressionHead(SSDScoringHead, RetinaNetRegressionHead): # TODO: Refactor to avoid multiple 
inheritance + def __init__(self, in_channels: List[int], num_anchors: List[int], box_coder: det_utils.BoxCoder): bbox_reg = nn.ModuleList() for channels, anchors in zip(in_channels, num_anchors): bbox_reg.append(nn.Conv2d(channels, 4 * anchors, kernel_size=3, padding=1)) - super().__init__(bbox_reg, 4) - - def compute_loss(self, targets: List[Dict[str, Tensor]], head_outputs: Dict[str, Tensor], anchors: List[Tensor], - matched_idxs: List[Tensor]) -> Tensor: - pass + SSDScoringHead.__init__(self, bbox_reg, 4) + self.box_coder = box_coder + self._use_smooth_l1 = True # TODO: Discuss/refactor this workaround class SSD(RetinaNet): @@ -95,7 +95,8 @@ def __init__(self, backbone: nn.Module, num_classes: int, nms_thresh: float = 0.45, detections_per_img: int = 200, iou_thresh: float = 0.5, - topk_candidates: int = 400): + topk_candidates: int = 400, + positive_fraction: float = 0.25): nn.Module.__init__(self) if aspect_ratios is None: @@ -112,9 +113,11 @@ def __init__(self, backbone: nn.Module, num_classes: int, self.backbone = backbone + self.box_coder = det_utils.BoxCoder(weights=(10., 10., 5., 5.)) + # Estimate num of anchors based on aspect ratios: 2 default boxes + 2 * ratios of feaure map. self.num_anchors = [2 + 2 * len(r) for r in aspect_ratios] - self.head = SSDHead(out_channels, self.num_anchors, num_classes) + self.head = SSDHead(out_channels, self.num_anchors, num_classes, positive_fraction, self.box_coder) self.anchor_generator = DBoxGenerator(size, feature_map_sizes, aspect_ratios) @@ -124,8 +127,6 @@ def __init__(self, backbone: nn.Module, num_classes: int, allow_low_quality_matches=True, ) - self.box_coder = det_utils.BoxCoder(weights=(10., 10., 5., 5.)) - if image_mean is None: image_mean = [0.485, 0.456, 0.406] if image_std is None: From 3a9166f2ea78944bd5c0f7d9bde1ed51132db8f5 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Wed, 7 Apr 2021 13:38:05 +0100 Subject: [PATCH 20/92] Fixing JIT issues. 
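Note: the scripting trouble comes from a local variable that holds either smooth_l1_loss or l1_loss as a function value; routing the choice through a method keeps the branch as a plain if/else that TorchScript can compile. A minimal sketch of the pattern, not the actual RetinaNet code:

import torch
import torch.nn.functional as F

class RegressionLossSketch(torch.nn.Module):
    def __init__(self, use_smooth_l1: bool = False):
        super().__init__()
        self._use_smooth_l1 = use_smooth_l1

    def forward(self, pred: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
        # branch at call time instead of storing a function object in a variable
        if self._use_smooth_l1:
            return F.smooth_l1_loss(pred, target, reduction='sum')
        return F.l1_loss(pred, target, reduction='sum')

scripted = torch.jit.script(RegressionLossSketch(True))  # should compile cleanly
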
--- torchvision/models/detection/retinanet.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/torchvision/models/detection/retinanet.py b/torchvision/models/detection/retinanet.py index a3c84b70811..164e481d4a2 100644 --- a/torchvision/models/detection/retinanet.py +++ b/torchvision/models/detection/retinanet.py @@ -178,12 +178,17 @@ def __init__(self, in_channels, num_anchors): self.box_coder = det_utils.BoxCoder(weights=(1.0, 1.0, 1.0, 1.0)) self._use_smooth_l1 = False + def _l1_loss(self, bbox_regression_per_image, target_regression): + if self._use_smooth_l1: + return torch.nn.functional.smooth_l1_loss(bbox_regression_per_image, target_regression, reduction='sum') + else: + return torch.nn.functional.l1_loss(bbox_regression_per_image, target_regression, reduction='sum') + def compute_loss(self, targets, head_outputs, anchors, matched_idxs): # type: (List[Dict[str, Tensor]], Dict[str, Tensor], List[Tensor], List[Tensor]) -> Tensor losses = [] bbox_regression = head_outputs['bbox_regression'] - l1_loss = torch.nn.functional.smooth_l1_loss if self._use_smooth_l1 else torch.nn.functional.l1_loss for targets_per_image, bbox_regression_per_image, anchors_per_image, matched_idxs_per_image in \ zip(targets, bbox_regression, anchors, matched_idxs): @@ -200,11 +205,7 @@ def compute_loss(self, targets, head_outputs, anchors, matched_idxs): target_regression = self.box_coder.encode_single(matched_gt_boxes_per_image, anchors_per_image) # compute the loss - losses.append(l1_loss( - bbox_regression_per_image, - target_regression, - reduction='sum' - ) / max(1, num_foreground)) + losses.append(self._l1_loss(bbox_regression_per_image, target_regression) / max(1, num_foreground)) return _sum(losses) / max(1, len(targets)) From a604ab4e7f950de6f7aa2be2360fa9b094a23fbd Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Wed, 7 Apr 2021 13:49:21 +0100 Subject: [PATCH 21/92] Change JIT workaround to minimize new code. 
--- torchvision/models/detection/retinanet.py | 14 ++++++-------- torchvision/models/detection/ssd.py | 2 +- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/torchvision/models/detection/retinanet.py b/torchvision/models/detection/retinanet.py index 164e481d4a2..901546defa3 100644 --- a/torchvision/models/detection/retinanet.py +++ b/torchvision/models/detection/retinanet.py @@ -176,13 +176,7 @@ def __init__(self, in_channels, num_anchors): torch.nn.init.zeros_(layer.bias) self.box_coder = det_utils.BoxCoder(weights=(1.0, 1.0, 1.0, 1.0)) - self._use_smooth_l1 = False - - def _l1_loss(self, bbox_regression_per_image, target_regression): - if self._use_smooth_l1: - return torch.nn.functional.smooth_l1_loss(bbox_regression_per_image, target_regression, reduction='sum') - else: - return torch.nn.functional.l1_loss(bbox_regression_per_image, target_regression, reduction='sum') + self._l1_loss = torch.nn.functional.l1_loss def compute_loss(self, targets, head_outputs, anchors, matched_idxs): # type: (List[Dict[str, Tensor]], Dict[str, Tensor], List[Tensor], List[Tensor]) -> Tensor @@ -205,7 +199,11 @@ def compute_loss(self, targets, head_outputs, anchors, matched_idxs): target_regression = self.box_coder.encode_single(matched_gt_boxes_per_image, anchors_per_image) # compute the loss - losses.append(self._l1_loss(bbox_regression_per_image, target_regression) / max(1, num_foreground)) + losses.append(self._l1_loss( + bbox_regression_per_image, + target_regression, + reduction='sum' + ) / max(1, num_foreground)) return _sum(losses) / max(1, len(targets)) diff --git a/torchvision/models/detection/ssd.py b/torchvision/models/detection/ssd.py index ddee069b189..8b6268a71a6 100644 --- a/torchvision/models/detection/ssd.py +++ b/torchvision/models/detection/ssd.py @@ -84,7 +84,7 @@ def __init__(self, in_channels: List[int], num_anchors: List[int], box_coder: de bbox_reg.append(nn.Conv2d(channels, 4 * anchors, kernel_size=3, padding=1)) SSDScoringHead.__init__(self, bbox_reg, 4) self.box_coder = box_coder - self._use_smooth_l1 = True # TODO: Discuss/refactor this workaround + self._l1_loss = torch.nn.functional.smooth_l1_loss # TODO: Discuss/refactor this workaround class SSD(RetinaNet): From 6e996d90dc2763f2b8dc230df862a205b2cdfd6d Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Wed, 7 Apr 2021 16:35:32 +0100 Subject: [PATCH 22/92] Fixing initialization bug. --- torchvision/models/detection/ssd.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torchvision/models/detection/ssd.py b/torchvision/models/detection/ssd.py index 8b6268a71a6..b7cfae93263 100644 --- a/torchvision/models/detection/ssd.py +++ b/torchvision/models/detection/ssd.py @@ -3,7 +3,7 @@ from collections import OrderedDict from torch import nn, Tensor -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Dict, List, Optional from . import _utils as det_utils from .anchor_utils import DBoxGenerator @@ -27,7 +27,7 @@ def __init__(self, in_channels: List[int], num_anchors: List[int], num_classes: class SSDScoringHead(nn.Module): def __init__(self, module_list: nn.ModuleList, num_columns: int): - super().__init__() + nn.Module.__init__(self) self.module_list = module_list self.num_columns = num_columns From c524ee10cf348ab019820b89042742be44dbc9b6 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Wed, 7 Apr 2021 18:00:24 +0100 Subject: [PATCH 23/92] Add classification loss. 
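Note: the classification term uses the paper's hard negative mining. Every positive default box contributes, while background boxes are ranked by their loss and only the hardest ones are kept, roughly three negatives per positive (positive_fraction=0.25 gives neg_to_pos_ratio=3). A self-contained sketch for a single image, on plain tensors rather than the real head outputs:

import torch
import torch.nn.functional as F

def hard_negative_classification_loss(cls_logits, matched_idxs, labels, neg_to_pos_ratio=3.0):
    # cls_logits: (num_anchors, num_classes); matched_idxs: (num_anchors,), -1 = background
    foreground = matched_idxs >= 0
    num_foreground = int(foreground.sum())

    # background anchors get class 0, matched anchors their ground-truth label
    gt_classes = torch.zeros(cls_logits.size(0), dtype=torch.long)
    gt_classes[foreground] = labels[matched_idxs[foreground]]

    per_anchor_loss = F.cross_entropy(cls_logits, gt_classes, reduction='none')

    # keep every positive, plus the hardest negatives up to the ratio
    num_negative = min(int(neg_to_pos_ratio * num_foreground), int((~foreground).sum()))
    negative_loss = per_anchor_loss[~foreground].topk(num_negative)[0]

    return (per_anchor_loss[foreground].sum() + negative_loss.sum()) / max(1, num_foreground)
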
--- torchvision/models/detection/ssd.py | 31 ++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/torchvision/models/detection/ssd.py b/torchvision/models/detection/ssd.py index b7cfae93263..1b41638d8ba 100644 --- a/torchvision/models/detection/ssd.py +++ b/torchvision/models/detection/ssd.py @@ -11,7 +11,7 @@ from .transform import GeneralizedRCNNTransform from .. import vgg -from .retinanet import RetinaNet, RetinaNetHead, RetinaNetRegressionHead # TODO: Refactor to inherit properly +from .retinanet import RetinaNet, RetinaNetHead, RetinaNetRegressionHead, _sum # TODO: Refactor to inherit properly __all__ = ['SSD'] @@ -74,7 +74,28 @@ def __init__(self, in_channels: List[int], num_anchors: List[int], num_classes: def compute_loss(self, targets: List[Dict[str, Tensor]], head_outputs: Dict[str, Tensor], matched_idxs: List[Tensor]) -> Tensor: - pass + losses = [] + + cls_logits = head_outputs['cls_logits'] + + for targets_per_image, cls_logits_per_image, matched_idxs_per_image in zip(targets, cls_logits, matched_idxs): + gt_classes_target = targets_per_image['labels'][matched_idxs_per_image] + classification_loss = F.cross_entropy(cls_logits_per_image, gt_classes_target, reduce=False) + + # Hard Negative Sampling + foreground_idxs_per_image = matched_idxs_per_image >= 0 + num_foreground = foreground_idxs_per_image.sum().item() + + background_idxs_per_image = torch.logical_not(foreground_idxs_per_image) + num_background = matched_idxs_per_image.size(0) - num_foreground + num_negative = min(num_background, int(self.neg_to_pos_ratio * num_foreground)) + + foreground_loss = classification_loss[foreground_idxs_per_image] + background_loss = classification_loss[background_idxs_per_image].sort(descending=True)[0][:num_negative] + + losses.append((foreground_loss.sum() + background_loss.sum()) / max(1, num_foreground)) + + return _sum(losses) / len(targets) class SSDRegressionHead(SSDScoringHead, RetinaNetRegressionHead): # TODO: Refactor to avoid multiple inheritance @@ -121,11 +142,7 @@ def __init__(self, backbone: nn.Module, num_classes: int, self.anchor_generator = DBoxGenerator(size, feature_map_sizes, aspect_ratios) - self.proposal_matcher = det_utils.Matcher( - iou_thresh, - iou_thresh, - allow_low_quality_matches=True, - ) + self.proposal_matcher = det_utils.Matcher(iou_thresh, iou_thresh) if image_mean is None: image_mean = [0.485, 0.456, 0.406] From 44d8a0b25e3e1e4366688b9080436fce3da6ec53 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Wed, 7 Apr 2021 20:38:26 +0100 Subject: [PATCH 24/92] Update todos. 
--- torchvision/models/detection/ssd.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/torchvision/models/detection/ssd.py b/torchvision/models/detection/ssd.py index 1b41638d8ba..09e58c31275 100644 --- a/torchvision/models/detection/ssd.py +++ b/torchvision/models/detection/ssd.py @@ -14,7 +14,7 @@ from .retinanet import RetinaNet, RetinaNetHead, RetinaNetRegressionHead, _sum # TODO: Refactor to inherit properly -__all__ = ['SSD'] +__all__ = ['SSD'] # TODO: Expose public methods, include it in models and write unit-tests for them class SSDHead(RetinaNetHead): @@ -31,7 +31,7 @@ def __init__(self, module_list: nn.ModuleList, num_columns: int): self.module_list = module_list self.num_columns = num_columns - def get_result_from_module_list(self, x: Tensor, idx: int) -> Tensor: + def _get_result_from_module_list(self, x: Tensor, idx: int) -> Tensor: """ This is equivalent to self.module_list[idx](x), but torchscript doesn't support this yet @@ -51,7 +51,7 @@ def forward(self, x: List[Tensor]) -> Tensor: all_results = [] for i, features in enumerate(x): - results = self.get_result_from_module_list(features, i) + results = self._get_result_from_module_list(features, i) # Permute output from (N, A * K, H, W) to (N, HWA, K). N, _, H, W = results.shape From 15b3ebf4b62b250b6ab9a5efeef5ddd66b3b7db6 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Thu, 8 Apr 2021 12:15:46 +0100 Subject: [PATCH 25/92] Add weight loading support. --- torchvision/models/detection/ssd.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/torchvision/models/detection/ssd.py b/torchvision/models/detection/ssd.py index 09e58c31275..d38b303d5a4 100644 --- a/torchvision/models/detection/ssd.py +++ b/torchvision/models/detection/ssd.py @@ -10,12 +10,17 @@ from .backbone_utils import _validate_trainable_layers from .transform import GeneralizedRCNNTransform from .. 
import vgg +from ..utils import load_state_dict_from_url from .retinanet import RetinaNet, RetinaNetHead, RetinaNetRegressionHead, _sum # TODO: Refactor to inherit properly __all__ = ['SSD'] # TODO: Expose public methods, include it in models and write unit-tests for them +model_urls = { + 'ssd300_vgg16_coco': None, # TODO: Add url with weights +} + class SSDHead(RetinaNetHead): def __init__(self, in_channels: List[int], num_anchors: List[int], num_classes: int, positive_fraction: float, @@ -250,8 +255,8 @@ def _vgg_backbone(backbone_name: str, pretrained: bool, trainable_layers: int = return SSDFeatureExtractorVGG(backbone) -def ssd_vgg16(pretrained: bool = False, progress: bool = True, num_classes: int = 91, pretrained_backbone: bool = True, - trainable_backbone_layers: Optional[int] = None, **kwargs: Any): +def ssd300_vgg16(pretrained: bool = False, progress: bool = True, num_classes: int = 91, + pretrained_backbone: bool = True, trainable_backbone_layers: Optional[int] = None, **kwargs: Any): trainable_backbone_layers = _validate_trainable_layers( pretrained or pretrained_backbone, trainable_backbone_layers, 5, 3) @@ -262,5 +267,9 @@ def ssd_vgg16(pretrained: bool = False, progress: bool = True, num_classes: int backbone = _vgg_backbone("vgg16", pretrained_backbone, trainable_layers=trainable_backbone_layers) model = SSD(backbone, num_classes, **kwargs) # TODO: fix initializations in all new layers if pretrained: - pass # TODO: load pre-trained COCO weights + weights_name = 'ssd300_vgg16_coco' + if model_urls.get(weights_name, None) is None: + raise ValueError("No checkpoint is available for model {}".format(weights_name)) + state_dict = load_state_dict_from_url(model_urls[weights_name], progress=progress) + model.load_state_dict(state_dict) return model From f67db925e379e56eb3fdd8c7205597c4ec86dac1 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Thu, 8 Apr 2021 13:38:29 +0100 Subject: [PATCH 26/92] Support SSD512. 
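Note: for the SSD512 variant the structural changes are the extra conv12_2 block and one more aspect-ratio group, so the extractor just grows its extra list when highres is set. The default-box scales per feature map follow the paper's linear rule; the sketch below only illustrates that rule with the min/max ratios visible in the DBoxGenerator defaults (0.15 and 0.9), and is not necessarily the exact generator code.

def dbox_scales(num_feature_maps, min_ratio=0.15, max_ratio=0.9):
    # s_k = s_min + (s_max - s_min) * k / (m - 1), k = 0..m-1  (SSD paper, Eq. 4)
    m = num_feature_maps
    return [round(min_ratio + (max_ratio - min_ratio) * k / (m - 1), 4) for k in range(m)]

print(dbox_scales(6))  # SSD300: six feature maps
print(dbox_scales(7))  # SSD512: conv12_2 adds a seventh
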
--- torchvision/models/detection/ssd.py | 96 +++++++++++++++++------------ 1 file changed, 56 insertions(+), 40 deletions(-) diff --git a/torchvision/models/detection/ssd.py b/torchvision/models/detection/ssd.py index d38b303d5a4..257a6422169 100644 --- a/torchvision/models/detection/ssd.py +++ b/torchvision/models/detection/ssd.py @@ -113,10 +113,15 @@ def __init__(self, in_channels: List[int], num_anchors: List[int], box_coder: de self._l1_loss = torch.nn.functional.smooth_l1_loss # TODO: Discuss/refactor this workaround +class SSDFeatureExtractor(nn.Module): + def __init__(self, aspect_ratios: List[List[int]]): + super().__init__() + self.aspect_ratios = aspect_ratios + + class SSD(RetinaNet): - def __init__(self, backbone: nn.Module, num_classes: int, + def __init__(self, backbone: SSDFeatureExtractor, num_classes: int, size: int = 300, image_mean: Optional[List[float]] = None, image_std: Optional[List[float]] = None, - aspect_ratios: Optional[List[List[int]]] = None, score_thresh: float = 0.01, nms_thresh: float = 0.45, detections_per_img: int = 200, @@ -125,9 +130,6 @@ def __init__(self, backbone: nn.Module, num_classes: int, positive_fraction: float = 0.25): nn.Module.__init__(self) - if aspect_ratios is None: - aspect_ratios = [[2], [2, 3], [2, 3], [2, 3], [2], [2]] - # Use dummy data to retrieve the feature map sizes to avoid hard-coding their values device = next(backbone.parameters()).device tmp_img = torch.empty((1, 3, size, size), device=device) @@ -135,17 +137,17 @@ def __init__(self, backbone: nn.Module, num_classes: int, out_channels = [x[1] for x in tmp_sizes] feature_map_sizes = [x[2] for x in tmp_sizes] - assert len(feature_map_sizes) == len(aspect_ratios) + assert len(feature_map_sizes) == len(backbone.aspect_ratios) self.backbone = backbone self.box_coder = det_utils.BoxCoder(weights=(10., 10., 5., 5.)) # Estimate num of anchors based on aspect ratios: 2 default boxes + 2 * ratios of feaure map. - self.num_anchors = [2 + 2 * len(r) for r in aspect_ratios] + self.num_anchors = [2 + 2 * len(r) for r in backbone.aspect_ratios] self.head = SSDHead(out_channels, self.num_anchors, num_classes, positive_fraction, self.box_coder) - self.anchor_generator = DBoxGenerator(size, feature_map_sizes, aspect_ratios) + self.anchor_generator = DBoxGenerator(size, feature_map_sizes, backbone.aspect_ratios) self.proposal_matcher = det_utils.Matcher(iou_thresh, iou_thresh) @@ -174,10 +176,9 @@ def _anchors_per_level(self, features: List[Tensor], HWA: int): return [hw * A for hw in num_anchors_per_level] -class SSDFeatureExtractorVGG(nn.Module): - # TODO: That's the SSD300 extractor. handle the SDD500 case as well. See page 11, footernote 5. 
- def __init__(self, backbone: nn.Module): - super().__init__() +class SSDFeatureExtractorVGG(SSDFeatureExtractor): + def __init__(self, backbone: nn.Module, extra: nn.ModuleList, aspect_ratios: List[List[int]]): + super().__init__(aspect_ratios) _, _, maxpool3_pos, maxpool4_pos, _ = (i for i, layer in enumerate(backbone) if isinstance(layer, nn.MaxPool2d)) # Patch ceil_mode for maxpool3 to get the same WxH output sizes as the paper @@ -187,10 +188,10 @@ def __init__(self, backbone: nn.Module): self.scale_weight = nn.Parameter(torch.ones(512) * 20) # Multiple Feature maps - page 4, Fig 2 of SSD paper - self.block1 = nn.Sequential( + self.features = nn.Sequential( *backbone[:maxpool4_pos] # until conv4_3 ) - self.block2 = nn.Sequential( + fc = nn.Sequential( *backbone[maxpool4_pos:-1], # until conv5_3, skip maxpool5 nn.MaxPool2d(kernel_size=3, stride=1, padding=1, ceil_mode=True), # add modified maxpool5 nn.Conv2d(in_channels=512, out_channels=1024, kernel_size=3, padding=6, dilation=6), # FC6 with atrous @@ -198,47 +199,62 @@ def __init__(self, backbone: nn.Module): nn.Conv2d(in_channels=1024, out_channels=1024, kernel_size=1), # FC7 nn.ReLU(inplace=True) ) - self.block3 = nn.Sequential( + extra.insert(0, fc) + self.extra = extra + + def forward(self, x: Tensor) -> Dict[str, Tensor]: + # L2 regularization + Rescaling of 1st block's feature map + x = self.features(x) + rescaled = self.scale_weight.view(1, -1, 1, 1) * F.normalize(x) + output = [rescaled] + + # Calculating Feature maps for the rest blocks + for block in self.extra: + x = block(x) + output.append(x) + + return OrderedDict(((str(i), v) for i, v in enumerate(output))) + + +def _vgg_backbone(backbone_name: str, highres: bool, pretrained: bool, trainable_layers: int = 3): + backbone = vgg.__dict__[backbone_name](pretrained=pretrained).features + # SDD300 case - page 4, Fig 2 of SSD paper + extra = nn.ModuleList([ + nn.Sequential( nn.Conv2d(1024, 256, kernel_size=1), nn.ReLU(inplace=True), nn.Conv2d(256, 512, kernel_size=3, padding=1, stride=2), # conv8_2 nn.ReLU(inplace=True), - ) - self.block4 = nn.Sequential( + ), + nn.Sequential( nn.Conv2d(512, 128, kernel_size=1), nn.ReLU(inplace=True), nn.Conv2d(128, 256, kernel_size=3, padding=1, stride=2), # conv9_2 nn.ReLU(inplace=True), - ) - self.block5 = nn.Sequential( + ), + nn.Sequential( nn.Conv2d(256, 128, kernel_size=1), nn.ReLU(inplace=True), nn.Conv2d(128, 256, kernel_size=3), # conv10_2 nn.ReLU(inplace=True), - ) - self.block6 = nn.Sequential( + ), + nn.Sequential( nn.Conv2d(256, 128, kernel_size=1), nn.ReLU(inplace=True), nn.Conv2d(128, 256, kernel_size=3), # conv11_2 nn.ReLU(inplace=True), ) - - def forward(self, x: Tensor) -> Dict[str, Tensor]: - # L2 regularization + Rescaling of 1st block's feature map - x = self.block1(x) - rescaled = self.scale_weight.view(1, -1, 1, 1) * F.normalize(x) - output = [rescaled] - - # Calculating Feature maps for the rest blocks - for block in (self.block2, self.block3, self.block4, self.block5, self.block6): - x = block(x) - output.append(x) - - return OrderedDict(((str(i), v) for i, v in enumerate(output))) - - -def _vgg_backbone(backbone_name: str, pretrained: bool, trainable_layers: int = 3): - backbone = vgg.__dict__[backbone_name](pretrained=pretrained).features + ]) + aspect_ratios = [[2], [2, 3], [2, 3], [2, 3], [2], [2]] + if highres: + # Additional layers for the SDD512 case. See page 11, footernote 5. 
+ extra.append(nn.Sequential( + nn.Conv2d(256, 128, kernel_size=1), + nn.ReLU(inplace=True), + nn.Conv2d(128, 256, kernel_size=3), # conv12_2 + nn.ReLU(inplace=True), + )) + aspect_ratios.append([2]) # Gather the indices of maxpools. These are the locations of output blocks. stage_indices = [i for i, b in enumerate(backbone) if isinstance(b, nn.MaxPool2d)] @@ -252,7 +268,7 @@ def _vgg_backbone(backbone_name: str, pretrained: bool, trainable_layers: int = for parameter in b.parameters(): parameter.requires_grad_(False) - return SSDFeatureExtractorVGG(backbone) + return SSDFeatureExtractorVGG(backbone, extra, aspect_ratios) def ssd300_vgg16(pretrained: bool = False, progress: bool = True, num_classes: int = 91, @@ -264,7 +280,7 @@ def ssd300_vgg16(pretrained: bool = False, progress: bool = True, num_classes: i # no need to download the backbone if pretrained is set pretrained_backbone = False - backbone = _vgg_backbone("vgg16", pretrained_backbone, trainable_layers=trainable_backbone_layers) + backbone = _vgg_backbone("vgg16", False, pretrained_backbone, trainable_layers=trainable_backbone_layers) model = SSD(backbone, num_classes, **kwargs) # TODO: fix initializations in all new layers if pretrained: weights_name = 'ssd300_vgg16_coco' From d19144d68fbc78e115531849bf35440ae69db0d2 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Thu, 8 Apr 2021 13:51:34 +0100 Subject: [PATCH 27/92] Change kernel_size to get output size 1x1 --- torchvision/models/detection/ssd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchvision/models/detection/ssd.py b/torchvision/models/detection/ssd.py index 257a6422169..bc10b36db9a 100644 --- a/torchvision/models/detection/ssd.py +++ b/torchvision/models/detection/ssd.py @@ -251,7 +251,7 @@ def _vgg_backbone(backbone_name: str, highres: bool, pretrained: bool, trainable extra.append(nn.Sequential( nn.Conv2d(256, 128, kernel_size=1), nn.ReLU(inplace=True), - nn.Conv2d(128, 256, kernel_size=3), # conv12_2 + nn.Conv2d(128, 256, kernel_size=4), # conv12_2 nn.ReLU(inplace=True), )) aspect_ratios.append([2]) From 661eb31f1c09aecd3002d8ba10771a03c8e0b861 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Thu, 8 Apr 2021 14:16:45 +0100 Subject: [PATCH 28/92] Add xavier init and refactoring. 
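Note: all freshly added convolutions (the scoring heads, the FC6/FC7 replacement and the extra blocks) get Xavier-uniform weights and zero biases via the _xavier_init helper. Below is a slightly more general sketch of the same idea that recurses through submodules instead of only direct children:

from torch import nn

def xavier_init_sketch(module: nn.Module) -> None:
    for layer in module.modules():
        if isinstance(layer, nn.Conv2d):
            nn.init.xavier_uniform_(layer.weight)
            if layer.bias is not None:
                nn.init.constant_(layer.bias, 0.0)

# e.g. a 3x3 scoring conv for 512 input channels and 6 anchors x 4 box coordinates
head = nn.Sequential(nn.Conv2d(512, 6 * 4, kernel_size=3, padding=1))
xavier_init_sketch(head)
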
--- torchvision/models/detection/ssd.py | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/torchvision/models/detection/ssd.py b/torchvision/models/detection/ssd.py index bc10b36db9a..e2c24859ff1 100644 --- a/torchvision/models/detection/ssd.py +++ b/torchvision/models/detection/ssd.py @@ -15,13 +15,20 @@ from .retinanet import RetinaNet, RetinaNetHead, RetinaNetRegressionHead, _sum # TODO: Refactor to inherit properly -__all__ = ['SSD'] # TODO: Expose public methods, include it in models and write unit-tests for them +__all__ = ['SSD', 'ssd300_vgg16'] # FIXME: Expose public methods in models and write unit-tests for them model_urls = { 'ssd300_vgg16_coco': None, # TODO: Add url with weights } +def _xavier_init(conv: nn.Module): + for layer in conv.children(): + if isinstance(layer, nn.Conv2d): + torch.nn.init.xavier_uniform_(layer.weight) + torch.nn.init.constant_(layer.bias, 0) + + class SSDHead(RetinaNetHead): def __init__(self, in_channels: List[int], num_anchors: List[int], num_classes: int, positive_fraction: float, box_coder: det_utils.BoxCoder): @@ -74,6 +81,7 @@ def __init__(self, in_channels: List[int], num_anchors: List[int], num_classes: cls_logits = nn.ModuleList() for channels, anchors in zip(in_channels, num_anchors): cls_logits.append(nn.Conv2d(channels, num_classes * anchors, kernel_size=3, padding=1)) + _xavier_init(cls_logits) super().__init__(cls_logits, num_classes) self.neg_to_pos_ratio = (1.0 - positive_fraction) / positive_fraction @@ -108,6 +116,7 @@ def __init__(self, in_channels: List[int], num_anchors: List[int], box_coder: de bbox_reg = nn.ModuleList() for channels, anchors in zip(in_channels, num_anchors): bbox_reg.append(nn.Conv2d(channels, 4 * anchors, kernel_size=3, padding=1)) + _xavier_init(bbox_reg) SSDScoringHead.__init__(self, bbox_reg, 4) self.box_coder = box_coder self._l1_loss = torch.nn.functional.smooth_l1_loss # TODO: Discuss/refactor this workaround @@ -120,8 +129,8 @@ def __init__(self, aspect_ratios: List[List[int]]): class SSD(RetinaNet): - def __init__(self, backbone: SSDFeatureExtractor, num_classes: int, - size: int = 300, image_mean: Optional[List[float]] = None, image_std: Optional[List[float]] = None, + def __init__(self, backbone: SSDFeatureExtractor, size: int, num_classes: int, + image_mean: Optional[List[float]] = None, image_std: Optional[List[float]] = None, score_thresh: float = 0.01, nms_thresh: float = 0.45, detections_per_img: int = 200, @@ -192,14 +201,17 @@ def __init__(self, backbone: nn.Module, extra: nn.ModuleList, aspect_ratios: Lis *backbone[:maxpool4_pos] # until conv4_3 ) fc = nn.Sequential( - *backbone[maxpool4_pos:-1], # until conv5_3, skip maxpool5 nn.MaxPool2d(kernel_size=3, stride=1, padding=1, ceil_mode=True), # add modified maxpool5 nn.Conv2d(in_channels=512, out_channels=1024, kernel_size=3, padding=6, dilation=6), # FC6 with atrous nn.ReLU(inplace=True), nn.Conv2d(in_channels=1024, out_channels=1024, kernel_size=1), # FC7 nn.ReLU(inplace=True) ) - extra.insert(0, fc) + _xavier_init(fc) + extra.insert(0, nn.Sequential( + *backbone[maxpool4_pos:-1], # until conv5_3, skip maxpool5 + fc, + )) self.extra = extra def forward(self, x: Tensor) -> Dict[str, Tensor]: @@ -216,7 +228,7 @@ def forward(self, x: Tensor) -> Dict[str, Tensor]: return OrderedDict(((str(i), v) for i, v in enumerate(output))) -def _vgg_backbone(backbone_name: str, highres: bool, pretrained: bool, trainable_layers: int = 3): +def _vgg_extractor(backbone_name: str, highres: bool, pretrained: bool, 
trainable_layers: int = 3): backbone = vgg.__dict__[backbone_name](pretrained=pretrained).features # SDD300 case - page 4, Fig 2 of SSD paper extra = nn.ModuleList([ @@ -255,6 +267,7 @@ def _vgg_backbone(backbone_name: str, highres: bool, pretrained: bool, trainable nn.ReLU(inplace=True), )) aspect_ratios.append([2]) + _xavier_init(extra) # Gather the indices of maxpools. These are the locations of output blocks. stage_indices = [i for i, b in enumerate(backbone) if isinstance(b, nn.MaxPool2d)] @@ -280,8 +293,8 @@ def ssd300_vgg16(pretrained: bool = False, progress: bool = True, num_classes: i # no need to download the backbone if pretrained is set pretrained_backbone = False - backbone = _vgg_backbone("vgg16", False, pretrained_backbone, trainable_layers=trainable_backbone_layers) - model = SSD(backbone, num_classes, **kwargs) # TODO: fix initializations in all new layers + backbone = _vgg_extractor("vgg16", False, pretrained_backbone, trainable_layers=trainable_backbone_layers) + model = SSD(backbone, 300, num_classes, **kwargs) if pretrained: weights_name = 'ssd300_vgg16_coco' if model_urls.get(weights_name, None) is None: From dcdd04d0d418f1790a6d46b8426757b95a1ab0e3 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Thu, 8 Apr 2021 15:00:21 +0100 Subject: [PATCH 29/92] Adding unit-tests and fixing JIT issues. --- .../ModelTester.test_ssd300_vgg16_expect.pkl | Bin 0 -> 6925 bytes test/test_models.py | 1 + test/test_models_detection_negative_samples.py | 9 +++++++++ torchvision/models/detection/__init__.py | 1 + torchvision/models/detection/anchor_utils.py | 2 +- torchvision/models/detection/ssd.py | 7 +++++-- 6 files changed, 17 insertions(+), 3 deletions(-) create mode 100644 test/expect/ModelTester.test_ssd300_vgg16_expect.pkl diff --git a/test/expect/ModelTester.test_ssd300_vgg16_expect.pkl b/test/expect/ModelTester.test_ssd300_vgg16_expect.pkl new file mode 100644 index 0000000000000000000000000000000000000000..e2fc9c51aa97975bdd7e112c48234b57c593abd9 GIT binary patch literal 6925 zcmeHMcT|)|*I&w_A{M|Hng|va3m{5S;JH6QMG&Pa*jSM!RbhFaRWxE-F$$u91&o6F z#txE%SP+O{L5d}y2&iuqFd9WgM2!WM_gRgJ@*UrtlXJd&`Qw{&erI;d{mp&u%-lOW z0NjMmhb5w5-d@E zC!*>U8RgGSc~_|_5l-*L_c6B^VKr*h2#Zk`)}t*gEF>z^mH67gXsM*jWCJBm($&*- zkZT`T<<*jIuEwqkt0k&y0t2HWBZC9Eu&DX0s(VaOa7=K}xUfJ@RnmQkmqd+|^zf4O zoUS5QATTP1SGiXxuTiLSXr~daMw*YX9x3S^+Qqeulc;;Tc603$FVS#SjhASy36*p8 z;Uu4VN&3F!iu4Z%j$|dDzvKGtizY?Q5BdFrv^sr|xzz|O3yJnSdY#|sb>;MWYv0oM z<0SfClKxVEQxDfJT|I{A=7uQ<1af$Ma^my^JwBQ^vp)>@c04D?ZhuT-cisl_G~`_- zcwheC&JBMX?|uHM@&2s+Q*!_G`oC&#{l9sh{^@r7r|2hlN%Hh}>GOX5{Moqw*Zb$a ze(&||Y@binzwh^6{(brX^S@`Eb@nsw=b`g@l82?|!682Od@z;czl9~MPlctoaox#r zzVg!}qT(z_ghdzj+|%OtIowD#>TxYxq9Kd5;q9Gq8GFzA#$_`zsY`inRgSk}?UcCb+H8NNoys z*pQb+(F>Z%_8f$+|H;t`$F2h zg6i!fkOvJ&#&kzry9LIt(ZODw;}r6>FAj>@X-857l&xjdXO9-@TubT1HW9kyThULi zvZ!g#PP#w1jAnnYN8=(K;Ag*))-|}pP16vwW5ck2qCJ%D7J(Loz+r|MHxZ9L--hCB zvM*j$YEz5hQcSJ%p&z1Vp};Z_X^C-&%aPz%qJV7doiSHGo*E}j!=!ICpjY9IZ?{|E zu5A#$nkG&O^C9%hkKi|)ia>2 zRbrIuTTn>;5^S!RPmdmmFqazW`DABsE^o+jOD36{Ti{0TMhZ4_gx*mx)UVf5d-PId zYYNcZE1POU*D)c@s3}&%-27=)7`4WmGo#C417$eJwPwcGcVU&i>D@xp|CHEu;xl8c`8~$Mc0fHdT(MR`MXGVTtG%H(G)+Ygu-g_ zXopuLr7W$dFBIsXEgpDne3M%J1Mq6R1|}|-pd#}ueOA=Q6t@fEHr$4654l4> zEdcQ&qLDi@28H)5sm59xs*ahYwOoYIGrm~1(HEwNm%wsR1g;;mBr69ESP6De%o79D zElQ>7)hp@X$;&ip>kc{)q7LhR>lp6_OC)9ig1Lt+h}yHE!~E&Tp^riI;YafoRUVOg*U zZ!||y)FfxD*%3wSRm7NmBAkq0#UW~GDydq}#srnKH10?oBnN(`8RIhPSm6ZJ`rIay z?$glPr<(M(dr;15H4MmDMWffOBJQw;;T#wS=14WNJw7 zkMpVD(CzCc5V)jJno%GIWmePYsxj#KVD1K@hMR0bWt&!#;>jp#jnsgs#2Hj) 
zfvRghXyrU8__7hsZNq43Ot~geZR8x$pvo_F-dCSoL<`C^aKGja{Q$w@XGH0#V zFh|Ea!d#q8vyTSgSbYv19V5g!CuOAnYJ_1!G~pR9g2<+t%$%KZWKb)uKje(90S?G5 z=VQoZcO>xsuc~bV`kF7rqf#GidZLZwYsvKBlr1Wq`0=inP==#6PA?WBZGs4^W1moX z(MZP6+z=bIe`I^ie2F7U9mUAvhfzgW@Uf5a!4*de3+}jde$9 zqjDNE@ta$;u)r6kQ!g+xeu~3(^yhu74f4LV!QIhf2#Q_6_HL%N-o6M6(#D~q;cw%Q zO3z#hP3&M)Weud2C8K4L7Rc;TMiCZ{2)~#@;z4=Tf_Bmu_o8|5*h7rC^LN>s@eMc$xaY z3d5)!kEvg=4&29A(^{D=lA2qn^k*?*2Zh1p$`ZtEU4)*i-7#{K22|!r&|+@@?*oHj z8mmKTi+TUfdDH$5A+8Bz^kSM31ob1E6xqPN?G4SE;DSDTcT;NHP{w%WQ_8Bl z%rG;XDeZMfbq&x%#d(x zXf2*kF5QfwGuE0OWq2e0MFt%c24QQ274<91qf2#}l-N>3rB_UG{Z<%4B$IKVfsaQs zHqt8BOe!+GLI({B$Yk&-CiF%!b6Aa~dFjfuVYG})AL`OT-LtgL-iuxo9;Cp^465*J zq3(tGG^DpXdOq;RjJ`2gx-$qW6CRRYst{fHJ#zkzCcie@Nx0P+7x}ni7ViUnG>~6r z20iMliCs)SZIX^71S>CUpmDDoiq{%o`Q{b$e6JB=N0-sq`xZFR#{pYR zUQ@)iKx`JNVR5z%20W~%&{txd9ruQee+sAQuzI?&^#{s5*}>bGkFUe3Ng*W;e*bz* zCfmy>{aaVxoEvI9b92lC-1 z0MGezb##0cJv+ai2|dvhS{D^*Cx6};W@~`eMLtN0G(}TJ4&5y=0=KA{zFK67nd~`6 z-*_IKJ2s!zhNM#H1_Nk2d!uQmJ$^YAh%el-C^yF&)@ChqBQ*%&QSRh3B$M_%&!RhL zj?>jslPE_slIrTSDJ^g#&E(_m=R-~4#9ClaupQR#os4v`JKFC{aB6@UPm89|kCxiB z^7cXcE`AzCxX(h;#AK!|dKv7;D%1E&yQoCb9}UynD2YFRSL!6w)$!dCP+d#n=h{$< zinTj9Vs9Dwvhk-`TlC?)#YjefgFG3|IfQy z#=rLer+LsqJm07QGl46M^-)<_!4HGF|50hdj4=Y_M4r@!1swqWp9&>x~?W$5#z2T7knnM(i z%9k7-zu>U%F^4!FU3mmP;$ZQR|NVeNViSj)dmQ%iNadk+o5S5Y4$o^jcwgo)6L7IMfa;BfpH2X`JDkMQFi;-JN2;0_LFzT#k<%AqQSL+N@BLwWSz zQI^EPfyeiW92#UCM({Yff`i^d4!S#-3p z@OsFi^Z^UWeHQocvWWbJ#kM;vaEnE8Ba0~wEE4NkBwc55yN-nlkJcI%>v`G#ks@wxqWrlbw@}5Et4_5_w_WPU4hnM!f z@~56ZQj69fOux;BhpURp@ZWmqE1CS>SN`xMPw(>?ykX?YOZ#2U d%gLXb1X()o%JOCpSE2m%hnEow str: s = self.__class__.__name__ + '(' diff --git a/torchvision/models/detection/ssd.py b/torchvision/models/detection/ssd.py index e2c24859ff1..f5cbe7d227d 100644 --- a/torchvision/models/detection/ssd.py +++ b/torchvision/models/detection/ssd.py @@ -15,7 +15,7 @@ from .retinanet import RetinaNet, RetinaNetHead, RetinaNetRegressionHead, _sum # TODO: Refactor to inherit properly -__all__ = ['SSD', 'ssd300_vgg16'] # FIXME: Expose public methods in models and write unit-tests for them +__all__ = ['SSD', 'ssd300_vgg16'] model_urls = { 'ssd300_vgg16_coco': None, # TODO: Add url with weights @@ -92,6 +92,9 @@ def compute_loss(self, targets: List[Dict[str, Tensor]], head_outputs: Dict[str, cls_logits = head_outputs['cls_logits'] for targets_per_image, cls_logits_per_image, matched_idxs_per_image in zip(targets, cls_logits, matched_idxs): + if targets_per_image['labels'].numel() == 0: # TODO: Check this handles empty labels properly + losses.append(torch.zeros((1, ), dtype=cls_logits_per_image.dtype, device=cls_logits_per_image.device)) + continue gt_classes_target = targets_per_image['labels'][matched_idxs_per_image] classification_loss = F.cross_entropy(cls_logits_per_image, gt_classes_target, reduce=False) @@ -225,7 +228,7 @@ def forward(self, x: Tensor) -> Dict[str, Tensor]: x = block(x) output.append(x) - return OrderedDict(((str(i), v) for i, v in enumerate(output))) + return OrderedDict([(str(i), v) for i, v in enumerate(output)]) def _vgg_extractor(backbone_name: str, highres: bool, pretrained: bool, trainable_layers: int = 3): From 5b5e8f8f5b98c5f0d1140abcd77360344d3bdd07 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Thu, 8 Apr 2021 16:43:45 +0100 Subject: [PATCH 30/92] Add a test for dbox generator. 
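Note: for each feature-map cell the generator emits 2 + 2 * len(ratios) boxes: a square at scale s_k, a square at sqrt(s_k * s_{k+1}), and a wide/tall pair for every extra ratio. The sketch below illustrates that rule for a single cell; the scale values are picked by hand so the output lines up with the normalized coordinates used in the test added below, they are not read from the DBoxGenerator code.

import math

def cell_dboxes_xyxy(cx, cy, s_k, s_k1, ratios):
    wh = [(s_k, s_k), (math.sqrt(s_k * s_k1), math.sqrt(s_k * s_k1))]
    for r in ratios:
        wh.append((s_k * math.sqrt(r), s_k / math.sqrt(r)))
        wh.append((s_k / math.sqrt(r), s_k * math.sqrt(r)))
    return [(cx - w / 2, cy - h / 2, cx + w / 2, cy + h / 2) for w, h in wh]

# a 1x1 feature map with ratios [2] -> 4 boxes centred at (0.5, 0.5)
for box in cell_dboxes_xyxy(0.5, 0.5, 0.07, 0.15, [2]):
    print([round(v, 4) for v in box])
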
--- test/test_models_detection_anchor_utils.py | 34 +++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/test/test_models_detection_anchor_utils.py b/test/test_models_detection_anchor_utils.py index 872a57c1365..abedadd2a9b 100644 --- a/test/test_models_detection_anchor_utils.py +++ b/test/test_models_detection_anchor_utils.py @@ -1,7 +1,7 @@ from collections import OrderedDict import torch from common_utils import TestCase -from torchvision.models.detection.anchor_utils import AnchorGenerator +from torchvision.models.detection.anchor_utils import AnchorGenerator, DBoxGenerator from torchvision.models.detection.image_list import ImageList @@ -22,6 +22,14 @@ def _init_test_anchor_generator(self): return anchor_generator + def _init_test_dbox_generator(self): + size = 300 + feature_map_sizes = [1] + aspect_ratios = [[2]] + dbox_generator = DBoxGenerator(size, feature_map_sizes, aspect_ratios) + + return dbox_generator + def get_features(self, images): s0, s1 = images.shape[-2:] features = [torch.rand(2, 8, s0 // 5, s1 // 5)] @@ -59,3 +67,27 @@ def test_anchor_generator(self): self.assertEqual(tuple(anchors[1].shape), (9, 4)) self.assertEqual(anchors[0], anchors_output) self.assertEqual(anchors[1], anchors_output) + + def test_dbox_generator(self): + images = torch.randn(2, 3, 15, 15) + features = self.get_features(images) + image_shapes = [i.shape[-2:] for i in images] + images = ImageList(images, image_shapes) + + model = self._init_test_dbox_generator() + model.eval() + dboxes = model(images, features) + + dboxes_output = torch.tensor([ + [0.4650, 0.4650, 0.5350, 0.5350], + [0.4488, 0.4488, 0.5512, 0.5512], + [0.4505, 0.4753, 0.5495, 0.5247], + [0.4753, 0.4505, 0.5247, 0.5495] + ]) + + tol = 0.0001 + self.assertEqual(len(dboxes), 2) + self.assertEqual(tuple(dboxes[0].shape), (4, 4)) + self.assertEqual(tuple(dboxes[1].shape), (4, 4)) + self.assertTrue(dboxes[0].allclose(dboxes_output, atol=tol)) + self.assertTrue(dboxes[1].allclose(dboxes_output, atol=tol)) From c70980515ec83deb48419bb52a2bb50e8ce3359d Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Thu, 8 Apr 2021 16:45:38 +0100 Subject: [PATCH 31/92] Remove unnecessary import. --- test/test_models_detection_anchor_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/test_models_detection_anchor_utils.py b/test/test_models_detection_anchor_utils.py index abedadd2a9b..4d29744d28a 100644 --- a/test/test_models_detection_anchor_utils.py +++ b/test/test_models_detection_anchor_utils.py @@ -1,4 +1,3 @@ -from collections import OrderedDict import torch from common_utils import TestCase from torchvision.models.detection.anchor_utils import AnchorGenerator, DBoxGenerator From 2d0f267e8ab06336094fb0be232f46ec08e65e0f Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Fri, 9 Apr 2021 15:56:35 +0100 Subject: [PATCH 32/92] Workaround on GeneralizedRCNNTransform to support fixed size input. 
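Note: SSD expects a fixed square input (300 or 512 on a side), whereas GeneralizedRCNNTransform normally rescales so the shorter edge reaches min_size without the longer edge exceeding max_size. When min_size == max_size the workaround passes an explicit size to F.interpolate instead of a scale factor. A small contrast of the two calls; the 1333 max_size is just the usual detection default, used here for illustration.

import torch
import torch.nn.functional as F

img = torch.rand(1, 3, 480, 640)

# aspect-preserving mode (default detection behaviour): scale the short side towards 300
scale = min(300 / min(img.shape[-2:]), 1333 / max(img.shape[-2:]))
scaled = F.interpolate(img, scale_factor=scale, mode='bilinear',
                       recompute_scale_factor=True, align_corners=False)

# fixed-size mode used for SSD300: both sides forced to 300, aspect ratio not preserved
fixed = F.interpolate(img, size=[300, 300], mode='bilinear', align_corners=False)

print(scaled.shape, fixed.shape)
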
--- torchvision/models/detection/transform.py | 30 ++++++++++++++--------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/torchvision/models/detection/transform.py b/torchvision/models/detection/transform.py index 45779263c85..b7058babb38 100644 --- a/torchvision/models/detection/transform.py +++ b/torchvision/models/detection/transform.py @@ -30,25 +30,33 @@ def _resize_image_and_masks(image, self_min_size, self_max_size, target): else: im_shape = torch.tensor(image.shape[-2:]) - min_size = torch.min(im_shape).to(dtype=torch.float32) - max_size = torch.max(im_shape).to(dtype=torch.float32) - scale = torch.min(self_min_size / min_size, self_max_size / max_size) - - if torchvision._is_tracing(): - scale_factor = _fake_cast_onnx(scale) + if self_min_size == self_max_size: # TODO: Improve this workaround + # Fixed size output. Assume width / height the same. + size = (int(self_min_size), int(self_min_size)) + scale_factor = None + recompute_scale_factor = None else: - scale_factor = scale.item() + min_size = torch.min(im_shape).to(dtype=torch.float32) + max_size = torch.max(im_shape).to(dtype=torch.float32) + scale = torch.min(self_min_size / min_size, self_max_size / max_size) + + size = None + if torchvision._is_tracing(): + scale_factor = _fake_cast_onnx(scale) + else: + scale_factor = scale.item() + recompute_scale_factor = True - image = torch.nn.functional.interpolate( - image[None], scale_factor=scale_factor, mode='bilinear', recompute_scale_factor=True, - align_corners=False)[0] + image = torch.nn.functional.interpolate(image[None], size=size, scale_factor=scale_factor, mode='bilinear', + recompute_scale_factor=recompute_scale_factor, align_corners=False)[0] if target is None: return image, target if "masks" in target: mask = target["masks"] - mask = F.interpolate(mask[:, None].float(), scale_factor=scale_factor, recompute_scale_factor=True)[:, 0].byte() + mask = F.interpolate(mask[:, None].float(), size=size, scale_factor=scale_factor, + recompute_scale_factor=recompute_scale_factor)[:, 0].byte() target["masks"] = mask return image, target From 39abfb4091b6caf05e7e0eddda512dea3e413f0d Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Fri, 9 Apr 2021 16:00:29 +0100 Subject: [PATCH 33/92] Remove unnecessary random calls from the test. --- test/test_models_detection_anchor_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_models_detection_anchor_utils.py b/test/test_models_detection_anchor_utils.py index 4d29744d28a..0c51d0a0ad3 100644 --- a/test/test_models_detection_anchor_utils.py +++ b/test/test_models_detection_anchor_utils.py @@ -68,8 +68,8 @@ def test_anchor_generator(self): self.assertEqual(anchors[1], anchors_output) def test_dbox_generator(self): - images = torch.randn(2, 3, 15, 15) - features = self.get_features(images) + images = torch.zeros(2, 3, 15, 15) + features = [torch.rand(2, 8, 3, 3)] image_shapes = [i.shape[-2:] for i in images] images = ImageList(images, image_shapes) From 0b7eb439ab513b6f5b26c6304532b7b954334b6b Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Fri, 9 Apr 2021 16:06:31 +0100 Subject: [PATCH 34/92] Remove more rand calls from the test. 
--- test/test_models_detection_anchor_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_models_detection_anchor_utils.py b/test/test_models_detection_anchor_utils.py index 0c51d0a0ad3..5dbd0c42765 100644 --- a/test/test_models_detection_anchor_utils.py +++ b/test/test_models_detection_anchor_utils.py @@ -69,7 +69,7 @@ def test_anchor_generator(self): def test_dbox_generator(self): images = torch.zeros(2, 3, 15, 15) - features = [torch.rand(2, 8, 3, 3)] + features = [torch.zeros(2, 8, 3, 3)] image_shapes = [i.shape[-2:] for i in images] images = ImageList(images, image_shapes) From e74b4fe86f2885967d1c1a3a9f1acc2fb1210a25 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Sun, 11 Apr 2021 13:06:21 +0100 Subject: [PATCH 35/92] change mapping and handling of empty labels --- torchvision/models/detection/ssd.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/torchvision/models/detection/ssd.py b/torchvision/models/detection/ssd.py index f5cbe7d227d..3c56e97be91 100644 --- a/torchvision/models/detection/ssd.py +++ b/torchvision/models/detection/ssd.py @@ -92,16 +92,17 @@ def compute_loss(self, targets: List[Dict[str, Tensor]], head_outputs: Dict[str, cls_logits = head_outputs['cls_logits'] for targets_per_image, cls_logits_per_image, matched_idxs_per_image in zip(targets, cls_logits, matched_idxs): - if targets_per_image['labels'].numel() == 0: # TODO: Check this handles empty labels properly - losses.append(torch.zeros((1, ), dtype=cls_logits_per_image.dtype, device=cls_logits_per_image.device)) - continue - gt_classes_target = targets_per_image['labels'][matched_idxs_per_image] - classification_loss = F.cross_entropy(cls_logits_per_image, gt_classes_target, reduce=False) - - # Hard Negative Sampling foreground_idxs_per_image = matched_idxs_per_image >= 0 num_foreground = foreground_idxs_per_image.sum().item() + gt_classes_target = torch.zeros((cls_logits_per_image.size(0), ), dtype=targets_per_image['labels'].dtype, + device=targets_per_image['labels'].device) + gt_classes_target[foreground_idxs_per_image] = \ + targets_per_image['labels'][matched_idxs_per_image[foreground_idxs_per_image]] + classification_loss = F.cross_entropy(cls_logits_per_image, gt_classes_target, + reduce=False) + + # Hard Negative Sampling background_idxs_per_image = torch.logical_not(foreground_idxs_per_image) num_background = matched_idxs_per_image.size(0) - num_foreground num_negative = min(num_background, int(self.neg_to_pos_ratio * num_foreground)) From 3f0c99cb28f1e15e7a192fca838d6aeb2b9b1a5c Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Sun, 11 Apr 2021 13:40:26 +0100 Subject: [PATCH 36/92] Fix JIT warnings. 
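Note: part of keeping the scripted graph quiet is avoiding tensor-to-Python conversions: the foreground count now stays a tensor, and the negative budget is computed with torch.min on tensors rather than Python's min over .item() values. A tiny sketch of that arithmetic, assuming the usual 3:1 negative-to-positive ratio:

import torch

matched_idxs = torch.tensor([2, -1, -1, 0, -1])
foreground = matched_idxs >= 0
num_foreground = foreground.sum()                      # stays a tensor, no .item()
num_background = matched_idxs.numel() - num_foreground
num_negative = torch.min((3 * num_foreground).to(num_background.dtype), num_background)
print(num_foreground, num_negative)
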
--- torchvision/models/detection/ssd.py | 2 +- torchvision/models/detection/transform.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/torchvision/models/detection/ssd.py b/torchvision/models/detection/ssd.py index 3c56e97be91..691e1b10554 100644 --- a/torchvision/models/detection/ssd.py +++ b/torchvision/models/detection/ssd.py @@ -93,7 +93,7 @@ def compute_loss(self, targets: List[Dict[str, Tensor]], head_outputs: Dict[str, for targets_per_image, cls_logits_per_image, matched_idxs_per_image in zip(targets, cls_logits, matched_idxs): foreground_idxs_per_image = matched_idxs_per_image >= 0 - num_foreground = foreground_idxs_per_image.sum().item() + num_foreground = foreground_idxs_per_image.sum() gt_classes_target = torch.zeros((cls_logits_per_image.size(0), ), dtype=targets_per_image['labels'].dtype, device=targets_per_image['labels'].device) diff --git a/torchvision/models/detection/transform.py b/torchvision/models/detection/transform.py index b7058babb38..a0d32de1464 100644 --- a/torchvision/models/detection/transform.py +++ b/torchvision/models/detection/transform.py @@ -30,17 +30,17 @@ def _resize_image_and_masks(image, self_min_size, self_max_size, target): else: im_shape = torch.tensor(image.shape[-2:]) + size: Optional[List[int]] = None + scale_factor: Optional[float] = None + recompute_scale_factor: Optional[bool] = None if self_min_size == self_max_size: # TODO: Improve this workaround # Fixed size output. Assume width / height the same. - size = (int(self_min_size), int(self_min_size)) - scale_factor = None - recompute_scale_factor = None + size = [int(self_min_size), int(self_min_size)] else: min_size = torch.min(im_shape).to(dtype=torch.float32) max_size = torch.max(im_shape).to(dtype=torch.float32) scale = torch.min(self_min_size / min_size, self_max_size / max_size) - size = None if torchvision._is_tracing(): scale_factor = _fake_cast_onnx(scale) else: From eb339401488a9a82b3b771c9e76ddc9890222bc7 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Sun, 11 Apr 2021 15:00:04 +0100 Subject: [PATCH 37/92] Speed up loss. 
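Note: selecting the hardest negatives with topk avoids sorting all background losses; only the k largest entries are needed and the selected set of values is the same. A quick check, where 8732 is the SSD300 default-box count and is used only to make the example concrete:

import torch

losses = torch.rand(8732)   # per-anchor background losses
k = 300

hard_sorted = losses.sort(descending=True)[0][:k]
hard_topk = losses.topk(k, sorted=False)[0]

print(torch.equal(hard_sorted.sort()[0], hard_topk.sort()[0]))  # True: same values selected
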
--- torchvision/models/detection/ssd.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/torchvision/models/detection/ssd.py b/torchvision/models/detection/ssd.py index 691e1b10554..c3b3139d379 100644 --- a/torchvision/models/detection/ssd.py +++ b/torchvision/models/detection/ssd.py @@ -99,16 +99,16 @@ def compute_loss(self, targets: List[Dict[str, Tensor]], head_outputs: Dict[str, device=targets_per_image['labels'].device) gt_classes_target[foreground_idxs_per_image] = \ targets_per_image['labels'][matched_idxs_per_image[foreground_idxs_per_image]] - classification_loss = F.cross_entropy(cls_logits_per_image, gt_classes_target, - reduce=False) + classification_loss = F.cross_entropy(cls_logits_per_image, gt_classes_target, reduce=False) # Hard Negative Sampling background_idxs_per_image = torch.logical_not(foreground_idxs_per_image) num_background = matched_idxs_per_image.size(0) - num_foreground - num_negative = min(num_background, int(self.neg_to_pos_ratio * num_foreground)) + num_negative = torch.min((self.neg_to_pos_ratio * num_foreground).to(dtype=num_background.dtype), + num_background) foreground_loss = classification_loss[foreground_idxs_per_image] - background_loss = classification_loss[background_idxs_per_image].sort(descending=True)[0][:num_negative] + background_loss = classification_loss[background_idxs_per_image].topk(num_negative, sorted=False)[0] losses.append((foreground_loss.sum() + background_loss.sum()) / max(1, num_foreground)) From c880de4489c39d6bb880f2246e82354f3dd55368 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Sun, 11 Apr 2021 15:53:37 +0100 Subject: [PATCH 38/92] Convert 0-1 dboxes to original size. --- torchvision/models/detection/anchor_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchvision/models/detection/anchor_utils.py b/torchvision/models/detection/anchor_utils.py index f93b6850707..264245c96a9 100644 --- a/torchvision/models/detection/anchor_utils.py +++ b/torchvision/models/detection/anchor_utils.py @@ -215,6 +215,6 @@ def forward(self, image_list: ImageList, feature_maps: List[Tensor]) -> List[Ten dtype, device = feature_maps[0].dtype, feature_maps[0].device dboxes = [] for i in range(len(image_list.image_sizes)): - dboxes_in_image = torch.tensor(self._dboxes, dtype=dtype, device=device) + dboxes_in_image = self.size * torch.tensor(self._dboxes, dtype=dtype, device=device) dboxes.append(dboxes_in_image) return dboxes From 0883889dd95aed26b4754bd10bd7a482c6adc8c9 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Sun, 11 Apr 2021 16:18:54 +0100 Subject: [PATCH 39/92] Fix warning. 
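Note: the warning came from the deprecated reduce=False flag; reduction='none' is the current way to get per-element losses that the hard negative mining can then rank. For reference:

import torch
import torch.nn.functional as F

logits = torch.randn(5, 10)
targets = torch.randint(0, 10, (5,))
per_sample = F.cross_entropy(logits, targets, reduction='none')
print(per_sample.shape)  # torch.Size([5])
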
--- torchvision/models/detection/ssd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchvision/models/detection/ssd.py b/torchvision/models/detection/ssd.py index c3b3139d379..0a076e6ef31 100644 --- a/torchvision/models/detection/ssd.py +++ b/torchvision/models/detection/ssd.py @@ -99,7 +99,7 @@ def compute_loss(self, targets: List[Dict[str, Tensor]], head_outputs: Dict[str, device=targets_per_image['labels'].device) gt_classes_target[foreground_idxs_per_image] = \ targets_per_image['labels'][matched_idxs_per_image[foreground_idxs_per_image]] - classification_loss = F.cross_entropy(cls_logits_per_image, gt_classes_target, reduce=False) + classification_loss = F.cross_entropy(cls_logits_per_image, gt_classes_target, reduction='none') # Hard Negative Sampling background_idxs_per_image = torch.logical_not(foreground_idxs_per_image) From 7c56cc857b9a651bd9c9d34709d85e8b8fb3c282 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Sun, 11 Apr 2021 16:23:24 +0100 Subject: [PATCH 40/92] Fix tests. --- .../ModelTester.test_ssd300_vgg16_expect.pkl | Bin 6925 -> 6925 bytes test/test_models_detection_anchor_utils.py | 13 ++++++------- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/test/expect/ModelTester.test_ssd300_vgg16_expect.pkl b/test/expect/ModelTester.test_ssd300_vgg16_expect.pkl index e2fc9c51aa97975bdd7e112c48234b57c593abd9..2a103664663082411f46d22954e246f220ad6f4f 100644 GIT binary patch delta 3630 zcmaJ^2~bqmwr!w6WHQeN1Oz0ALugTE1X|_p)hYt!sWc8ij36o^lQa&1+FS(bZfFD= zG$;l|pA5DWs1b0$AqH`Xi4%$w7y}YS;s7WyycS=*Z~b5OZ`HX~yY4xs_P%@XwbxR( zE8KI-^qgHioLoE{Cp)_59gWogkL5IZ@?@o*hUk;+=IZF=%=Qlb#$xiwNo{fHk!IhA6+ij)WEDw>8tB8?6Z4kkTXY_r|VdK#$=*K zb+>7^#&Bk@c-kEQA-Vx?jUL-*aX2$;s8x{eQf5 zok7Dn!T;y^rP_>cKevF6Q4!s&nE->?89wf36?oLkrEqE&}I; z!dGyameuIPM#~U^+bkiktRUaIB$S%HrQ!n__;_DL&~(N{^0c10);W!mDtY8OoJBmo zBW{xn2g3Q7Gj25e%|xgg;ReAUO>}l9i>YUM=;z9je6ffUTAb0E{hZRGgy7iDr}whU z6icTlHoJqIOFigVML6QNs%TEJ7$xTg$h<1W#sy*E56y>fNhW4jv54hhAqA;dE0PiFGu(~%LUFQ>!l%v3`vXj)DbeV>3&_=IL2es~I5wL+LLAPch z92bwliLD`sx+WtVDT{1B6+NH!15Jx#@o<_JtoF$u?a@NQy)I_`lL;8N$D3~SvEUg8 z(qxkY9P7xU^2^NjA7DLL;Jr9MC%$Ztjn5&ueuW)vUxtOCa8+o^};kt5d%o%~?Z_*RUpi|L3xm5tFFIcRqX zq8Nn>BG%NbP>&8Ex`BZv#D)^G2Hm0 zp=BsQp_Ut(6IuB9xM1}!1#mtk#Gm#;#5T{RyqYK^ugIpm5|)KLOGe5Y1+Z5gARpH> zgbr&b_woYF>eq(Z`WhPhk0`V*(Se1=24qS5sC#W9-8^UqyU3liulGAz)+?gufes>G z7UdR&qskgMu`2^+=0UI($J6*B-WXDUjgFl5M>bPQyUa2$@2hw8{1*izf#>9O>S_b+ z`tbp23OguO$wTLo{mdhK0oLl8Q*_%%T*=o*&IlnqO0A&ZEG3&NxAQS{<|;(K2t-Un3Jhi?Qr))+IF#H-;p!Ir;zkh-U))Dg+dJss zSB4S`g{&Qnc<|m@k+Flae zuce1`4e>?mXdKRMpzlKcu}JcF2U(FA#+&BRo*D(VPEwHSIvJn6Z(}gbehS8d5*D18 zXzbgl9z+%_D$lt54pC$58H3YVhwp}I&K@6w@L zeKgnjFB1M!M6+xw$l%UN@>VrcV~IZWWiHs?;EL&9LY(kq(OZy!{p#6Kc#DPIKUZO{ zg9}O+Ioeu;uvSK++OL;#%w(9MG=%LUH%va)M|NjIB)G&kgaJ1Vi+4)VZdZWzY56dB z62osbA1!MXu(;}tI9tMPt{YXnjKVs*WZE&r3Y-je!M!A;ZaGCa_uZwMvB6+gn4skG zD(rq$Pd_jAWgr}ecBYScQmzl9&Fa+h-0<}q7OF24J_)Y->9McmIAeRenO7BB z6mB1aThVG<|05M=J_zs+aT77xV*J!*g5Fasii_S-fXD#d`F7Y=X9<=56zp|i;p!8H zAT=M@(*2ZK(cl7kT^6-;2vK@|KA8k1W6G!+3f`&4f=fZvBWWVDv9;9N(oY$!0obfB zgXCF#JUnmZAx)(sNoozrFTA6fe$AA-`6*L+v4UB>GKWgSN0RTQBKot&ovx2-r++P7 zONk~8?y*~lFk0itFo&tJ38TfvPK0Grmv82ca>*hJ&;5{DZ@>Ze0 zS^!=|3O=YubK@I16GzfOIi+lGq!TAqBwWeIycd2jT^9)JpF-g<5RZczVoa}BU~xW+ zS9a^kD9eRj3mZt4yMd&ssaQOsf^k-?Qe@YDTUun2)&rIRmPiE>aUvKiqhijLgj;7-sK{L+aM7{yQAG z$5-K_zHxB)b#kf!hm&fc>@u@9!GHD5+GPpim2t5%XxgAOQts9rrt~ql{Uo=Rn|pl{ zQv1yLV+IA#CqXvHUqkbw7rMG1-yF_o>gkR7m{FfeuK&IBqaVsHb5omceRUB3kH%8Q NTG(scP!?Gv{u?6zuJ!-` delta 3579 zcmZ8kd05S9-#?|(G{jKGbdow*I$0{|R8+suC)vupCf|8ZZ}y??L!{@$M@%_+?(eP34xhv9Mu 
zx${UzdB&0OZl79CBkYGekI)R%nf|#gw|5%uB-f~QhUobW?pvfeqT~F}fuk>VrgYJy z>&G8GEMBXtaW~M@dlNR_U*l=u+~fatmyh=tktva@ymc|tUXRnaC1ZW4F%6PJ+-#a- zxi)4fdS6dwL7E2@8}ndxEtpTXl%QyMJN8B!@$YVyOx$P5|JW?VcfJ9fBsqoX%1EXf z$e5ohXHvEo7nXYP;@MH0btpryIj3aB@h+IrnJZ*PsG+yd7j{qcpc0keK;_m5)+cnO zOZ0O79WG*7mq?xoG3Tvk@x1d|Nw-(M*dkYPgQq3+b{lg4o(R^)45RtUcxJa;#m1|q zm~z0780H6Axg!InUJzE&pL=^*VAt(#oaP*b#a(Q<>UAbW+A?+6KEZQO7wnTdsXldF z4%FxVT3t+Yma;u94{WpHL!BnH>$q`KaXw;}w!r2EJOKm zL)%bmp*efEu)lHuM(K43`kn#&u4NS(+@n}bBgT&E#`+F9$E`KxUei;U|5E@D$~v$k zO%+8wXEl27F=73rD*U=l#xF{oaPM6{n)mF)(?Qh;JZgrqQKK2;uEB;z1-*?e85keI z{S)1(=eC?!I-jFwDtVtN+;cdbmFWR|S1UoQWg@57`s0`A**xtS!mRZPOel=y$@L<* zxO;M*c?z0TlV@=9&qg$>@#Ep`az1tq<&oJkj_74U$;%0>X}0Ar$A|IV^I&$jR#21` z%ORP$Fnykn@wE@2f2<0ru1ZE8Z-<#rB5nG0=abp4tg(&4gkDN!i8V+H+K#$M0X*M% zA5$JjQtjM`xVnqN>J($DCYec*Q>Wwwb2;Xhe9tX4i}2!^jPuZlH&Z;h#Onh*w&ua! zUe4cpHX+Q;gJvg`>~*&R9WjY4Fcz`ps{-5%-ypQMwjnE4AL%zTpq^pGwzs#y*HY@F zUch~^5$iUKS=A(E$LtW6bdTkY9%>b{64bn6uOtU|2-2?-S?J`7^DpK6!8w2*3Z`?J zZWJ$uH=&zP2No%fd9&P*^Gae+P!Yjp*~KV4A5Xj0O4i(Sp-xQ#P2YL3Ho5>o>Ask? zuN4Dst^Sl_z61Jl>w-=};iA3eNsaJ*TZ}a)R7QOH@Fu)wKg0>CC*ulxQSY2T!&=2$ zzTg_BO_DLFuobm`crdprf-5e}<&2u;oYXH5ksB@u%Jq#{6=;qobpV5UDPZ3>i9Z#s z!N6@Tn6az{Q!hx^Wty0`^etKM-GP&%-MP`#khQjVgjFjdn5+t(hxG%Nvtw)(qN;~6 z)>og)Wp1>1Q-L7E3~bpU=EF?|FzyIP+dOY{gqL7YLJFEW3K1)$*UzfEDX8SNBpJic2T-L6pzX2m z={PWocTYOPX|xfYL^}}oQp)<}nfUg4G7g@(hRHcQaA1BfI``Wk_%%8*-A4!iZk-C> z{+%kJDD@Dm%0}VGKcjFstx{nAX}GVoM6vgKoc-1jt(T?j?HYqwQSWiiO$#RRQubYD z%FtHrH+yZu-R%jCYm`wHCgTTV2SiWy0Dgep`S!GxMPgErt2x zD(Ky7$Lqh-gew_RZrQ4fvMTVO<4g(p!jXWbotwk`2N*fnpCsV=~#<4QhuSL0agHF(-p<85;*!hZN$ zICsE`yESUqYu7X^3}Dz-quFQ2_w1hI!z*i=pz1tx5PfGE@rFkYzB5*G`s_`ZxlqN9 z4~MY-)O5Fx@EV@KPN;NVE1VePL3?F70#5| zv3#0>;zBh??iuH1wBZCadRfB6pC4e^=>S$uzbwqWm%#0`(88??ix0c-@kk{_)Q%YI4^9 z@4dT`**-+DPJV^_`fGwPs|D--@@D30Q$D$%bg;HVzGw*^w?h<4KMpYXmUj$wmZaVSL1QR_~l zDdQBnKj_4G$D7Cqwc*q`qxrI*4@=)GIPcOSEUb^NBR&-e@XR<%njI^~o;eEczuAdpQ7R>6o9l2n@dZkw4SDHTF?ah~^48J- zRy(;W4YX{H9NC*`5c@&*n{Ps3)=St0!Qbm5rcVk}f;>tWMKHEf^v#elw* z*x)`DZ%Yp%q&62dL9OUkT7tnn73}fMk2CwkF>z-o^(Q=sS*Dm@Xn)G3N5ze0~MKP4!~Wn-8$RYARmOQe$;S0qk9N!YRm*X<{p;M%i-Z zen&2`zKYao0le7ImE|pw^!Y-{;hpa6Q|7@F!+e;!-G<3mjM%i-o8{}IOxm&vZ}wU- zeq^-@W1h-+p!aBQwRw-IJ0aX6GvtZ_7xsVNfbe%pR*wAu>wA%iiD8L4)rTE{6Q0v?&JoNTsXO$YK z8$$VY`e@D%4AD}JDwuDyayura4zDk56vBV)L6a-G*r^pDLtUj@Q|8Z%1-5L?EyUvr zD=t~yf*+SVa@OLDg1PlVTs*l5>*i-7e3O(CPd_&AbmyOEL%7mAA4P?Jbhc~7@0p>D zj8?#Za31!($;YGfr*Ql1WE2`NK)tG=09hd#%+k{EH$!YVezBZ;!rZuV?-XV$73_E# z%d`EJd{s6Lzd1^f{O}--q|88+Vm8wzrVH&cD!Px+!?>%vP@y}Qjo-E-O)JZ5P1A9E zTsH<^zXjzR2@Rv;-3|`EhFA7ELaH`jKEnD(MP&6ql4$b)M;;rceWxM*T6O7mfF@5NR?DEHv(t{-4d$Jc-L^d$e0JbkJve^vPcN zOw!T)7^H#0$66%%M=Y--gDn1Gr2hZRk1=RkB;QzkE&AW%L+37;YiBPX?(|PXx5#ve LPOB!zbjg1KV1Ax& diff --git a/test/test_models_detection_anchor_utils.py b/test/test_models_detection_anchor_utils.py index 5dbd0c42765..8a7e177bbec 100644 --- a/test/test_models_detection_anchor_utils.py +++ b/test/test_models_detection_anchor_utils.py @@ -78,15 +78,14 @@ def test_dbox_generator(self): dboxes = model(images, features) dboxes_output = torch.tensor([ - [0.4650, 0.4650, 0.5350, 0.5350], - [0.4488, 0.4488, 0.5512, 0.5512], - [0.4505, 0.4753, 0.5495, 0.5247], - [0.4753, 0.4505, 0.5247, 0.5495] + [139.5000, 139.5000, 160.5000, 160.5000], + [134.6296, 134.6296, 165.3704, 165.3704], + [135.1508, 142.5754, 164.8492, 157.4246], + [142.5754, 135.1508, 157.4246, 164.8492] ]) - tol = 0.0001 self.assertEqual(len(dboxes), 2) self.assertEqual(tuple(dboxes[0].shape), (4, 4)) self.assertEqual(tuple(dboxes[1].shape), (4, 4)) - 
self.assertTrue(dboxes[0].allclose(dboxes_output, atol=tol)) - self.assertTrue(dboxes[1].allclose(dboxes_output, atol=tol)) + self.assertTrue(dboxes[0].allclose(dboxes_output)) + self.assertTrue(dboxes[1].allclose(dboxes_output)) From 218ca55e9084599659eb309249c4241157b6b98d Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Sun, 11 Apr 2021 18:26:29 +0100 Subject: [PATCH 41/92] Update comments. --- torchvision/models/detection/anchor_utils.py | 2 +- torchvision/models/detection/ssd.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/torchvision/models/detection/anchor_utils.py b/torchvision/models/detection/anchor_utils.py index 264245c96a9..a352e71625b 100644 --- a/torchvision/models/detection/anchor_utils.py +++ b/torchvision/models/detection/anchor_utils.py @@ -165,7 +165,7 @@ class DBoxGenerator(nn.Module): def __init__(self, size: int, feature_map_sizes: List[int], aspect_ratios: List[List[int]], min_ratio: float = 0.15, max_ratio: float = 0.9, clip: bool = False): super().__init__() - self.size = size + self.size = size # TODO: Remove assumption that width == height self.feature_map_sizes = feature_map_sizes self.aspect_ratios = aspect_ratios diff --git a/torchvision/models/detection/ssd.py b/torchvision/models/detection/ssd.py index 0a076e6ef31..2f9828932c0 100644 --- a/torchvision/models/detection/ssd.py +++ b/torchvision/models/detection/ssd.py @@ -234,7 +234,7 @@ def forward(self, x: Tensor) -> Dict[str, Tensor]: def _vgg_extractor(backbone_name: str, highres: bool, pretrained: bool, trainable_layers: int = 3): backbone = vgg.__dict__[backbone_name](pretrained=pretrained).features - # SDD300 case - page 4, Fig 2 of SSD paper + # SSD300 case - page 4, Fig 2 of SSD paper extra = nn.ModuleList([ nn.Sequential( nn.Conv2d(1024, 256, kernel_size=1), @@ -263,7 +263,7 @@ def _vgg_extractor(backbone_name: str, highres: bool, pretrained: bool, trainabl ]) aspect_ratios = [[2], [2, 3], [2, 3], [2, 3], [2], [2]] if highres: - # Additional layers for the SDD512 case. See page 11, footernote 5. + # Additional layers for the SSD512 case. See page 11, footernote 5. extra.append(nn.Sequential( nn.Conv2d(256, 128, kernel_size=1), nn.ReLU(inplace=True), From 36f53f5eeaf57f06130d9756b13baca3ecd145ea Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Mon, 12 Apr 2021 07:52:31 +0100 Subject: [PATCH 42/92] Fixing minor bugs. 
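
This fixes two small issues: _xavier_init now iterates conv.modules() instead of
conv.children(), so Conv2d layers nested inside the nn.Sequential extra blocks are
actually initialized, and the modified maxpool5 is built with ceil_mode=False. The
expected test output pickle is regenerated to match.

For illustration only (not part of the diff), a minimal sketch of the children()
vs modules() distinction behind the first fix:

    from torch import nn

    extra = nn.ModuleList([
        nn.Sequential(nn.Conv2d(8, 16, kernel_size=3), nn.ReLU(inplace=True)),
    ])
    # children() yields only the immediate Sequential wrapper, so the old loop
    # never reached the nested Conv2d and left it with its default initialization.
    print([type(m).__name__ for m in extra.children()])           # ['Sequential']
    # modules() recurses into the wrapper and does reach the Conv2d layer.
    print(any(isinstance(m, nn.Conv2d) for m in extra.modules()))  # True
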
--- .../ModelTester.test_ssd300_vgg16_expect.pkl | Bin 6925 -> 6925 bytes torchvision/models/detection/ssd.py | 4 ++-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/test/expect/ModelTester.test_ssd300_vgg16_expect.pkl b/test/expect/ModelTester.test_ssd300_vgg16_expect.pkl index 2a103664663082411f46d22954e246f220ad6f4f..a15a1f90bab12be94e5eb1a878e440a059651d47 100644 GIT binary patch literal 6925 zcmb_h30RHk+dtYxiwdDdS`ea5Nj&Gd?>d&EWGyO{Qb{>=-uE2D6sgl z;D0qkfsr6AF(NvKjqE=yOcLfWYet+1^aVlRv=p^qv0$FKO|MCsT!ruiS!9w}EkcsO zB|3yMl9;$8-O$9ya3&^hTBsy4J}DtFly%Y-tM}?PNt;VZ;*??2BgGnDQglUe31O14 zU!1yP%}EBF#?fVf(;&xzjsuq6NlP(+h=fFL z$Oh5eh@v&3ReE%G8sO?SP;3~j?NcHVcNpW-*2iR#*vLnBk=R%st*B)p5qAs_cT(1h z3k#2oOA>efQtOWy`6k3q|8qj7Dia#$ILO^a+~rI8u78y8rYLWgsVr|U5nBX^EtxO} ze;;j~;O0wgOdEkf0smGXoS8uHn+MnbuLV^0CxW(rJw#(}4HWJFn-z~p(f(~+KEE~o zySjWHuT!;oKhJ!eXWq{QwW2n`d6n+dT9B|0}nauj*H|`F?nRYs~9f zW7Rxf$NPC(mHBvU%mqRaYg_u}H^zT6Mm)(vO;I`u z1i2`!T8W##&&EDowUzd%7u_+kI2#M6%v9PreHNm~H5(1&OF^M8I&&H#BC^r-g|{-E z^duB*YO`?S{9>wzH%uA^()U^TNe7hrvg{S`7_)G;h*9cY?`B}|r7XEgGH`WngjRI^|fNjw^?4 zW3%x=k)5)yiB}2c+#HIVf7iqnzEN1-V1?BNHn?hAPx^7FJ2qPUM2mKwf(`95@qw+n zGUsSa2^@~HqDMb8K;-j%^w`T_23euS)sDVTKMfCb_R~(g-La2+5w6Mz#Oc{gkg$-vdy z&6Hye-QtdO&t^jVd-imT&3;%rwSb;V?S}WKTGM$>{Za3B0qt&84iT3b9Mq_X*;nLP zZ0Ue&@5u4v@v+Jp_9-(_s*#C{?K73Oi=F}Y56Hwy)>&yY39&eByA>^pnvc5O3Tdpz zFbs?Bi1Wi{Vp2~hw3xq(R<7txd%o!fHaiRGP%kU6tn@_(gN@Le)wdJOSnyA)n?#@ z^}j-4jVG>LX@SQ+1Y*zc+%bRkG#q6VjCZ0_u=Kq)-h8?b?!R$B+lHUv)!P?Bc%FnG zKU}ApzNz^1eH1p_;ZX3)o zHN<#FGw9psieB^N(3s+YKb&K*uJ<^MeJ#f%m$taq?iJj!=!bFtF+%IjJ#c819rju8 zjD-=-xUnf7ujl+qkMzyL?JWtQv%v{>owTKi>xN;Y&0czL@<<##%9dVe3Bfm83h1=K zWgxndhSin&*!}z})Qz;nws$pAZ)n*Dtd)|k>uVlEeGvd3$-RRZV6R^l^C;ixXUpRM(7p{4B zi*DIJ7T1}(V}e~RtlJTcE4vus{`5JR)nZOPEHm(l!#Y-^ReVFTloTA=b2gq_nr)(Zz4-zFhvD|x{V?E4JybP+gu0nV=(le#Ef!he z#v!KAX-FRoZ-~Nm(fSx1laBMo*<-hnI_MH!3frFa!y`rNxW9iYzT?(c&pHzqc>Ca3 zhwj*O`gm+M^1}Kvv+&5SBd{@CgdMs|;wG*{)&T}7ZJ8p!%uC4@IOG~`}{sFu^ z?t#Zj{IGjd5ah%qq3QAMP@4QmXxBdiZG{!obD0>g{5l$hs~J4dHWNlWr=l>%12=aQ z;Zr{!-1sN}H9Ql*Fu?&$eAdBm{lU03Ae$EdI052w17XnfT`+ZJ7#_XQ1|TH~@3nUU zo0~>B#<~D5q*~+2(h^wE*%H;X_CUpSZ%o|%96YP`(3j)wa9L0nSfl2Q@A}w+t9&vF zj^{%c;T*hr_coj!5rUtuHbdJNv3O6<5t}BPgU@mqKHk zxp+WrH@v97PM6IsgOrM7a7)%ea_=H&u9%4(M!o*g5a(9!1!@dSZeNzjrvI_47vdZW9p!Cr5>tp zFhY|PuVB{W7Fu=A9HZ=K!ikx+!sZ4&v|Mw8E>$zfHWj+4x5N%tzU+n_<<7XZcqA6j zvqKY)$vC>n3G;k!3VYAd#3zT#Xzdzf>|9z#^AA`;_nO&YdpjLmMm+_h6O2ROnBby! 
zG911mjXq73s$;QlCkrraS&UuAro);U zA{3O%alCAXa!;+S4#T~hmV&=?5=s~Dgqp?>EI;)C(pN9QSJk>0_u@3%&W^*o?=&&$ zU>eG{`Qe>2IiS%~g3Yrof?M1wI27uDwRTl7@xTa7Yk3ay+F8MhXBsGW9}h=+nBw}1 z!7y{g5S-TZ1DJ=#JA4sHLHewQLwF{{JkD_Gw9J zFAym3Z~p&$Ic5BJ|9>)P%_YXa%^}t6=McNjbI7&Bvx#ZpY{IOcO^l|_Cc{H#leSGV zqFyc|=@(_h;-rjNACi%lT{6-+OGeTXWQ1*ylDRjeB=GSP>z&%$))6( zR7y@KNXdg(DY-gLN-l*K>*p5=7)Np>zv@>j5=atTX%b1;>& z{#N!;iDBx%1l|JVa$^AdMv5XV##t2C2E{jz>)wC4WAhj z*u;?b9O@ex67`B9AL|%0p_(E39NZr>#PA_QQtvZl$32EDy+Zobs3qyu)V#ww^j@L3ozJeiDOBgbG zF+-5U$sDeJ7DGzY7-E*pkg>BFq9f8MEleQ0l}2z=92Z&*~bzdlUeRu}LX3zKbVkR91uG(=LPv{X(Lzk@=t&Mba*eKChc)391YR;|T$o(*Z517l zsF)zp_w7#|xz@1eyi5-#`<@O?EKuuU@_6EyA0~N=13gp|WN&qeOWtnNFhIFezNcWhBatedH!@ zxT1oB!=eOv>$hCL^~ftGdfe|FGX1ba&QA6{dN|12yrpmZ8+|(!y;<@bdUH{3;UjOa z2(t5*HEBBa^gjCtHHkz8?@tb#nMD7Cf$RRe0^hdhLCyXcL|v`}RrvgEBzc$qzm7lq zt-W{gr`r2>`Jbxy-~Ru9t8a^cc%1&*_4uEbPi2Z!;h&P@-MsmC?f$3N%{%$t$@{T> zKK1`yxp(#7rT;(vXXe?*_T}9;d}=(i&X{8%?rYiiMTk81QTA1uI^nU}BbnHO^D8 z_%sDeNL8>w2@1A0PQeaNRj`&*6wE?YF!dM(Gn$}a!-M!gj+$W#w#r|@*cS>`?V(_G zP_Qzof<5f3U?naJcB7|)O?T$`b_#agR>5A{D3~=z&vpv7y^Vrp87f!{jHOmm-VdIP+X&j~hg`_OZxja5Uuzc_cEY2mHH6WU196+jvi8YdEqv^zMqRs$69C zmqg}&PGr76i0s92ky(|BY~?W?9}tqBTGcqHA`e? zGeveRLuA$*`W&U{B9n4#OBLCz*&?&&IH(kv*<_K`O%mC7QDl<@-mY9^M<%~zuR%A&*BKvW)$YMDjj})1PpU58g@b-s_ERSRF7b3gvC9-ZDxr0R3 zXQ0UT3=mlZxSUdv*|~|VqQA&)_Y>I)SCK{a71^a;Tux`+KPQnLw-;HL?jozU71@kC&+YX#=>Kw!se1SY>Pu=`a4i>egZ(w_y!?g;F_ZGrjT z5?Ja@fu&an>~6Whv^kz!6j%mF{sn=pKQFN4A36VNf$icrS|+e|97m1|tQp6lQh{Y3 z6`03ifju}Ru&x}Z4hYP0zre=q6PR6z!2F5@){`S`x4^FK64*eFecJ_AxQ)xjQNC4R z&o>F|$T`*vte`+(ga0M4ma7FOT_rHp`t*8caH9iX$SVBZ`ZW3E`c&<^J*d@R ztk1LCOzFbCw$L`np>svupf@DZt>^x9T(2BN63HTHzY4+8(?ztkG65UbUZEsALtI>9 zgQmr<7&NgT9yX*z4^U?LVrTy-{NNOdft!b;^R5Ev$XX@ThqtBP zU5cdpUguEpCI#j8Ehfo~7c}g85~j#3(XC4sG~-O*V3CZUb9>|aE1A$Ym<1Od8)zgi zgoau_IG?dX^U5SR8hJuil8GhV>uGPaGlgBW!sQ>PlI=@BB-GrOhO~;q#_&GW>`q5S z{+LXQV za&w~FmD#Y|6Gu}P#3SyrM3l|TM!(#26t6NNtdB(A>!sA@i|Lqf*9PTD)39uoHK8olr$)D!ah1Ux(xi>Vpu7`%2jRZKlj&8BOiUb~uZFIU2Ax*=-S z7s7s2C+wUz0+FYZsY8+yX`Y32kL@BBqr^3)0rS<#NUGC7?B$2jDL1>J%Zk2qra=iq zBVX!eoQ>_(skH96JLbR8KwwEEbY2_5xV(tgeC~_e3QeTU3&FN3B?1pE`7Aa{*<{v_Wy}mozrd6~(qSbj!j9VJT)fnCyzXE{V|XpZ-Q) zDxKLaRl??gAzi(i4dc)eRG{yNl%`4)_nC^=ZOIsyuZhZwhM2LX8yX*5p_hI-oqL{y zLZ5Q#cQ_Nm2_>o;12AHHHcYlep_ySB*)3F%fx14-3zxo`UqPpwu{__H{GEIer>+Uf z;c~kBzy?z;G}6ovZTQWJMD%WJBo)SC_cJq?9j+%2J$G1Z>%%HE6m>6>VBnR2jKD1T zdKOWJeT{To7ftM6)SCu6D=~as?3=afe|Z^Ioz}%eK3{8GoG_@L8ML;$!hy@LF(e3O z{#LlU@wN2WsU+-vIfu@@$i$_DdvtA@Cnk#?-XNVZXD^MtY(m${(#UK`C1GhfMdoV4 zIziw*E9B-4UURB5LR94#iVHiVU@Zn_n?`E_224HS4L|JBOZjZlEvbNJ&km1!uBh zUAUgyoMs`gc@_0smyN-VS}>hbM4ekk;`SJAv{jpq)T9Qgog~t^4OXxUUrMX%zooEx zKZ<%?O=Or#^K*i+)dJYLED@${{9!3fq^?c+qRD|%w0W-#Y0?5(W||0}&!5x1v+3|X zm_mC_meR7LS4rKyn&KB2qGrq*>2+&YOlsPOqAJ_rM3ycxTD!w#r8#uV9dPndm9##2 z1bW`;LyPYj(@{@f3ee6W)3`J$SZsr41IHu$fiEVM#zAkeNF}3Vv2j`%1@pDn;!F;; z8r49Ni>qnF=lbw>Zbh|U0Gd6PPynwJ)Kv+c{=sn3%0{3d;$^WWzHV1ai1Jr@EYO~?BH4gF!Cv-TF zPZOF3;#Y$c6n?Fe43;?J>UcB!)WjWm>kKe-%24UGyRP_U%Mk}2N90LUP`)NOy~MT0=Jm z?T}htsGuiWuc<57{U&qUH~O`Exk5U;rVq@c6g2dn4AO%Ec%WyOAn-$Z#_^7;Wwd7`9iy-1R5u2hC(ux$5G0S|-M< zo8#u(lW#i?tv6+Jp$*WxGldQk9{v)(NZWUsg{6amBy&wqeO1bGxGA&Lv5B7 z7L}NxP`4je*(%}W76gCp|L4@+k&Y{ML`q32-KuuS%0pj~vHvvmYhOeGOSujB-k<8e zDJRp;#dP~tBPHJU!ff4S+&wT6z0YT2R$(E16JJCrho93x&kCA9`;K(wk$kBUb7*d# zGU{=pgcf{bg;^N{t+8I1c*_sr1MBFisS?rqEpc~kGBT6;Lf0l3k6c0#_KSd6K?XIQ zZkTTqiwU1`Usti13S@xY%4*uG;fJ2?!Pqg(2rrb02s}UA0 zOQzBp8t}HiPBSZv@tE8DA6GU|eO5L)F4Tus>-+R%W;^ILnjv>dHyExFF#ke6^)=q0qV-5ivk4~BQ|MAThQKq22pk4bt)_QTdnEeCeN?7?j)p6@Td 
zKH3)(*TtdtI4>BiI7u}zfOUy3;tTv>JN}_`*0U7aawC=eGS`zqS69reorXe>x?_$759Me5perx6iNIQuxZf zwBpii7<(8Zo$t3JZ>*y$&K>bn&98KOf)iS0X1ZMv>nUZWooME_cxXIFYQ~XGCl|&zp#8<`~ABWvkz(( zsc}uc_dfISHUGy<@1FucWb)>(KYz{i_Gac|ruX&zAyX~Cru!6=>J0fY)BAc;J@xz% z&Y1qe^xJ58%e1w-{Z_(4!}j;Gs)r|4coz@nictlhHg98IPxaI!S>N_;T9xfB(^9?u N@H$!&Rr$AN{|9F~y9@vT diff --git a/torchvision/models/detection/ssd.py b/torchvision/models/detection/ssd.py index 2f9828932c0..32e4aaed4e8 100644 --- a/torchvision/models/detection/ssd.py +++ b/torchvision/models/detection/ssd.py @@ -23,7 +23,7 @@ def _xavier_init(conv: nn.Module): - for layer in conv.children(): + for layer in conv.modules(): if isinstance(layer, nn.Conv2d): torch.nn.init.xavier_uniform_(layer.weight) torch.nn.init.constant_(layer.bias, 0) @@ -205,7 +205,7 @@ def __init__(self, backbone: nn.Module, extra: nn.ModuleList, aspect_ratios: Lis *backbone[:maxpool4_pos] # until conv4_3 ) fc = nn.Sequential( - nn.MaxPool2d(kernel_size=3, stride=1, padding=1, ceil_mode=True), # add modified maxpool5 + nn.MaxPool2d(kernel_size=3, stride=1, padding=1, ceil_mode=False), # add modified maxpool5 nn.Conv2d(in_channels=512, out_channels=1024, kernel_size=3, padding=6, dilation=6), # FC6 with atrous nn.ReLU(inplace=True), nn.Conv2d(in_channels=1024, out_channels=1024, kernel_size=1), # FC7 From fe953221539433b895b43d7ab353198e43f54069 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Mon, 12 Apr 2021 09:26:30 +0100 Subject: [PATCH 43/92] Introduce a custom DBoxMatcher. --- .../ModelTester.test_ssd300_vgg16_expect.pkl | Bin 6925 -> 6925 bytes torchvision/models/detection/_utils.py | 18 ++++++++++++++++++ torchvision/models/detection/ssd.py | 2 +- 3 files changed, 19 insertions(+), 1 deletion(-) diff --git a/test/expect/ModelTester.test_ssd300_vgg16_expect.pkl b/test/expect/ModelTester.test_ssd300_vgg16_expect.pkl index a15a1f90bab12be94e5eb1a878e440a059651d47..3426b17a988d5aef85c5f0d026043b9f547f962e 100644 GIT binary patch delta 299 zcmeA*>ouFOjK|c%!pP9j%+k=@(rDtQG;S#0!r0JwvNB@`Tpq|bo7~K(#geyR=k&=N z8O;QcRGPUtaWZV)E*i|NBEulYkXV$Qky)0ipORRTsBdUufNY1GlMut^bsQIXVEQdV zCc8NqF-)Gwp}+YrV=kLGKZ7&ejIz|C;>`R!Hz#|rg2@JwMi3{YNm@-dk&>FcOHu^p qNRXq=Cf}0OQZQg;fSZ8L6_Y1OYRL--;Z_9ley)@@ouFOjK|2*(9F=x$impr$YA28G;Rpr*vP=te6liQ2u$A0(9*(Waxf;V$!LaleC&_A|*9>m!t@)n{G*JDFkpZ mz)eU=EJ@TiG%-MS#N-K*TJiy+xD-Krn=7Tw_+j!)sS*HXL`-J@ diff --git a/torchvision/models/detection/_utils.py b/torchvision/models/detection/_utils.py index 24dc9399fd6..4144e1495a1 100644 --- a/torchvision/models/detection/_utils.py +++ b/torchvision/models/detection/_utils.py @@ -1,6 +1,7 @@ import math import torch + from torch import Tensor from typing import List, Tuple @@ -344,6 +345,23 @@ def set_low_quality_matches_(self, matches, all_matches, match_quality_matrix): matches[pred_inds_to_update] = all_matches[pred_inds_to_update] +class DBoxMatcher(Matcher): + + def __init__(self, threshold): + super().__init__(threshold, threshold, allow_low_quality_matches=False) + + def __call__(self, match_quality_matrix): + matches = super().__call__(match_quality_matrix) + + # For each gt, find the prediction with which it has the highest quality + _, highest_quality_pred_foreach_gt = match_quality_matrix.max(dim=1) + matches[highest_quality_pred_foreach_gt] = torch.arange(highest_quality_pred_foreach_gt.size(0), + dtype=torch.int64, + device=highest_quality_pred_foreach_gt.device) + + return matches + + def overwrite_eps(model, eps): """ This method overwrites the default eps values of all the diff --git a/torchvision/models/detection/ssd.py 
b/torchvision/models/detection/ssd.py index 32e4aaed4e8..523bfe4b0f7 100644 --- a/torchvision/models/detection/ssd.py +++ b/torchvision/models/detection/ssd.py @@ -162,7 +162,7 @@ def __init__(self, backbone: SSDFeatureExtractor, size: int, num_classes: int, self.anchor_generator = DBoxGenerator(size, feature_map_sizes, backbone.aspect_ratios) - self.proposal_matcher = det_utils.Matcher(iou_thresh, iou_thresh) + self.proposal_matcher = det_utils.DBoxMatcher(iou_thresh) if image_mean is None: image_mean = [0.485, 0.456, 0.406] From 0342e7ed072d8025ffe679fee398cfdc943cf478 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Mon, 12 Apr 2021 13:38:16 +0100 Subject: [PATCH 44/92] Minor refactoring --- torchvision/models/detection/ssd.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/torchvision/models/detection/ssd.py b/torchvision/models/detection/ssd.py index 523bfe4b0f7..0db8393b321 100644 --- a/torchvision/models/detection/ssd.py +++ b/torchvision/models/detection/ssd.py @@ -15,7 +15,7 @@ from .retinanet import RetinaNet, RetinaNetHead, RetinaNetRegressionHead, _sum # TODO: Refactor to inherit properly -__all__ = ['SSD', 'ssd300_vgg16'] +__all__ = ['SSD', 'SSDFeatureExtractor', 'ssd300_vgg16'] model_urls = { 'ssd300_vgg16_coco': None, # TODO: Add url with weights @@ -232,7 +232,7 @@ def forward(self, x: Tensor) -> Dict[str, Tensor]: return OrderedDict([(str(i), v) for i, v in enumerate(output)]) -def _vgg_extractor(backbone_name: str, highres: bool, pretrained: bool, trainable_layers: int = 3): +def _vgg_extractor(backbone_name: str, highres: bool, pretrained: bool, trainable_layers: int): backbone = vgg.__dict__[backbone_name](pretrained=pretrained).features # SSD300 case - page 4, Fig 2 of SSD paper extra = nn.ModuleList([ @@ -297,7 +297,7 @@ def ssd300_vgg16(pretrained: bool = False, progress: bool = True, num_classes: i # no need to download the backbone if pretrained is set pretrained_backbone = False - backbone = _vgg_extractor("vgg16", False, pretrained_backbone, trainable_layers=trainable_backbone_layers) + backbone = _vgg_extractor("vgg16", False, pretrained_backbone, trainable_backbone_layers) model = SSD(backbone, 300, num_classes, **kwargs) if pretrained: weights_name = 'ssd300_vgg16_coco' From acdcd780d85f1801059e0a19b64b2be0258fe1b0 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Tue, 13 Apr 2021 20:12:47 +0100 Subject: [PATCH 45/92] Move extra layer definition inside feature extractor. 
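
With this change SSDFeatureExtractorVGG builds its own conv8_2 to conv11_2 extra
blocks (plus conv12_2 when highres=True) and derives the aspect ratios itself, so
_vgg_extractor only has to prepare and partially freeze the VGG trunk.

A hypothetical construction sketch, assuming the module layout at this commit
(not part of the diff):

    import torch
    from torchvision.models import vgg
    from torchvision.models.detection.ssd import SSDFeatureExtractorVGG

    trunk = vgg.vgg16(pretrained=False).features
    extractor = SSDFeatureExtractorVGG(trunk, highres=False)  # SSD300 layout
    feats = extractor(torch.rand(1, 3, 300, 300))
    # feats is an OrderedDict of feature maps keyed '0', '1', ... as in forward()
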
--- torchvision/models/detection/ssd.py | 84 +++++++++++++++-------------- 1 file changed, 44 insertions(+), 40 deletions(-) diff --git a/torchvision/models/detection/ssd.py b/torchvision/models/detection/ssd.py index 0db8393b321..fd1798af044 100644 --- a/torchvision/models/detection/ssd.py +++ b/torchvision/models/detection/ssd.py @@ -190,8 +190,12 @@ def _anchors_per_level(self, features: List[Tensor], HWA: int): class SSDFeatureExtractorVGG(SSDFeatureExtractor): - def __init__(self, backbone: nn.Module, extra: nn.ModuleList, aspect_ratios: List[List[int]]): + def __init__(self, backbone: nn.Module, highres: bool): + aspect_ratios = [[2], [2, 3], [2, 3], [2, 3], [2], [2]] + if highres: + aspect_ratios.append([2]) super().__init__(aspect_ratios) + _, _, maxpool3_pos, maxpool4_pos, _ = (i for i, layer in enumerate(backbone) if isinstance(layer, nn.MaxPool2d)) # Patch ceil_mode for maxpool3 to get the same WxH output sizes as the paper @@ -204,6 +208,44 @@ def __init__(self, backbone: nn.Module, extra: nn.ModuleList, aspect_ratios: Lis self.features = nn.Sequential( *backbone[:maxpool4_pos] # until conv4_3 ) + + # SSD300 case - page 4, Fig 2 of SSD paper + extra = nn.ModuleList([ + nn.Sequential( + nn.Conv2d(1024, 256, kernel_size=1), + nn.ReLU(inplace=True), + nn.Conv2d(256, 512, kernel_size=3, padding=1, stride=2), # conv8_2 + nn.ReLU(inplace=True), + ), + nn.Sequential( + nn.Conv2d(512, 128, kernel_size=1), + nn.ReLU(inplace=True), + nn.Conv2d(128, 256, kernel_size=3, padding=1, stride=2), # conv9_2 + nn.ReLU(inplace=True), + ), + nn.Sequential( + nn.Conv2d(256, 128, kernel_size=1), + nn.ReLU(inplace=True), + nn.Conv2d(128, 256, kernel_size=3), # conv10_2 + nn.ReLU(inplace=True), + ), + nn.Sequential( + nn.Conv2d(256, 128, kernel_size=1), + nn.ReLU(inplace=True), + nn.Conv2d(128, 256, kernel_size=3), # conv11_2 + nn.ReLU(inplace=True), + ) + ]) + if highres: + # Additional layers for the SSD512 case. See page 11, footernote 5. + extra.append(nn.Sequential( + nn.Conv2d(256, 128, kernel_size=1), + nn.ReLU(inplace=True), + nn.Conv2d(128, 256, kernel_size=4), # conv12_2 + nn.ReLU(inplace=True), + )) + _xavier_init(extra) + fc = nn.Sequential( nn.MaxPool2d(kernel_size=3, stride=1, padding=1, ceil_mode=False), # add modified maxpool5 nn.Conv2d(in_channels=512, out_channels=1024, kernel_size=3, padding=6, dilation=6), # FC6 with atrous @@ -234,44 +276,6 @@ def forward(self, x: Tensor) -> Dict[str, Tensor]: def _vgg_extractor(backbone_name: str, highres: bool, pretrained: bool, trainable_layers: int): backbone = vgg.__dict__[backbone_name](pretrained=pretrained).features - # SSD300 case - page 4, Fig 2 of SSD paper - extra = nn.ModuleList([ - nn.Sequential( - nn.Conv2d(1024, 256, kernel_size=1), - nn.ReLU(inplace=True), - nn.Conv2d(256, 512, kernel_size=3, padding=1, stride=2), # conv8_2 - nn.ReLU(inplace=True), - ), - nn.Sequential( - nn.Conv2d(512, 128, kernel_size=1), - nn.ReLU(inplace=True), - nn.Conv2d(128, 256, kernel_size=3, padding=1, stride=2), # conv9_2 - nn.ReLU(inplace=True), - ), - nn.Sequential( - nn.Conv2d(256, 128, kernel_size=1), - nn.ReLU(inplace=True), - nn.Conv2d(128, 256, kernel_size=3), # conv10_2 - nn.ReLU(inplace=True), - ), - nn.Sequential( - nn.Conv2d(256, 128, kernel_size=1), - nn.ReLU(inplace=True), - nn.Conv2d(128, 256, kernel_size=3), # conv11_2 - nn.ReLU(inplace=True), - ) - ]) - aspect_ratios = [[2], [2, 3], [2, 3], [2, 3], [2], [2]] - if highres: - # Additional layers for the SSD512 case. See page 11, footernote 5. 
- extra.append(nn.Sequential( - nn.Conv2d(256, 128, kernel_size=1), - nn.ReLU(inplace=True), - nn.Conv2d(128, 256, kernel_size=4), # conv12_2 - nn.ReLU(inplace=True), - )) - aspect_ratios.append([2]) - _xavier_init(extra) # Gather the indices of maxpools. These are the locations of output blocks. stage_indices = [i for i, b in enumerate(backbone) if isinstance(b, nn.MaxPool2d)] @@ -285,7 +289,7 @@ def _vgg_extractor(backbone_name: str, highres: bool, pretrained: bool, trainabl for parameter in b.parameters(): parameter.requires_grad_(False) - return SSDFeatureExtractorVGG(backbone, extra, aspect_ratios) + return SSDFeatureExtractorVGG(backbone, highres) def ssd300_vgg16(pretrained: bool = False, progress: bool = True, num_classes: int = 91, From 6c3b3fa5f359e23edc52b4b587fa7a11448fdcc8 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Tue, 13 Apr 2021 21:43:21 +0100 Subject: [PATCH 46/92] handle no bias on init. --- torchvision/models/detection/ssd.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/torchvision/models/detection/ssd.py b/torchvision/models/detection/ssd.py index fd1798af044..6b7f79273c5 100644 --- a/torchvision/models/detection/ssd.py +++ b/torchvision/models/detection/ssd.py @@ -26,7 +26,8 @@ def _xavier_init(conv: nn.Module): for layer in conv.modules(): if isinstance(layer, nn.Conv2d): torch.nn.init.xavier_uniform_(layer.weight) - torch.nn.init.constant_(layer.bias, 0) + if layer.bias is not None: + torch.nn.init.constant_(layer.bias, 0) class SSDHead(RetinaNetHead): From 9ad0634b6a47da1b89d8b13c3f39c685ed9dbe60 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Wed, 14 Apr 2021 16:17:50 +0100 Subject: [PATCH 47/92] Remove fixed image size limitation --- test/test_models_detection_anchor_utils.py | 14 +++---- torchvision/models/detection/anchor_utils.py | 44 ++++++++++++-------- torchvision/models/detection/ssd.py | 17 ++++---- torchvision/models/detection/transform.py | 41 ++++++++---------- 4 files changed, 59 insertions(+), 57 deletions(-) diff --git a/test/test_models_detection_anchor_utils.py b/test/test_models_detection_anchor_utils.py index 8a7e177bbec..e1abbc0bbd2 100644 --- a/test/test_models_detection_anchor_utils.py +++ b/test/test_models_detection_anchor_utils.py @@ -22,10 +22,8 @@ def _init_test_anchor_generator(self): return anchor_generator def _init_test_dbox_generator(self): - size = 300 - feature_map_sizes = [1] aspect_ratios = [[2]] - dbox_generator = DBoxGenerator(size, feature_map_sizes, aspect_ratios) + dbox_generator = DBoxGenerator(aspect_ratios) return dbox_generator @@ -69,7 +67,7 @@ def test_anchor_generator(self): def test_dbox_generator(self): images = torch.zeros(2, 3, 15, 15) - features = [torch.zeros(2, 8, 3, 3)] + features = [torch.zeros(2, 8, 1, 1)] image_shapes = [i.shape[-2:] for i in images] images = ImageList(images, image_shapes) @@ -78,10 +76,10 @@ def test_dbox_generator(self): dboxes = model(images, features) dboxes_output = torch.tensor([ - [139.5000, 139.5000, 160.5000, 160.5000], - [134.6296, 134.6296, 165.3704, 165.3704], - [135.1508, 142.5754, 164.8492, 157.4246], - [142.5754, 135.1508, 157.4246, 164.8492] + [6.9750, 6.9750, 8.0250, 8.0250], + [6.7315, 6.7315, 8.2685, 8.2685], + [6.7575, 7.1288, 8.2425, 7.8712], + [7.1288, 6.7575, 7.8712, 8.2425] ]) self.assertEqual(len(dboxes), 2) diff --git a/torchvision/models/detection/anchor_utils.py b/torchvision/models/detection/anchor_utils.py index a352e71625b..8f37a0dea02 100644 --- a/torchvision/models/detection/anchor_utils.py +++ 
b/torchvision/models/detection/anchor_utils.py @@ -162,28 +162,26 @@ def forward(self, image_list: ImageList, feature_maps: List[Tensor]) -> List[Ten class DBoxGenerator(nn.Module): - def __init__(self, size: int, feature_map_sizes: List[int], aspect_ratios: List[List[int]], - min_ratio: float = 0.15, max_ratio: float = 0.9, clip: bool = False): + def __init__(self, aspect_ratios: List[List[int]], min_ratio: float = 0.15, max_ratio: float = 0.9, + clip: bool = False): super().__init__() - self.size = size # TODO: Remove assumption that width == height - self.feature_map_sizes = feature_map_sizes self.aspect_ratios = aspect_ratios + num_outputs = len(aspect_ratios) # Estimation of default boxes scales # Inspired from https://github.com/weiliu89/caffe/blob/ssd/examples/ssd/ssd_pascal.py#L311-L317 min_centile = int(100 * min_ratio) max_centile = int(100 * max_ratio) conv4_centile = min_centile // 2 # assume half of min_ratio as in paper - step = (max_centile - min_centile) // (len(feature_map_sizes) - 2) + step = (max_centile - min_centile) // (num_outputs - 2) centiles = [conv4_centile, min_centile] for c in range(min_centile, max_centile + 1, step): centiles.append(c + step) self.scales = [c / 100 for c in centiles] - # Default Boxes pre-calculation based on page 6 of SSD paper + self._wh_pairs = [] clamp01 = (lambda x: max(min(x, 1.0), 0.0)) if clip else (lambda x: x) - self._dboxes = [] - for k, f_k in enumerate(self.feature_map_sizes): + for k in range(num_outputs): # Adding the 2 default width-height pairs for aspect ratio 1 and scale s'k s_k = clamp01(self.scales[k]) s_prime_k = clamp01(math.sqrt(self.scales[k] * self.scales[k + 1])) @@ -196,25 +194,35 @@ def __init__(self, size: int, feature_map_sizes: List[int], aspect_ratios: List[ h = clamp01(self.scales[k] / sq_ar) wh_pairs.extend([(w, h), (h, w)]) - # Now add the default boxes for each width-height pair - for i, j in itertools.product(range(f_k), repeat=2): - cx = (i + 0.5) / f_k - cy = (j + 0.5) / f_k - self._dboxes.extend([cx - 0.5 * w, cy - 0.5 * h, cx + 0.5 * w, cy + 0.5 * h] for w, h in wh_pairs) + self._wh_pairs.append(wh_pairs) def __repr__(self) -> str: s = self.__class__.__name__ + '(' - s += 'size={size}' - s += ', feature_map_sizes={feature_map_sizes}' - s += ', aspect_ratios={aspect_ratios}' + s += 'aspect_ratios={aspect_ratios}' s += ', scales={scales}' s += ')' return s.format(**self.__dict__) def forward(self, image_list: ImageList, feature_maps: List[Tensor]) -> List[Tensor]: + grid_sizes = [feature_map.shape[-2:] for feature_map in feature_maps] + image_size = image_list.tensors.shape[-2:] dtype, device = feature_maps[0].dtype, feature_maps[0].device + + # Default Boxes calculation based on page 6 of SSD paper + default_boxes: List[List[float]] = [] + for k, f_k in enumerate(grid_sizes): + # Now add the default boxes for each width-height pair + for i in range(f_k[1]): + cx = (i + 0.5) / f_k[1] + for j in range(f_k[0]): + cy = (j + 0.5) / f_k[0] + default_boxes.extend([[cx - 0.5 * w, cy - 0.5 * h, cx + 0.5 * w, cy + 0.5 * h] + for w, h in self._wh_pairs[k]]) + dboxes = [] - for i in range(len(image_list.image_sizes)): - dboxes_in_image = self.size * torch.tensor(self._dboxes, dtype=dtype, device=device) + for _ in image_list.image_sizes: + dboxes_in_image = torch.tensor(default_boxes, dtype=dtype, device=device) + dboxes_in_image[:, 0::2] *= image_size[1] + dboxes_in_image[:, 1::2] *= image_size[0] dboxes.append(dboxes_in_image) return dboxes diff --git a/torchvision/models/detection/ssd.py 
b/torchvision/models/detection/ssd.py index 6b7f79273c5..320d4830b85 100644 --- a/torchvision/models/detection/ssd.py +++ b/torchvision/models/detection/ssd.py @@ -134,7 +134,8 @@ def __init__(self, aspect_ratios: List[List[int]]): class SSD(RetinaNet): - def __init__(self, backbone: SSDFeatureExtractor, size: int, num_classes: int, + def __init__(self, backbone: SSDFeatureExtractor, num_classes: int, + min_size: int = 300, max_size: int = 600, image_mean: Optional[List[float]] = None, image_std: Optional[List[float]] = None, score_thresh: float = 0.01, nms_thresh: float = 0.45, @@ -146,12 +147,11 @@ def __init__(self, backbone: SSDFeatureExtractor, size: int, num_classes: int, # Use dummy data to retrieve the feature map sizes to avoid hard-coding their values device = next(backbone.parameters()).device - tmp_img = torch.empty((1, 3, size, size), device=device) + tmp_img = torch.empty((1, 3, min_size, min_size), device=device) tmp_sizes = [x.size() for x in backbone(tmp_img).values()] out_channels = [x[1] for x in tmp_sizes] - feature_map_sizes = [x[2] for x in tmp_sizes] - assert len(feature_map_sizes) == len(backbone.aspect_ratios) + assert len(out_channels) == len(backbone.aspect_ratios) self.backbone = backbone @@ -161,7 +161,7 @@ def __init__(self, backbone: SSDFeatureExtractor, size: int, num_classes: int, self.num_anchors = [2 + 2 * len(r) for r in backbone.aspect_ratios] self.head = SSDHead(out_channels, self.num_anchors, num_classes, positive_fraction, self.box_coder) - self.anchor_generator = DBoxGenerator(size, feature_map_sizes, backbone.aspect_ratios) + self.anchor_generator = DBoxGenerator(backbone.aspect_ratios) self.proposal_matcher = det_utils.DBoxMatcher(iou_thresh) @@ -169,8 +169,9 @@ def __init__(self, backbone: SSDFeatureExtractor, size: int, num_classes: int, image_mean = [0.485, 0.456, 0.406] if image_std is None: image_std = [0.229, 0.224, 0.225] - self.transform = GeneralizedRCNNTransform(size, size, image_mean, image_std, - size_divisible=1) # TODO: Discuss/refactor this workaround + self.transform = GeneralizedRCNNTransform(min_size, max_size, image_mean, image_std, + # TODO: Discuss/refactor these workarounds + size_divisible=1, exceed_max_size=True) self.score_thresh = score_thresh self.nms_thresh = nms_thresh @@ -303,7 +304,7 @@ def ssd300_vgg16(pretrained: bool = False, progress: bool = True, num_classes: i pretrained_backbone = False backbone = _vgg_extractor("vgg16", False, pretrained_backbone, trainable_backbone_layers) - model = SSD(backbone, 300, num_classes, **kwargs) + model = SSD(backbone, num_classes, **kwargs) if pretrained: weights_name = 'ssd300_vgg16_coco' if model_urls.get(weights_name, None) is None: diff --git a/torchvision/models/detection/transform.py b/torchvision/models/detection/transform.py index a0d32de1464..0b616041e39 100644 --- a/torchvision/models/detection/transform.py +++ b/torchvision/models/detection/transform.py @@ -23,40 +23,34 @@ def _fake_cast_onnx(v): return v -def _resize_image_and_masks(image, self_min_size, self_max_size, target): - # type: (Tensor, float, float, Optional[Dict[str, Tensor]]) -> Tuple[Tensor, Optional[Dict[str, Tensor]]] +def _resize_image_and_masks(image, self_min_size, self_max_size, target, exceed_max_size): + # type: (Tensor, float, float, Optional[Dict[str, Tensor]], bool) -> Tuple[Tensor, Optional[Dict[str, Tensor]]] if torchvision._is_tracing(): im_shape = _get_shape_onnx(image) else: im_shape = torch.tensor(image.shape[-2:]) - size: Optional[List[int]] = None - scale_factor: Optional[float] = 
None - recompute_scale_factor: Optional[bool] = None - if self_min_size == self_max_size: # TODO: Improve this workaround - # Fixed size output. Assume width / height the same. - size = [int(self_min_size), int(self_min_size)] - else: - min_size = torch.min(im_shape).to(dtype=torch.float32) - max_size = torch.max(im_shape).to(dtype=torch.float32) - scale = torch.min(self_min_size / min_size, self_max_size / max_size) + min_size = torch.min(im_shape).to(dtype=torch.float32) + max_size = torch.max(im_shape).to(dtype=torch.float32) + scale = self_min_size / min_size + if not exceed_max_size: + scale = torch.min(scale, self_max_size / max_size) - if torchvision._is_tracing(): - scale_factor = _fake_cast_onnx(scale) - else: - scale_factor = scale.item() - recompute_scale_factor = True + if torchvision._is_tracing(): + scale_factor = _fake_cast_onnx(scale) + else: + scale_factor = scale.item() - image = torch.nn.functional.interpolate(image[None], size=size, scale_factor=scale_factor, mode='bilinear', - recompute_scale_factor=recompute_scale_factor, align_corners=False)[0] + image = torch.nn.functional.interpolate( + image[None], scale_factor=scale_factor, mode='bilinear', recompute_scale_factor=True, + align_corners=False)[0] if target is None: return image, target if "masks" in target: mask = target["masks"] - mask = F.interpolate(mask[:, None].float(), size=size, scale_factor=scale_factor, - recompute_scale_factor=recompute_scale_factor)[:, 0].byte() + mask = F.interpolate(mask[:, None].float(), scale_factor=scale_factor, recompute_scale_factor=True)[:, 0].byte() target["masks"] = mask return image, target @@ -73,7 +67,7 @@ class GeneralizedRCNNTransform(nn.Module): It returns a ImageList for the inputs, and a List[Dict[Tensor]] for the targets """ - def __init__(self, min_size, max_size, image_mean, image_std, size_divisible=32): + def __init__(self, min_size, max_size, image_mean, image_std, size_divisible=32, exceed_max_size=False): super(GeneralizedRCNNTransform, self).__init__() if not isinstance(min_size, (list, tuple)): min_size = (min_size,) @@ -82,6 +76,7 @@ def __init__(self, min_size, max_size, image_mean, image_std, size_divisible=32) self.image_mean = image_mean self.image_std = image_std self.size_divisible = size_divisible + self.exceed_max_size = exceed_max_size def forward(self, images, # type: List[Tensor] @@ -153,7 +148,7 @@ def resize(self, image, target): else: # FIXME assume for now that testing uses the largest scale size = float(self.min_size[-1]) - image, target = _resize_image_and_masks(image, size, float(self.max_size), target) + image, target = _resize_image_and_masks(image, size, float(self.max_size), target, self.exceed_max_size) if target is None: return image, target From 5a00a0c6f30ba5acfc314df45a59244c683f32b7 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Wed, 14 Apr 2021 17:14:51 +0100 Subject: [PATCH 48/92] Change initialization values for bias of classification head. 
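
The classification head keeps the Xavier weight initialization, but the conv biases
are now seeded with -log((1 - p) / p) for p = prior_probability = 0.01, the same
prior-probability formula used for the RetinaNet classification head.

The arithmetic, as a quick sketch (not part of the diff):

    import math

    prior_probability = 0.01
    bias = -math.log((1 - prior_probability) / prior_probability)
    print(bias)                           # about -4.595
    # Under a sigmoid, this bias corresponds to an initial positive probability
    # equal to the prior: 1 / (1 + exp(4.595...)) == 0.01.
    print(1.0 / (1.0 + math.exp(-bias)))  # 0.01
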
--- torchvision/models/detection/ssd.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/torchvision/models/detection/ssd.py b/torchvision/models/detection/ssd.py index 320d4830b85..53e003e6fd6 100644 --- a/torchvision/models/detection/ssd.py +++ b/torchvision/models/detection/ssd.py @@ -1,3 +1,4 @@ +import math import torch import torch.nn.functional as F @@ -22,12 +23,12 @@ } -def _xavier_init(conv: nn.Module): +def _xavier_init(conv: nn.Module, bias_value: float = 0.0): for layer in conv.modules(): if isinstance(layer, nn.Conv2d): torch.nn.init.xavier_uniform_(layer.weight) if layer.bias is not None: - torch.nn.init.constant_(layer.bias, 0) + torch.nn.init.constant_(layer.bias, bias_value) class SSDHead(RetinaNetHead): @@ -78,11 +79,12 @@ def forward(self, x: List[Tensor]) -> Tensor: class SSDClassificationHead(SSDScoringHead): - def __init__(self, in_channels: List[int], num_anchors: List[int], num_classes: int, positive_fraction: float): + def __init__(self, in_channels: List[int], num_anchors: List[int], num_classes: int, positive_fraction: float, + prior_probability: float = 0.01): cls_logits = nn.ModuleList() for channels, anchors in zip(in_channels, num_anchors): cls_logits.append(nn.Conv2d(channels, num_classes * anchors, kernel_size=3, padding=1)) - _xavier_init(cls_logits) + _xavier_init(cls_logits, -math.log((1 - prior_probability) / prior_probability)) super().__init__(cls_logits, num_classes) self.neg_to_pos_ratio = (1.0 - positive_fraction) / positive_fraction From 0347c3687c6c4696719b3a58ac4496546ec1c9fe Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Wed, 14 Apr 2021 20:06:43 +0100 Subject: [PATCH 49/92] Refactoring and update test file. --- .../ModelTester.test_ssd300_vgg16_expect.pkl | Bin 6925 -> 6925 bytes torchvision/models/detection/ssd.py | 9 ++++----- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/test/expect/ModelTester.test_ssd300_vgg16_expect.pkl b/test/expect/ModelTester.test_ssd300_vgg16_expect.pkl index 3426b17a988d5aef85c5f0d026043b9f547f962e..9ae692b387bdfe7781ce3d6c624f9366e865a83a 100644 GIT binary patch delta 1052 zcmZ9LeN5B^9L5iKC=i>9XL{s}USx#^$BPGdzvFXvzdP=QLtgKAF$8%B1+E#AlG|A@ zZ2=105=BV~QzwBH4PqIU&L9=VVav=-mf@5^q>fq3A~Rd8@Amxh-Sg~u{`%A_>y-^h ze3jwKa7Ad;>Zr)5h{lf#|Ng^Nkr67TLZNU!?~&s3I1mxK+Wm&dDrw|Yp4JkqLTa$QTg@Cr zHG_NJM0R^SUo3X<*~v}pA9K<;R8HwYDf5?Yp!re>uQ*G% zv)k-U_}KAoPN6j|g;Gljr$>@`Iwu)NQZkJ{CE=5uMEb5oe996rS0!*LB7raO#8Z_U z&ylQnPA`sUta1(Bi`Jm-i{qFWhm<(_|FALqt&O%38>%52J#jX4W*e@)Ut;M!9LsNY zu?P>We0twXUauAZYAau*TWN^3BDY$3d(?u*ZVQ1~7CIyr6ys)&es5;sPBXy?W;Rco zus4{v8Ej&&%*3v_CZ4$%Ls)kV(vlcdh8U^`jOgl(OjQ^;_o|VajYb!(u|{HkHZWOW z;JV5{QlNpeLn7~=6}iV(_$(5cnJ1Duq-XtkJ!PGGZZzvr*66wJ zr{}1Lp02Ap{Q4iNOh=1VM`f^%{FiiWmFf85S1m0!w75sLj301m8F)*J`k^l5X<7cF z7Oz7P~YIC2A6`s*n$=xb}&PMB6;CbHZy&-7k157yQRqxY8v0 zNxIAZw#{`1`N`%#b{^qr62qG0;oaLkkNdjEmu~q7#X`AS`l#oyJm`P$`UknC{<7Hs jUy-j07=A(`$=Q@wTvAyOlAlwN6A~Jsn7!gI_doa-(hqzJ delta 1052 zcmYk*drXye9LI4$ym5<*C>+g&F)c>WImi(zdoRQz>o(LZwF{tdLMkrq`Tl;-f!cv$ zMkXelt+u8nWMZ|-V$fw;nNp-i>#{UThZAZ;a!$!Hlk6vTww~>^XV35Z-S_u=|M)b! 
zo88;?L}lmXxSURRu5(eYtK|c4@H;2VndJ_ohOUeVHe@*i4Wa4bmAgKECGdWz+yCGB zM58BfddS+~^FnnU38S?P?xZqq(_D0EEMvj-8kMLCi_=twYi5oq+Ak>zhZI$VioM4ayACOO_bXJl!r7%5cuVm} zyW&Wj;>DL0J6jdMZdKfDBsD2M+obR|n!5ivMdh;!8Wf$+D4y4f%o;`aQ;O41Dr&q6 zN3r6LB86*>;`8N-NghR7zM{-+>Mq6h3^Q!LBEz_nqWCsR5uTvviB|kLMbTpPhMC?F z#R}v6RbkDT5M}uPxFC4X30Fph2M2{1BWFN}Jt;Kw3mrcTFBy412>0~~tBwk9^a#ri z3!Xzl_NT&;PlWs~Vd{Hk&R$`4hfudih$=lt8jm_u*5j9Mf`MZ z7LGLvbDkDf`h;j(ShrDd8jGrhxF>}44Z^{4Vf1lfy;pd*; ztAw@(gv1rX!a`y2ZlR?>_|hYE8gWa7JC_LkcbO%Ow8ehm#O=aSqdZrr&lVm^7am?9 zEKU`U&oh^$2;*~wq1nP!hp=y^*(g!?Fjlw}W41DGF-D>U+Zc#6wFseRnviPz9cF%Z z+~OT$w&D28BKNYz(myS7mW^8Myl4@9!J^`v#n};y<;L#cP2+DCbA~L&&sd}zg{Lec zPFi%Gu&6$6hW%vD?wLAmc=h3r0yj-|#|ATYErg!D_>j(wsQpoa;m8@+1uNI$=*e@n zCTDflPp+5}7*4YPjiNXCw(yBLyOYy`Sv8pq%yXm%!W}b$d7j93pg%e7hS>O!kdlq- Y%QvX9%+eC8B-6R@*8krUsC2yc58bqw$p8QV diff --git a/torchvision/models/detection/ssd.py b/torchvision/models/detection/ssd.py index 53e003e6fd6..05727978917 100644 --- a/torchvision/models/detection/ssd.py +++ b/torchvision/models/detection/ssd.py @@ -136,8 +136,7 @@ def __init__(self, aspect_ratios: List[List[int]]): class SSD(RetinaNet): - def __init__(self, backbone: SSDFeatureExtractor, num_classes: int, - min_size: int = 300, max_size: int = 600, + def __init__(self, backbone: SSDFeatureExtractor, size: int, num_classes: int, image_mean: Optional[List[float]] = None, image_std: Optional[List[float]] = None, score_thresh: float = 0.01, nms_thresh: float = 0.45, @@ -149,7 +148,7 @@ def __init__(self, backbone: SSDFeatureExtractor, num_classes: int, # Use dummy data to retrieve the feature map sizes to avoid hard-coding their values device = next(backbone.parameters()).device - tmp_img = torch.empty((1, 3, min_size, min_size), device=device) + tmp_img = torch.empty((1, 3, size, size), device=device) tmp_sizes = [x.size() for x in backbone(tmp_img).values()] out_channels = [x[1] for x in tmp_sizes] @@ -171,7 +170,7 @@ def __init__(self, backbone: SSDFeatureExtractor, num_classes: int, image_mean = [0.485, 0.456, 0.406] if image_std is None: image_std = [0.229, 0.224, 0.225] - self.transform = GeneralizedRCNNTransform(min_size, max_size, image_mean, image_std, + self.transform = GeneralizedRCNNTransform(size, size, image_mean, image_std, # TODO: Discuss/refactor these workarounds size_divisible=1, exceed_max_size=True) @@ -306,7 +305,7 @@ def ssd300_vgg16(pretrained: bool = False, progress: bool = True, num_classes: i pretrained_backbone = False backbone = _vgg_extractor("vgg16", False, pretrained_backbone, trainable_backbone_layers) - model = SSD(backbone, num_classes, **kwargs) + model = SSD(backbone, 300, num_classes, **kwargs) if pretrained: weights_name = 'ssd300_vgg16_coco' if model_urls.get(weights_name, None) is None: From 5661ac7b15872e63d046849478eff4dc49b31dd2 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Wed, 14 Apr 2021 20:19:12 +0100 Subject: [PATCH 50/92] Adding ResNet backbone. 
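
This adds SSDFeatureExtractorResNet (a ResNet trunk with the strides of its last
block patched to 1, followed by five extra convolutional blocks), a _resnet_extractor
helper that freezes the non-trainable layers, and a new ssd512_resnet50 builder
mirroring ssd300_vgg16, together with an expected-output file for its test.

A hypothetical usage sketch (pretrained detection weights are not available yet,
so everything is built from scratch):

    import torch
    from torchvision.models.detection.ssd import ssd512_resnet50

    model = ssd512_resnet50(pretrained=False, pretrained_backbone=False)
    model.eval()
    with torch.no_grad():
        detections = model([torch.rand(3, 512, 512)])
    # expected: one dict per input image with 'boxes', 'scores' and 'labels'
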
--- ...odelTester.test_ssd512_resnet50_expect.pkl | Bin 0 -> 6925 bytes test/test_models.py | 1 + torchvision/models/detection/ssd.py | 119 +++++++++++++++++- 3 files changed, 117 insertions(+), 3 deletions(-) create mode 100644 test/expect/ModelTester.test_ssd512_resnet50_expect.pkl diff --git a/test/expect/ModelTester.test_ssd512_resnet50_expect.pkl b/test/expect/ModelTester.test_ssd512_resnet50_expect.pkl new file mode 100644 index 0000000000000000000000000000000000000000..248024a7bd9d7d9a1b73aa8b63a23ee9f52a3b31 GIT binary patch literal 6925 zcmeI1OK)366vuDgiTg;KrtepHHGx8sI*%5yXt7Z?mzFdYC?L_ub*v^})g9X@h*BvM zYE@vz!m2<*z?KaX6p3%Zjv^l405%AAED_wfGe6()$=FWIwj*7gJ9EzYpV!P>&v-uU z>JEbb{$NAf5ex;na(-p?T47>2SIv!IeX}^1*&e*s(AGO&SHTU{F}BzfAzUiGU8tze ze6 zUE>mabG%FO`=I<-+m{tNCh5ZFzD*ZLO+p3u^mfuXZ3`Dn~94u0$TK zbgu}DGqW=%Po&k3mA>%%s@i!z90-SRs-ZA-Q|-FFqOpdnYWITL6Jr%~mkPy-+H0_E zL@$=ET&@SSuQ8z6*>pNRt@a!E0~Y?EhClRv41c()jx4C7Yq|0Hu&@8)Z!Y}2+7Sf$ znQvOSL&3I&!cEi@Xu_Gx3?xMig7wo6pNBbkI0yW5Krg%JuRr(BNVMnZr`59C(`4D1 zWj>b~N|s%H58V~*LAo6U!Mj z;U9Iu4*5cl{DO}k7N$Ngm=93$#SZ}dlJOzWyg`mS;Kwm*5dXm;F8pEMQ9tk)CruU)ZId%y;U}egZyq z03ScFL!8u+c|jiN6A%89Kl~t1*k#?bPsn+J{p~g&sPrFP{2>42g>`_x%y;6UZq$K% zp+{c8*WahlFHU`SCIZq=tMAinlkb!HT|Jjcd%jQg4^@+wJL~UdgP@lE=j)GRYSL4S z-gT>GM{ZW*@zJv%*7<+DdoJc5zi}@6)wi$Majj$Ye*0u^{BCFv{Q2vySpLrMhHB)| zzozQ)F5cGTcQfkxboQ51cQapnFdge@dD9I34}Z_Z<3IgtC|vB{5B0^j(O+Y?8Lo`i zjYs?T*0G7$U#RP^j92!xY^!*E`FH(n$2Y#n`0!AU({HRF(%6;xkok1^{`m)DEi3uH zddYmFjXy5mmtU8k=;}8Uugmx8Cpt!+Iz#W5->Tjk>;Lfm^8pWQ-t`Pu=fkhhym&A= zwD?B6KA_~iImILFwd*4XZIs8ZrgDDCzSflU88rNr{C4x*wcl>OOTMv9T>kx9`sdE) zL|(%n~#ql z4^HtIesi*3YN!&VR|@oPDCCevQ<{mv~0AxxDmW z_vL<{)V`M0=bwwGubx?t4odoIe*PH@vi0-NwL-ZP{Wma|IsE`fo`#dqR7!jBK)N4mHO2m-Enps(69{DQqR ajP&B$hA-`1fM(B!J@LuAC(_pPM)p5#+XD*# literal 0 HcmV?d00001 diff --git a/test/test_models.py b/test/test_models.py index 41fb8f7ca8c..7f8fd1d4fcd 100644 --- a/test/test_models.py +++ b/test/test_models.py @@ -43,6 +43,7 @@ def get_available_video_models(): "keypointrcnn_resnet50_fpn": lambda x: x[1], "retinanet_resnet50_fpn": lambda x: x[1], "ssd300_vgg16": lambda x: x[1], + "ssd512_resnet50": lambda x: x[1], } diff --git a/torchvision/models/detection/ssd.py b/torchvision/models/detection/ssd.py index 05727978917..ddbf8af3a5d 100644 --- a/torchvision/models/detection/ssd.py +++ b/torchvision/models/detection/ssd.py @@ -10,16 +10,17 @@ from .anchor_utils import DBoxGenerator from .backbone_utils import _validate_trainable_layers from .transform import GeneralizedRCNNTransform -from .. import vgg +from .. 
import vgg, resnet from ..utils import load_state_dict_from_url from .retinanet import RetinaNet, RetinaNetHead, RetinaNetRegressionHead, _sum # TODO: Refactor to inherit properly -__all__ = ['SSD', 'SSDFeatureExtractor', 'ssd300_vgg16'] +__all__ = ['SSD', 'SSDFeatureExtractor', 'ssd300_vgg16', 'ssd512_resnet50'] model_urls = { 'ssd300_vgg16_coco': None, # TODO: Add url with weights + 'ssd512_resnet50_coco': None, } @@ -148,7 +149,7 @@ def __init__(self, backbone: SSDFeatureExtractor, size: int, num_classes: int, # Use dummy data to retrieve the feature map sizes to avoid hard-coding their values device = next(backbone.parameters()).device - tmp_img = torch.empty((1, 3, size, size), device=device) + tmp_img = torch.zeros((1, 3, size, size), device=device) tmp_sizes = [x.size() for x in backbone(tmp_img).values()] out_channels = [x[1] for x in tmp_sizes] @@ -313,3 +314,115 @@ def ssd300_vgg16(pretrained: bool = False, progress: bool = True, num_classes: i state_dict = load_state_dict_from_url(model_urls[weights_name], progress=progress) model.load_state_dict(state_dict) return model + + +class SSDFeatureExtractorResNet(SSDFeatureExtractor): + def __init__(self, backbone: resnet.ResNet): + aspect_ratios = [[2], [2, 3], [2, 3], [2, 3], [2], [2]] + super().__init__(aspect_ratios) + + self.features = nn.Sequential( + backbone.conv1, + backbone.bn1, + backbone.relu, + backbone.maxpool, + backbone.layer1, + backbone.layer2, + backbone.layer3, + backbone.layer4, + ) + + # Patch last block's strides to get valid output sizes + for m in self.features[-1][0].modules(): + if hasattr(m, 'stride'): + m.stride = 1 + + backbone_out_channels = self.features[-1][-1].bn3.num_features + extra = nn.ModuleList([ + nn.Sequential( + nn.Conv2d(backbone_out_channels, 256, kernel_size=1, bias=False), + nn.BatchNorm2d(256), + nn.ReLU(inplace=True), + nn.Conv2d(256, 512, kernel_size=3, padding=1, stride=2, bias=False), + nn.BatchNorm2d(512), + nn.ReLU(inplace=True), + ), + nn.Sequential( + nn.Conv2d(512, 256, kernel_size=1, bias=False), + nn.BatchNorm2d(256), + nn.ReLU(inplace=True), + nn.Conv2d(256, 512, kernel_size=3, padding=1, stride=2, bias=False), + nn.BatchNorm2d(512), + nn.ReLU(inplace=True), + ), + nn.Sequential( + nn.Conv2d(512, 128, kernel_size=1, bias=False), + nn.BatchNorm2d(128), + nn.ReLU(inplace=True), + nn.Conv2d(128, 256, kernel_size=3, padding=1, stride=2, bias=False), + nn.BatchNorm2d(256), + nn.ReLU(inplace=True), + ), + nn.Sequential( + nn.Conv2d(256, 128, kernel_size=1, bias=False), + nn.BatchNorm2d(128), + nn.ReLU(inplace=True), + nn.Conv2d(128, 256, kernel_size=3, bias=False), + nn.BatchNorm2d(256), + nn.ReLU(inplace=True), + ), + nn.Sequential( + nn.Conv2d(256, 128, kernel_size=1, bias=False), + nn.BatchNorm2d(128), + nn.ReLU(inplace=True), + nn.Conv2d(128, 256, kernel_size=2, bias=False), + nn.ReLU(inplace=True), + ) + ]) + _xavier_init(extra) + self.extra = extra + + def forward(self, x: Tensor) -> Dict[str, Tensor]: + x = self.features(x) + output = [x] + + for block in self.extra: + x = block(x) + output.append(x) + + return OrderedDict([(str(i), v) for i, v in enumerate(output)]) + + +def _resnet_extractor(backbone_name: str, pretrained: bool, trainable_layers: int): + backbone = resnet.__dict__[backbone_name](pretrained=pretrained) + + # select layers that wont be frozen + assert 0 <= trainable_layers <= 5 + layers_to_train = ['layer4', 'layer3', 'layer2', 'layer1', 'conv1'][:trainable_layers] + if trainable_layers == 5: + layers_to_train.append('bn1') + for name, parameter in 
backbone.named_parameters(): + if all([not name.startswith(layer) for layer in layers_to_train]): + parameter.requires_grad_(False) + + return SSDFeatureExtractorResNet(backbone) + + +def ssd512_resnet50(pretrained: bool = False, progress: bool = True, num_classes: int = 91, + pretrained_backbone: bool = True, trainable_backbone_layers: Optional[int] = None, **kwargs: Any): + trainable_backbone_layers = _validate_trainable_layers( + pretrained or pretrained_backbone, trainable_backbone_layers, 5, 3) + + if pretrained: + # no need to download the backbone if pretrained is set + pretrained_backbone = False + + backbone = _resnet_extractor("resnet50", pretrained_backbone, trainable_backbone_layers) + model = SSD(backbone, 512, num_classes, **kwargs) + if pretrained: + weights_name = 'ssd512_resnet50_coco' + if model_urls.get(weights_name, None) is None: + raise ValueError("No checkpoint is available for model {}".format(weights_name)) + state_dict = load_state_dict_from_url(model_urls[weights_name], progress=progress) + model.load_state_dict(state_dict) + return model From 9e1da629aa6751b9f18ab4dff4aa59f48966c645 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Thu, 15 Apr 2021 14:46:09 +0100 Subject: [PATCH 51/92] Minor refactoring. --- .../ModelTester.test_ssd300_vgg16_expect.pkl | Bin 6925 -> 6925 bytes ...odelTester.test_ssd512_resnet50_expect.pkl | Bin 6925 -> 6925 bytes torchvision/models/detection/anchor_utils.py | 8 ++++---- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/test/expect/ModelTester.test_ssd300_vgg16_expect.pkl b/test/expect/ModelTester.test_ssd300_vgg16_expect.pkl index 9ae692b387bdfe7781ce3d6c624f9366e865a83a..9bb8fdf48aaf74e23407690673c82f65b6348942 100644 GIT binary patch delta 3714 zcmZ8kc{r8p*ESEuj*@V26pBsA6s7Hb*B$yyrBG%{ibNW9QacIR8QX;;G&v18hD0Ta zP9j7~!=Xv1xi~Z*sXpy~=lj0vd#>ww-s@iLe%D&h`s2CQoo1b8oq1Hj+{WC@dd5s^ z>lri5vrdNni)UEPure3wi7)!sU}0@*Yc5O>=Sp1*YMm{t6u17@^Mm?U+)$_^VfM51 ziPgNw=15*)V+3!UW&|&%E1b6`I-K`hZY6JJ#|q&x2`iJM-hsSBM*?`ENdY`nGk;!` zwm&Z*(Uk}E3}?!dE!L)8sQEpg#HDIK{5F0yhL+D@+*8c6?7y+3Xe zAsr?(ld}<;$<7lpGTVjQWz~e&#_0U#Yald}w-Hvzf5#QV%u)OIT_I~GR2n-^WFM3> zs4|l`lMrS~>x`0rq{{R^y+qhD)_UCk?W!Z0?@_nlUmQq6s*M0*V%eg-P(>aYc>*x2 z%tTlqxd#742ymtP2PeRN51mR7pt0*AegB8)cEY_#0TkL+i}>wDSJ?BT@qA9ii~RXR zpk=iJBw1I9@T6PT$k`&mpSr={`F!}44AP!80`#}*hyp}asv$a5fXfj)5iXs6hDqax zlJuO+aRJ(H%ZjjakN~Y=DWYV~GcQr3zW^O8huH7M zdRd2)0#^W~e>RJ_Dja?)j++QDYpt)yU^!+J@(NS%zVE##fbSi3bg=bI!n6d0$+xo? 
z7N$Vqk`J1a5|L}R4gCsxv8+W83zQx*0c&iqPPdYsA-frxl!Rf|45(!a@$T|*CiSg7 z#1pQvJttkz@OLk1(^aO~OdQ#Kxl%UwwmjLrO2kS(A+@X(Lh9ZTT6i`QPs{(N9uFbX zUfrMo#UuppRKlTFA$IE+z_maC{ULWz(WaTQ_{PqGmAolTEqzeu+r+F0GC)k{b+*67 z67x<8Fiz&I=t2j!4w1KPIiqp3i@N1ISpI$U7xWGS6cu@j2)xK6^yFGMoew)mA5u7| z6uU_s+5*rW4N)>BvPHXjKBLJ_V11llvn}7M*mt)J7|yf9q)~B|H3+Dsn+?fmw6PTN z-){B5I`wF*RGx~E4=EV`hQUJf&*Zkbmud4-;^VYoHG5E@fSKTI1|#DutkBdB>SwX&YJd35ygG(-=3BJ#5<^q0lpLsS(riT8#)!>_Pn zB~tMA6=11mqNsqD{d?fYiJ+LGK=RJ4r>wYQO10L(iR?rqnegX9ai0(-+|$f(iI9xv zx3OOhib=wzf>9hLy%b(y8;6Y$Z?p-0ft|EZCLOwQlGxZ-z$_2xqouc-Sg#A8=yb3E zI~0w6bSTydHE{xzW<-c^$E^gU9ui=4l!6E=9y7%Cf^w!`AsS))Lv8Gf_rE~>?j)RS znTzR?JX}f)fkt&H6;yg_{*|h<68Oap;7g4Ll&-C1}j7}LQ z5gA^hcx3@zyFI4AQnJt&Iv+V3Jt1ql9uAfz7o{J}cZc+;ndjo|z?n=*H&WAJh= zZ7Ny>w=12D7I!7=9~@&%bOiL&tCOj_dXoI#pI~qK3{hZv1N$XQ3+1~#Xhf)q+>~2P z<{C4YDV4FQ_C-vTxi}xDNv-UeX9}=$J5PDpjf~}lN0h5{l z{);_!aa8fo{7{4*oPomP5=Lx~5(59Y!tT`6MB@Gox~{KDxBGgiwKtgiu?TRJcXO!w=~MIGC9;5nXS% zczQz#@u6kx-MyChdT9}P3H=c)c8T%(5RKQJWvt&_d#vGCyrQv(mV*YyWAnWgaGhy| z1vW|WU*~~$=9P4=QwB;#-)Q+QUCf#oj&_M8d`U@0T)YsNv>uH&`xwEN1e8javH9B^ zuy{BUrS7ZIe`zPYRTg7mT^*aP83!YKFM5<5N;Tng@OjdC3N-dXZBHu04_TIsdDR!$ zK zs*?D4RM9MYc&0T-qOV#O=E++4Ibk|9_uGQ0wa43Q_VjprD7c%liSwqKRbJl3_$3-r zb8j(w`GW(E94Mt<7nhE{4Ns`*X*h)o6Y+SYflA7S@Yr>gEnHv?vCdye+jRl__b_CT z8c!9^N+>g;hV;krccA@D2UAkG8{UmotikReoR>@`jf{Bs+-;(l%~ed5r4#0!YGZrj z7UJ%LxfnkBh&f}m1n(|ZvcWHh=e+z8jjE zyqp`v`Fw))>U>CHwp!FIoQU z#QxGWnmuBU&vmNkwb+0YljoqwX%}K0huFgBZHyq;k@U_TXTP=?LwT@+SwES9R^D;8 zN#{5{_Opg=>=jzO19+%1o?LHcQ0N>_dOZ6CDJ7f2dQhA;CN9F>hA2MiCLg5AHyOl_ zdqqLAO_X{lly*!npr(Po}IOxq4S<{8>w>0pSRe4@HgDC1&fsO;A%UVr#5o@IE@2 zIN!FTswaiq+zETFc^I->ggwK~sQ2Q+?GT?FjqRW|?pF#w!=P2}<6>U4- z$A0Zxj;3TAO5%sC#agdpq*AMjGbdZ9HC-3VDisu`q=o8JBXn}^AaK{1iTb#jpnMG3$ z%V043KD9rcg6@PUm`G|-Pge@~KiTY}2E$^8)9pYzj=D2pzp4Q;<*m(F$K_+Dm@PvhXiRBuY05&`=&9n{~m8Ex4wC`k1k6+~+z zN?IGH?`EMa(jLjTQYm)Qazx8C({R#Nw(CnQl`C!I6DQ&{{aF?Qqh21y<*Y;bJ{QCe z4AQN-cKQ^cg=gIo=vJ2?#}%(Bt0#?2r;LYZmLh_eeq_dk>%e(=4QrS%1&{A@a4y#Z z19yHxh=VS~Kf0l-!2($^zOZ{G1w-x==9H5few|m$Zsbg(P?biuIfU7`}ZJ7 zOWvcAx_GokkHL*J0ip*TL__0Rm+9n~`<%5<$f5^>)@0OHOz+dz;7D&8Im!vJ#r-n5 zFPKYpA#q3;ET=lTROAInLjThN>gk_=L(k7rozYMHtg9(NwnZ(;seP^5|zt2W~VCmKzC zb@X(!fXe+JXzGJKv@pjNc8_{#=|DKH+%|(^Tn6b^dtv_M%QV0pPgf%bhXZLUcQZZ^WtiggW*#X# z@j~9YY_f}KW?x60X2&eqMBOvpsqol*qz4XA?#m7M=rM=AEpQQ+5F1@`a^FueF_GRG_+OGpTK#{LD7K9T_g}pzlA6LO HjRXG;;5g|e delta 3741 zcmZ8kcUTo!@+Aj}0)wDL2_ia3PU7?3I`ACAh-)SqPy_@Fh@hemgnBlu8!MF!@psBHy1}YXIooa&RlxdzX^_ZP7cmoscCyev<3 z3pYl_PCL3Xnx|n9#oJXM$y>Q1lJ{0Af?F=*H1_8sA-pB&Av|5%U|y_IFmL_zKwjm+ z0G^*g0PpaT`Md%#cUOidtGsI#ry?7o?pEW=JLl)i<4^PDmG^L)WbL`LvR2xrwO%~? zGB2K7q!-T(UcA&w&$3sthSIX$U);GNIa3|~$8J3RDmPy9SvTGcUqz)mlN0nfD|t!5 z-+KMKl_>deZl#hHm#kvN{jOlb%}_|@Rw~EsFGz=!R2~vqI2_(3#)Q#|5`3l21WPN#C=QO1;Oq&h5IBjU zA=*aY_+{6wQdpQ6PHkQif82`&FmKPoyiFDCo4;rNG)m~t!dE4d@N31n^c-1OCy1Bu zogQWg5ppXF+3wB~gU$_qgcyop_o-77FqEqSvlKBD6Llqi!#rCIY?@8u^$l3$+3jT2 zUCfMMW}(R-MB;z>buCWM&Vpm}B}x3&m3@@{VJj{EEJDk#O%mIQ7elONO(yPzW)5TF zaFs`sV9+SST7&J9B=7Y@Q2khh7c){Mc-i_v+L$Q9E(G_u4rj2K9Wr{EuTf&0WqiEI9 zP@0=lLEDoG=xAsLb{;p76uaPvD>nR*Nve;m*%9*++PkobT}c^(#|zEaM*B%nYbs(U zrVSK!J05PmY7pNMp~Bb(dmo7K<)UzwB#Cu$3=-rrvBf%5f*sW~FllBcn%6o=aQuoW zEIDq<%EC8c#F%0h88`dEM&l3- zY>x!_n~_u%k7|+*P;K8%dh;R!XWObtL)g9z9=HCa4)1voZ})?)P7+-FBjIP|n9BBuIkB;B2C&NSaHpeMJB zS-!m|?U=R(`KyZP^>Y!{pB3Ri&ySL3vYaB&JGPpC>gp^A|LOFAO`EzH5g#t`odsgb z?W<=tcTdu^&9yA}bSFj4*+5&3wXsRq!1p_AfV8xGOySxDWRK}5&!~wk=aL3upOrD& z7X$pXkjqs`{m15~z;6ZI&nvvPN;? 
zVahNT>VkizogQi1Lp)Lwt)WjTW`Yy)Lsjsce~$&ZszPuyk(5IwfI>R?u|EVMdPNh9 zl39+M``=UX8xL@t5iWcVz&N2R4(wclAGJd8AR-yn{ffBvvXCD4*}$UbEp>fp<5O!K zzI?7@@_s3J*&hvUVTu<9!m_Yab1@D)tY$TubKtdQJ!#xpji>XbLQoq_FRm^@U&k?8 zucwJvI|G{7>jck@BI-@H!S3tv=omj6QQacMIV$6fWf$Ey`X150Yr*XBSnygbF=4+0 zio+am=u<4}vfs1w6SHu9a0MwHw1@E2Wec`?-!$}^pJCU7d@%h-3wCob6n#gE*pkV$ zB&bir>t=PB+}ME;OO2uYP#yw%FMMb>h08_(P14}u^_|m{F*g7Y(^~ij1*NQ|yphew z)1h~rCDa}`nT<Q+Q7v>E*c~Blr(?cXu#&`QCK%f>21WyFqU@+@YYRJaXr{`4`10 zuz%XfI0m^7Jf?|ViAWsfPt(o?# z?k9)55yRg-6$6`#B)`dk4{c;_o{Bfh&m{QRgAwQ%JsmfC+Uc*Nc#LS&!j}X0C^bhO z<~|ybjWk8qxE8wclK^Vxzr&QeML6DGLbi80sb$~`b;M}FzwivJ5E$Xm6g|?NG69Q) zJ>l3Fp^lKqbZne$jWIq-a9mtX$6kDo^JTIqnUsQ$!y{HR%fx0cU(B*G!MJ5}Frej$ z&TA`i{?vIov{(S2!ADf;Hw$yL^f0LQg^i*qsAQ`te_cG>6LewOy#>b`>`>=e%a4xh zVRh4Lnd4{&5+->(=kH?^aqB_@TkzB!ay9ly;}ub9B5+uIi<<34bTz<~jHaEUZp+z} zSS(LNc{WhEpGBkYcth!O4&AwQkd~A_p$+woq*wGaEj|!L&Re?4;>`)t37daAaX@jK}>?=-owkT&VJg$)`Zdj*f%g z#p6_+^o(ygDGV0;M&^;T3U}U5C;rZOlqzS^bO&Jy_>t~7JVt<*{=PW$Y$oJ9R*>cj z8|e7%qaV~KC#A%8t;9yZZc?@$cY z43KhL6dtMB;Zu+ymLHyju3~q@87g6U-&T4PXM-zAm86hoPDTf#@kdrZ*x93Bi~`t>NHc7b#zkkCNFfX*T!w>TvB`Bf#7*hNxC}#g4vr%WOt6W z4Azi(_YBH&a!2g!a8w)mp;tW){J?tB_(umdb83)1s0E!%U9|G~AZxj9h;Zu|x)jsS zALvno@t%4i+a_ZO>BbRI+iD5!-57i)a=_6FA5?6#gpPXRNW7)R;`(tRsu2baMRhB~4Wd0k2O7KYdKZ4=2*t%he+IJeVuly1Vr=nQ=+8>0JNxp8YSu(Fa|qIb}* z3vAGC*+TP5y^%K9N*h&7Dfg8eR=Lih^09i@-#D3Kyr*Evr`=?@AQpuxmN=rFg}EL; zqCkdOn)$(5{E!*62O{arb(($pG!4B|!0$I&>GXgn&F%gk!YHdEI$G!rk5!edBlb5o zVtxWOx=bhSL+|N{!gR=K8$;|6iJvs=VYPG;Tr2@soyjN(O~6CLxfqza6$`(vLi5sl zRIhoEiu=`ZruaThstSbRqfYv)vK&WjP-R30yjh{;DO7%Q6vrpuiTjg~*X@Bcn$*1Mh! zy1JF>?^g|DyBbu5T5+koS{h$0)C*(R-l@#wwyHN$<3{IOp}HP+jL!E&2$!qxmzKlM zV!awEjxDUz%a!HKLalUprCeEDsF$uTS8EHalbNtXtj84CG2~7q^lnGUkaWHhJx-};Xp7N zbleIvw~NJUrBW)^%hjvPnOACyrCMq6V!2q)gqxn73pdxpEpy@4`Cf6LSgl1aZ(E8y zTIybMEly0HotzH0FZBh#)WaQ@f`MS@W;hsRZiYK=FA3IAJ=`@H?vAl4h0CSNayYE9 zOhhkLuU=UXXpa-n#8fsr9q!fe`waYkfj@93hCf&j56y*#R|;dZL0|um7v6ka?odju z*`|d%ptht6H|`{GCO422@r%2k=apKst|%$-T+_pkuE)Hi|9oa-lzRNrx#(W0Q$L@} z-&-wPx`LzBqc4Z!HBFWJ{oTKEvOfN)rgDG$&ncsq#>@P1KUdp-GXL=J`?+sVoQ&~A zUJs4`;Dsac`or^@!o~055AMeN>5nyK-A8@#HTtXV`t-%N8?T)o;lFomJoX1FYkI8Z z-FV%($hC^smw(&Oc6`Ba%$Kan&~f|tcKx($BJbwI&8N-x_aBIBd6)02mz!_2@!#hA z@@w-G-6S5H@6%6obiSxF^uO|3)myOs58vMpc(~@>&u}H5e*N_RXzZ8P{K4NY#`^=> zPT%$?xtBZau*%$}X+Cy^y8Y$OMbqtP(6I0F+x5F`zg@q(d~G|4&*!h}pWV-izF=q^ zoA2v)o1b9I-Nw!{SKp3bP+UIy2r4pq5O*^7`Ab|r|I#=c#Y^mQ*X%s>`Qz?icMc|g zyL|Ehb@}b|-F&yw_wV=g^V_wD-w8!R=hw9l*C+3@PuxO%e0#t7_=z3uPrLJ`k&nmC z^wari)E7N2-}6`F;%B3N8Xs?vx8qNEa;I_I=Ra*f!Eo1^BeBuB$Z`44Q}&bZJahfG z&s$gCor|Zf^oa!f{_{4;zl-nA{rbe-`1?le+x7G5+x_qIH)fwGsaKlX_%5E-Y%cHm zFLU=iPinuGm)Dx_?|gFO#%87DYV}%ky4h<@{FXM8n@nD7Qtzm(CXvLap4bfj~hMYT<#WQ#b682l9nJd1ZewCyyfr0DSt1dXuj$Mvnb} zeey#5gGP>Z38bRc>+Fl03ScFL!8u+z90|Gi3fklAAXQ0?6U7UC)~ck{#FwZ)b$@+ z{2>42g?)g(^gHoTH|jvX&?7J4%k$}*x2L|o5CO^6{Q0y^<=3B2tEJj<^xwct?!4!b z)avJA18*uL9{l>%mDf6>%*ZtxbO>4hPsRRdPZfl(hx-qUt8M=_W{+siAEHv(+U{q5 z7s-B#r;+YP#cYBU|FU+G?0lz@ve7ngh9tiMTqHYhR9r{*AbOC4WU4hA^!Cy=hq~AZ kN^!(NUpGBGXP%jnUcB4BGkZbUvq4YXc|DP~$ZOeu0rDOd5C8xG literal 6925 zcmeI1OK)366vuDgiTg;KrtepHHGx8sI*%5yXt7Z?mzFdYC?L_ub*v^})g9X@h*BvM zYE@vz!m2<*z?KaX6p3%Zjv^l405%AAED_wfGe6()$=FWIwj*7gJ9EzYpV!P>&v-uU z>JEbb{$NAf5ex;na(-p?T47>2SIv!IeX}^1*&e*s(AGO&SHTU{F}BzfAzUiGU8tze ze6 zUE>mabG%FO`=I<-+m{tNCh5ZFzD*ZLO+p3u^mfuXZ3`Dn~94u0$TK zbgu}DGqW=%Po&k3mA>%%s@i!z90-SRs-ZA-Q|-FFqOpdnYWITL6Jr%~mkPy-+H0_E zL@$=ET&@SSuQ8z6*>pNRt@a!E0~Y?EhClRv41c()jx4C7Yq|0Hu&@8)Z!Y}2+7Sf$ znQvOSL&3I&!cEi@Xu_Gx3?xMig7wo6pNBbkI0yW5Krg%JuRr(BNVMnZr`59C(`4D1 zWj>b~N|s%H58V~*LAo6U!Mj 
z;U9Iu4*5cl{DO}k7N$Ngm=93$#SZ}dlJOzWyg`mS;Kwm*5dXm;F8pEMQ9tk)CruU)ZId%y;U}egZyq z03ScFL!8u+c|jiN6A%89Kl~t1*k#?bPsn+J{p~g&sPrFP{2>42g>`_x%y;6UZq$K% zp+{c8*WahlFHU`SCIZq=tMAinlkb!HT|Jjcd%jQg4^@+wJL~UdgP@lE=j)GRYSL4S z-gT>GM{ZW*@zJv%*7<+DdoJc5zi}@6)wi$Majj$Ye*0u^{BCFv{Q2vySpLrMhHB)| zzozQ)F5cGTcQfkxboQ51cQapnFdge@dD9I34}Z_Z<3IgtC|vB{5B0^j(O+Y?8Lo`i zjYs?T*0G7$U#RP^j92!xY^!*E`FH(n$2Y#n`0!AU({HRF(%6;xkok1^{`m)DEi3uH zddYmFjXy5mmtU8k=;}8Uugmx8Cpt!+Iz#W5->Tjk>;Lfm^8pWQ-t`Pu=fkhhym&A= zwD?B6KA_~iImILFwd*4XZIs8ZrgDDCzSflU88rNr{C4x*wcl>OOTMv9T>kx9`sdE) zL|(%n~#ql z4^HtIesi*3YN!&VR|@oPDCCevQ<{mv~0AxxDmW z_vL<{)V`M0=bwwGubx?t4odoIe*PH@vi0-NwL-ZP{Wma|IsE`fo`#dqR7!jBK)N4mHO2m-Enps(69{DQqR ajP&B$hA-`1fM(B!J@LuAC(_pPM)p5#+XD*# diff --git a/torchvision/models/detection/anchor_utils.py b/torchvision/models/detection/anchor_utils.py index 8f37a0dea02..f50fd0ea336 100644 --- a/torchvision/models/detection/anchor_utils.py +++ b/torchvision/models/detection/anchor_utils.py @@ -212,10 +212,10 @@ def forward(self, image_list: ImageList, feature_maps: List[Tensor]) -> List[Ten default_boxes: List[List[float]] = [] for k, f_k in enumerate(grid_sizes): # Now add the default boxes for each width-height pair - for i in range(f_k[1]): - cx = (i + 0.5) / f_k[1] - for j in range(f_k[0]): - cy = (j + 0.5) / f_k[0] + for j in range(f_k[0]): + cy = (j + 0.5) / f_k[0] + for i in range(f_k[1]): + cx = (i + 0.5) / f_k[1] default_boxes.extend([[cx - 0.5 * w, cy - 0.5 * h, cx + 0.5 * w, cy + 0.5 * h] for w, h in self._wh_pairs[k]]) From 61482c606834e54a528443dc639eae1973ad4524 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Fri, 16 Apr 2021 22:09:21 +0100 Subject: [PATCH 52/92] Remove inheritance of retina and general refactoring. --- .../ModelTester.test_ssd300_vgg16_expect.pkl | Bin 6925 -> 6925 bytes ...odelTester.test_ssd512_resnet50_expect.pkl | Bin 6925 -> 6925 bytes torchvision/models/detection/retinanet.py | 20 +- torchvision/models/detection/ssd.py | 272 +++++++++++++++--- 4 files changed, 237 insertions(+), 55 deletions(-) diff --git a/test/expect/ModelTester.test_ssd300_vgg16_expect.pkl b/test/expect/ModelTester.test_ssd300_vgg16_expect.pkl index 9bb8fdf48aaf74e23407690673c82f65b6348942..4c79e50d862797e661a02d688a275f19410748db 100644 GIT binary patch literal 6925 zcmc&(2~?Cv@*hC15kyfUhX;r#7)3x-FtpBBKMxQTLn$?7a1XNU%?9N~|iR|CqxBqP3_IvgE>;9^KUDegqr;DRf zA0AIljrUKfKTn(I9~B@Bi3=JZ=pW;69=SAhCiUZazmrxgej?(n5-FH^D$5WSMJx-7 z7AXeAM92iqePd%nLZkVnLxCR(?2*!)aR`T ze`aWef6V;1m3)!1rc2Vh+|+s&-Eo>Z_nEZU=jE%rUg#QF!>(?ti^5~Bm`R=)%6)$)Dp#iD&oao1j^vSKe=E!@i-pY36* zD)+F(CVSY0rRA*By_`)MQ_kj@ma}^8a`sTOoSj~|o4JSYX7?BFW~bhivCmt|Slw4; z>`ZMLD?C!hzDg-$4Z<>ZM^MHhns>4IH>GU&#ZsnlxRgbgmNIchDckZ%DJ!2>%Bme? 
z`QTEfC!;qirEE!a2@5$;!W1$}m|sc>V^$?>vsnpC8(zYy`j)VPKNqtF9mVX+`^C)R zY%zNxDP~ohikWw0F)Ir$W^RZHGXBV*v!-|;V z3khp!ldyr`OPJYTB}}_n!osdd*yHmOmRBKRlZqv5YrceyUnS#-kT4G`36qYIu#fvn zSi6#h#Xc%zU*9fdMYjss^RtCaTvN!ps|wk}1BI-yq>x2!FJ$ps3t3lYAqy20GQW!j z?8?~!*0;KVmF+BGMHvOGIIVysj3{6QeF|8EVgXC+&S%EW`E1XXd{$VS&mNu0XLGCb znbN_0rY+8AgIDA;g~j>o;N*N3HzuELG|6XnBl6j&xAT~PQy!aEo5yr2^4QaYJm!+0 z$2d_Qiw?lu?|FY@Xz>(7QQoTWJn^v^|HtTA#yq$K^12*j>7R ztjSO|=g8?_h20153%i~$pGn%kh25`nsC+7ur{4LXx7Ght-<$8xD*vzB|D$&G*1vc8 zkK+HQ^1a*dUH(t;{vH1x#ebjt`|AIlKL4x#|1Ew0f%A|2e`U{~-R^(ezW+8}Z~Omj z{6E^>pRJ#KeKL6Z#T$WaGb^WmwLVRH-}=q7OOk6kmy38IdsK(K>P zx>qw{*X$;6-E57d231=5dM+ZT*a@o648mEd3N4Em5BajTec=$8Z#d6=S8a-$?Q;bx zSWo zkTV}$$CPQr`fxmRb`|U}Zl$)9sn3_^>4OeKbr^KT;iI+nWZRkojf)ES@Nb#;psyDm7-?et@pwEQZ;EU6Qw2** zVsMG6kfUlEIx{R0mU4m;7Pw%kpq=bDFN4~pp@Og)@KX!HlDjFGZ=8%3dRaa8 zyjrsWu?DrAs4^JGu0Eqlod&e#xi0FvlE}PGi~5D6!_+T@r2CYcsL#$yUnOxl79iUZrerbvWlLVruJ0^!Nuu zT)&%1m7#MfabXOSy`o`&3!<`it~2GiSHsCH}w!dA>f5xOJ{=wvA3)Ps}UwkSh1n%k3UIEQhjKCp@NLsDftj@G8q;LJdL-%?9T?FmqC zv&HO1>8SPb=<#QBY(LVedd)SgOd^#rLTX=PhP3s|Y5WFz1UxxNb6m4Y-`@wrPd1X% z&UhS%g`mx=oW2ZHpre}H1f^~Bbau-WfXot^q3S0jTb?EM)9i;>C5cbzO@hKY%0NIEIk zk;iMk^(}dZoaNdlDWL6c3uk4f4CAN}YT0gxfE!ITDxj6NK^;M|{$cJMrt-aMhP59Q zB8zAUr;V|jUemM77LX3pMeBVt4A5_-U_*OI*Y?Ny{Xv2=`@ASlSg2_N zB^w(w%$)}nZfU148jPSjx{q*s|X(CLw+n{upGkkxT!#N}#;_g2*q}0P|^h4MwWl z>?7Pqd)0A%@T?wvuk^W1zUp;c&iW2&vQ)%~n%A^V(+HbVbrD-C#DS3ww5cr#cPGev zQ2U(H?|IWx4Ly`+N$Ap!t+aW@r=-KK(sbKO%3L#!zD?Xq>ECRjvH>lW6;?{CTJ5kj zpoN-}J+Qc15suk`(2(KJdX<0^8zV7r`yJ{ynuc=~YPc|;j}Ye=2#icI%r^}~UYcQT zN(`o}*~5ETn#@;I1rt_`1XZZeCl9CM&~gX7+NJ|;kq{2X7Px;l5uDiq4 z9GyZ_?g>e^E|Y?WSCC+~9sxV=OHiZ5jSxAQp#KeWcVQa#w# zI>S(D3RXXEr8*N&Trchqr6p%6r@{$0Rx2PQQ64iqo~K|emJh1s+*a81l#Mm2xY0Z6 zIqe60S`w{6cCFFq*po~aF=-fmR)`bfAJNI?B>be9McY$Da7^fgEw_?TX&8(Tr|}V7 z&_MGn%%F6ehdnp>@Y}S44tLAevhGUCvIs$Bu^nz!Cqa761E(jO)4Y2_G4R7J)LLeP z#}Ui0wA>Ciu4mw@OCB)$+!CtYb}-3xf#`wWT(+RT_ED^9b7=EGsDf#tCOh1xD-tz^> z_OgYmMlPvIJu$MYkxrP(*5_t@Ds*sx`qX5aT;YI}ek;j0L5L`$O6q)BOxvr+qS5#f zY532APKg6@%G~fOREP(&A`ls13I8Wv^i!7$Y#SwX z5xv|BB~?ynlGztJCm7?rIFwvHK@T7JLfYL<^Nr`wH0wDybWBQTDuU6~u7|OPGJllY z;L6L)p7~|eh73yBR>wU#TSPGxx4DT6+;GosDMg1y;Kw6=xSEuX*bG^$%EtdC_vf^D zr4Ibe^iaLU0!PPKVYz)M&Tn%8ZB9bma4*;zr((&s@z`-U6NB}`Nsz9LK@Y2F<7b9& zk6cQAybW}HPbCe?tfh&wwP;wybhrg4Q}{Sv9J#882rV%gT(*HD8tG<`m~(d@3e(ux z^w|y@ObVX|m&ZOhc{u|1ChiLfwuB}U2S5A_up(1>$@(AWf?r-E@_Nq|S0`fxaPk_>;c zLD29H`peFIF8jx5Ox)@~CswAxqBM+5TBpOly^=y!_+sk!o;bX2Jyq4mBiXNpMtv%T z`L-Bbznp|KJLECK?OkL#^ATmA(?wx{5iHugXuxg(u5Fa`_(V0se3JOrhGKI@|K~0 z$vDgzz8=EGi(!HdUGDo{q@FWoZ=~P%oBXNgw{%(eH>6*@<>IKKGWHjaVM^nE<&{4? r$pW$~-y$Ti(9|-Cem> literal 6925 zcmb_h2UHbD_oqn@}A%dS4og|Qp zOy-LcO+!h#5XTTA=VTMb#;u zAD%q)?>z42owY4Mc&9+DFP*Tefea8vKMa{Pm8CJ zDNx}V@nkavs>>oH`SJ15k;$?Age28TiBZvs(NPm(Ba>AHUl@A})RG0gy#?w4isBAL z@)Jc}?h_;GQH(;2bcn02IyaivT zgqwQul$1}0HrvI@%E*YTXO9m~N2d2@56E^7iwPm zK=cD2a5>`x%wZoGI^PE#sQN&$|73`FnG8Rk@rEzMy}{SZ8}e^>foiB11mt>x>P$~? 
zo;wLPSWSZZCmt|2!UML1c)*YSJ>X`lJE-(?2ipT~unpZn0oigoc2Lu+nuR44ya< zgjZZZHqQlyhq%B6)&5K_b>^cD)9};v&6I>og;5D3}`4qwr%?NkO5S~v#m<|XU zHV6?T5mfsjd{9F0JH zjb>nu76TVvIl$WM4p4B#0q(7LfW~DGU@NL^kq)3|>;TFe08D%VI)?*j^##}{15j~( zJQ$XZhvUWLK`UfD_;rpvf72fLSL~ttlsz~!+QaHPdx+U-4+=T2uX@t?J=xO!ziUsLFKtWbb8!y;F3#~duM*o_ne#o@T@hhnJ1)Q48kP*z_Q1}IFQ)oqxUl_7Dl1oYalD1sGK8EqZLTZb7!ixLqYaNSfCPLEqeU5~yrJh6EOoe2d7$mW+RATJ==@-6W9yjo5$QBdrIkJ6olR=T0_k{W1{Ikvpl-EqDY9u8d1_o?rp~gVB!k`T z7^OL+gPHW&EsqARETCs6wlO)sIgo7nDYk15kLn*?Ks`LgG@8p&p>jD}bWR2B?`KfV zlmcu@EFgtnH>3A~47yeM2)n!rXvO_AIJI{sg)P;fjm-tLT;GV?iiKqO#6!|&vz1Ep zfgMj{Rm{lDDv)Y}jx*CkjY!b`6WiTnMJ{`V)JO4vKRWjOJIid)-Z;LnRKIa0{ z;;TV5#{1X}YQ;=HS92OU>LgoWW>14xjHNLr8W~=tGYwi@#wHjZVLC0-@xirb)@R&W z+;U?C@n8E;{2Mnij1ND(u zw*zq05D)6@qJw)nQm8gB3)8M6Hd}s)cbqD4uW=@#@iAPiDWnJPH}HCPK3$EPL~Bxg zNXaaj9IXVze{>J?C!S+&j#8$#-<7hiCpV#7yDx4~?#9ZNAMo4t%b0U!)9IPwan?&U ziPq<@XIer>V3O&NtYftwzKTmF^&jkTv5e4+?vmrf@4lkJSINenq$-pZQl3-Ok3 zJ5zgV4+cNq&Ylf?f+4N-?7Mtjs$A}cZwqv&DEln4a+W!nYgDi~4y8rQO3xu z(x8wZPqIsObSPt89{yyggXgYxVe^G7WQ<-S4KTopLtjuoFMr%M&x^*K5207F5!9<_ z2JK!kg3@>Prk7@-aSAr37{jGZ!kS#VZC}Tl>aM2TT_wya<0{0{Ke4K}n(;=12C3-3 z!(D;aRFtnx9Z`cwcUTDy?zX1}wX?Y6^%dMs>h$%R{?zf*gl?VDpv6%Y?8R@b=>4(D z=vxp>VKT>Xy+*q7$~I-T6U9!s7!nG~GlMb9jDD8NJM(zt63W;cEPfR9?sra5TteQmIuD7!7-c#syRVH2A)hL;7 zM%2sG)qP52k)=ytrjI6_b>oOR=s>@ncEB5pqR3=+A*w&!$7)7*FjF#&vGKwV_QVTE zeEWSle&=5{cTKH$#U{*sA7$lhtgsB9MsNROo11Vh{?C5Sb?A{aHbjXU#7wXY_){DB>1B~nX1te5&WgXZIavD{RK`D*Q+BIiT z{mpjPxBUuE8>fqn1^p>mGYhLTAL7e)H=1!?kBZA;X^h6qa2zdvfpl&05&XGiGAa1|f+lMfF}u4z ztt(%Fc5f}{P3-`>V3|VOzZy@a&dVsx=?PnMr-c!QIpMHF+t~Llqe$~{8(9A?tYqw z3+La*P^IITvoQ*nj4sAoruCRg=WxAm9}2Vgp<$cHl74>y**xWA!|MSQWxNjEErt>Q zKmyJRzJ#^4MYtvY73St2#hD&a^vzX8dUtRUl_b|NimA$^5!B44C25gP=o8#ac6f25 z9{C5aL8Cf-`YLZFE;%!S%9kc2n zsM}+4!M3aH`}Sx$o@IlX;fa*!yA`z#4xs&eny@+7fHbvs;d~8U+PC*D?uoFagVhF9 zb*_Oe6j+JYBK^?vatOU^ABbIPOQ^{|8x4L7B-PBzczEUvDyn~l_4E~K4j)aP=N(C9 zo-3)}%g3Rc6zOr{C2YMllseNB$W&ezyE?LIh|MyrH{QXhcRJ#dt*vZ%KoTC7+k-Y2 z-O17AEtZ?jpx2YKamM1ARKeeYDJ^qpnf_0x9(01;HbR|_4EMu~p>gD}zYV*)=1@!F zT?`%Y42$`Cl+a6$%%0g&MZ5!Loz20tfziZQY{b`@r`V2nX;`VT0M+Aa@Ry2k8hHVz z&zdBvT+O4jhmY}WZ7aT-s!O*!<*0M896C*Zfcaf3&}?X5^2zT_VG%DG)mVLUjjm>m z(}&WHOX_r}$dVqOA41`d1|<8^ojU3*DPIso_V*RY*yJX&*Le_q=dy!MRUd&-S`BPt z=6bx8?ty+gL#dbiulTlhF*Wm5>C6ft@gF-%)`y9_(dblkhqYA8$IFkcab(L5e4aat zHeXnQPRc@>=Wzl(JSSjn_HIka}FJQ=?F9=p2x(Z)LmaJRP+>KxRi_^M*e z6^x@>n}_4wresw=vlJ&rxVPRDAAq#Kpi>A!=JFsbZB6*#^ft5jS%d60PjT_ltyMPf7W9j5MbLu@m4-NPE(xk6W z;6szXcq;B8njG`TQbvJ{6rAvEVSidMX(wLMwWJMRBgka%OlltNPIrf8Q26z|I4iXl zud2Mqi+L{)^UUbY96+_3zO?pGA=(QX*#`+VtZKk)?EKmTOSVp;+>nP@bT5ToPU%Z& zCMGo2^=mR=?a1+Wr1{PMq`oJG>NLNhmZoKtqg0AR`E974{}2~u$&sx49UPxKm0aT# zX!7Ye67PSUy9_NCsmaKQYtQ?iJ~H%a|C1V>m?SzIa6*UwVaxN`PRLSK+=Gsi>ArHJ zzi#5{$&(esr!$qm{iH~~zET;$ksr1Lw^5iKhj{1Y4CqMcVSA6g! 
pE*u7m21Z=G2Y*CSp7^LqrcC>zEe?C~dWlbeL^-`=#Qcxk{{ZqFpkx35 diff --git a/test/expect/ModelTester.test_ssd512_resnet50_expect.pkl b/test/expect/ModelTester.test_ssd512_resnet50_expect.pkl index 2d79f3168e3d48c3ae76b0395e1da632eeea5ca1..160e49d09b1e4dfcceb2d86a5151102c251e5c23 100644 GIT binary patch literal 6925 zcmeI1eN*^XuM^0Nh7bt2%8F^iAd5m94P3-w_Oz*;buC$YF<%gAbzOD$KELmMWM+!&^v~|uMnL9M;aTB9wfHW$^FS&Iwo=E@pdbwOQ%R?@`A=6fp* zYLuFVrDn+kO|+R-TcQ1DSgDmf^8=JRy2SXzDM>orRNd6%%|ubYx21zNlf@7krH9D}c6M3WS3&^AfKS6W!E z5q9bJTq%TdSqqEI)*5Lfa;YQAwpEti98jq9fD(1ODN6b%_ zDXO+`lF{JplU2Vjt;|gj*kT-X$3+PKPCG8nWdbwPeFt`;G6fDr7C#yPb~;-Tawi!5 zaInan^PHJhf9&^2*}A_nU~Q!?)$knJdew51iMpOeP0q(RWV_IVg)6o1(}; zSxg!|ot$ltEz9o-PY=@M(r>KeJw$xj`9`PuwFhz>>hiwvHk@e7@_D`GPwhFk_IZE%9|;+(Grrs>(JL0R08C6jZ;m&By9I4Fy|vggqF_KS}x zbeumvXM*tCb**X)@f8!6kZ0{)PHkB}US1iCeG0-tTZ!y5mz66Ip z@Vd^7*jr26Vu|9tP<#&zuQ~fVgVg^`yR%G$x?$4EN$YSg7!M(}s7?j1&Qos%60pwS#;LP=- z$v;)Ob3E2-?*w)Cuxp|$KCeTmzoU~=hE^4^bb)yz z*)>fTo5Db2Uc{c9{8#T!kzt%9BfG~a_nI=$5MM}c(18bq67oEE zaQDX}$^Xy;u1pzAzB&VR9j|oAO7Z*N13R|QBmZazxc+J_`Byi<3Db(mtLB5^bEs!h z+XY_x1;wXb2jzSV`P%tlxZmUCJN&@}0oUUrK451D`mta;_}<%dseWKL_~VU_lBaC~ zuN})Kf8heScuyqx#>-&Q#V9(yH(vrfe*PH67v+I(oLxljdk)mrJW2i+JGeEziaam{ zw6=hA*MJFH)H@IWHlJ;z`n`0omqlYIyzv8YcPN-M3XJG`gzBHZ1ez=h$#<%330_U! zqjF?Q4f)(`aNQ?3ZceIv;>*!A-iCAFxXpMD%2cUYRzdBz)q~rQt|C9Ca%m9y_118( zF&Eo^;~{Xz$yTcWqc6axPpu_i-2>`RU>sI9RFy_B(9+Zz1R5*G6IRDf8jeY%KWr#!bFmxRJ|C|Tx{xw2stJNzX6WQ_u z?qBLx#vgY7Qtq0{_o<;}Xby&sfp0qo*!h{Z5rk)N7n;CdYYWfAtKM-q3+e>fSIKyu2o= tH6k_qf$y8Qn%U|zeQp>kEA7}{O~Olez**R6@MQn~D35EN0;`X@{|3cf`nv!C literal 6925 zcmeI1O>7%Q6vrpuiTjg~*X@Bcn$*1Mh! zy1JF>?^g|DyBbu5T5+koS{h$0)C*(R-l@#wwyHN$<3{IOp}HP+jL!E&2$!qxmzKlM zV!awEjxDUz%a!HKLalUprCeEDsF$uTS8EHalbNtXtj84CG2~7q^lnGUkaWHhJx-};Xp7N zbleIvw~NJUrBW)^%hjvPnOACyrCMq6V!2q)gqxn73pdxpEpy@4`Cf6LSgl1aZ(E8y zTIybMEly0HotzH0FZBh#)WaQ@f`MS@W;hsRZiYK=FA3IAJ=`@H?vAl4h0CSNayYE9 zOhhkLuU=UXXpa-n#8fsr9q!fe`waYkfj@93hCf&j56y*#R|;dZL0|um7v6ka?odju z*`|d%ptht6H|`{GCO422@r%2k=apKst|%$-T+_pkuE)Hi|9oa-lzRNrx#(W0Q$L@} z-&-wPx`LzBqc4Z!HBFWJ{oTKEvOfN)rgDG$&ncsq#>@P1KUdp-GXL=J`?+sVoQ&~A zUJs4`;Dsac`or^@!o~055AMeN>5nyK-A8@#HTtXV`t-%N8?T)o;lFomJoX1FYkI8Z z-FV%($hC^smw(&Oc6`Ba%$Kan&~f|tcKx($BJbwI&8N-x_aBIBd6)02mz!_2@!#hA z@@w-G-6S5H@6%6obiSxF^uO|3)myOs58vMpc(~@>&u}H5e*N_RXzZ8P{K4NY#`^=> zPT%$?xtBZau*%$}X+Cy^y8Y$OMbqtP(6I0F+x5F`zg@q(d~G|4&*!h}pWV-izF=q^ zoA2v)o1b9I-Nw!{SKp3bP+UIy2r4pq5O*^7`Ab|r|I#=c#Y^mQ*X%s>`Qz?icMc|g zyL|Ehb@}b|-F&yw_wV=g^V_wD-w8!R=hw9l*C+3@PuxO%e0#t7_=z3uPrLJ`k&nmC z^wari)E7N2-}6`F;%B3N8Xs?vx8qNEa;I_I=Ra*f!Eo1^BeBuB$Z`44Q}&bZJahfG z&s$gCor|Zf^oa!f{_{4;zl-nA{rbe-`1?le+x7G5+x_qIH)fwGsaKlX_%5E-Y%cHm zFLU=iPinuGm)Dx_?|gFO#%87DYV}%ky4h<@{FXM8n@nD7Qtzm(CXvLap4bfj~hMYT<#WQ#b682l9nJd1ZewCyyfr0DSt1dXuj$Mvnb} zeey#5gGP>Z38bRc>+Fl03ScFL!8u+z90|Gi3fklAAXQ0?6U7UC)~ck{#FwZ)b$@+ z{2>42g?)g(^gHoTH|jvX&?7J4%k$}*x2L|o5CO^6{Q0y^<=3B2tEJj<^xwct?!4!b z)avJA18*uL9{l>%mDf6>%*ZtxbO>4hPsRRdPZfl(hx-qUt8M=_W{+siAEHv(+U{q5 z7s-B#r;+YP#cYBU|FU+G?0lz@ve7ngh9tiMTqHYhR9r{*AbOC4WU4hA^!Cy=hq~AZ kN^!(NUpGBGXP%jnUcB4BGkZbUvq4YXc|DP~$ZOeu0rDOd5C8xG diff --git a/torchvision/models/detection/retinanet.py b/torchvision/models/detection/retinanet.py index a5c76c0b355..c6fe8856a01 100644 --- a/torchvision/models/detection/retinanet.py +++ b/torchvision/models/detection/retinanet.py @@ -176,7 +176,6 @@ def __init__(self, in_channels, num_anchors): torch.nn.init.zeros_(layer.bias) self.box_coder = det_utils.BoxCoder(weights=(1.0, 1.0, 1.0, 1.0)) - self._l1_loss = torch.nn.functional.l1_loss def compute_loss(self, targets, head_outputs, anchors, 
matched_idxs): # type: (List[Dict[str, Tensor]], Dict[str, Tensor], List[Tensor], List[Tensor]) -> Tensor @@ -199,7 +198,7 @@ def compute_loss(self, targets, head_outputs, anchors, matched_idxs): target_regression = self.box_coder.encode_single(matched_gt_boxes_per_image, anchors_per_image) # compute the loss - losses.append(self._l1_loss( + losses.append(torch.nn.functional.l1_loss( bbox_regression_per_image, target_regression, reduction='sum' @@ -456,15 +455,6 @@ def postprocess_detections(self, head_outputs, anchors, image_shapes): return detections - def _anchors_per_level(self, features: List[Tensor], HWA: int): - # recover level sizes - num_anchors_per_level = [x.size(2) * x.size(3) for x in features] - HW = 0 - for v in num_anchors_per_level: - HW += v - A = HWA // HW - return [hw * A for hw in num_anchors_per_level] - def forward(self, images, targets=None): # type: (List[Tensor], Optional[List[Dict[str, Tensor]]]) -> Tuple[Dict[str, Tensor], List[Dict[str, Tensor]]] """ @@ -542,7 +532,13 @@ def forward(self, images, targets=None): losses = self.compute_loss(targets, head_outputs, anchors) else: # recover level sizes - num_anchors_per_level = self._anchors_per_level(features, head_outputs['cls_logits'].size(1)) + num_anchors_per_level = [x.size(2) * x.size(3) for x in features] + HW = 0 + for v in num_anchors_per_level: + HW += v + HWA = head_outputs['cls_logits'].size(1) + A = HWA // HW + num_anchors_per_level = [hw * A for hw in num_anchors_per_level] # split outputs per level split_head_outputs: Dict[str, List[Tensor]] = {} diff --git a/torchvision/models/detection/ssd.py b/torchvision/models/detection/ssd.py index ddbf8af3a5d..a7643fd3a3c 100644 --- a/torchvision/models/detection/ssd.py +++ b/torchvision/models/detection/ssd.py @@ -1,10 +1,10 @@ -import math import torch import torch.nn.functional as F +import warnings from collections import OrderedDict from torch import nn, Tensor -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Tuple from . import _utils as det_utils from .anchor_utils import DBoxGenerator @@ -12,8 +12,7 @@ from .transform import GeneralizedRCNNTransform from .. 
import vgg, resnet from ..utils import load_state_dict_from_url - -from .retinanet import RetinaNet, RetinaNetHead, RetinaNetRegressionHead, _sum # TODO: Refactor to inherit properly +from ...ops import boxes as box_ops __all__ = ['SSD', 'SSDFeatureExtractor', 'ssd300_vgg16', 'ssd512_resnet50'] @@ -24,25 +23,46 @@ } -def _xavier_init(conv: nn.Module, bias_value: float = 0.0): +def _sum(x: List[Tensor]) -> Tensor: + res = x[0] + for i in x[1:]: + res = res + i + return res + + +def _xavier_init(conv: nn.Module): for layer in conv.modules(): if isinstance(layer, nn.Conv2d): torch.nn.init.xavier_uniform_(layer.weight) if layer.bias is not None: - torch.nn.init.constant_(layer.bias, bias_value) + torch.nn.init.constant_(layer.bias, 0.0) -class SSDHead(RetinaNetHead): +class SSDHead(nn.Module): def __init__(self, in_channels: List[int], num_anchors: List[int], num_classes: int, positive_fraction: float, box_coder: det_utils.BoxCoder): - nn.Module.__init__(self) + super().__init__() self.classification_head = SSDClassificationHead(in_channels, num_anchors, num_classes, positive_fraction) self.regression_head = SSDRegressionHead(in_channels, num_anchors, box_coder) + def compute_loss(self, targets: List[Dict[str, Tensor]], head_outputs: Dict[str, Tensor], anchors: List[Tensor], + matched_idxs: List[Tensor]) -> Dict[str, Tensor]: + return { + 'bbox_regression': self.regression_head.compute_loss(targets, head_outputs['bbox_regression'], anchors, + matched_idxs), + 'classification': self.classification_head.compute_loss(targets, head_outputs['cls_logits'], matched_idxs), + } + + def forward(self, x: List[Tensor]) -> Dict[str, Tensor]: + return { + 'bbox_regression': self.regression_head(x), + 'cls_logits': self.classification_head(x), + } + class SSDScoringHead(nn.Module): def __init__(self, module_list: nn.ModuleList, num_columns: int): - nn.Module.__init__(self) + super().__init__() self.module_list = module_list self.num_columns = num_columns @@ -80,54 +100,87 @@ def forward(self, x: List[Tensor]) -> Tensor: class SSDClassificationHead(SSDScoringHead): - def __init__(self, in_channels: List[int], num_anchors: List[int], num_classes: int, positive_fraction: float, - prior_probability: float = 0.01): + def __init__(self, in_channels: List[int], num_anchors: List[int], num_classes: int, positive_fraction: float): cls_logits = nn.ModuleList() for channels, anchors in zip(in_channels, num_anchors): cls_logits.append(nn.Conv2d(channels, num_classes * anchors, kernel_size=3, padding=1)) - _xavier_init(cls_logits, -math.log((1 - prior_probability) / prior_probability)) super().__init__(cls_logits, num_classes) self.neg_to_pos_ratio = (1.0 - positive_fraction) / positive_fraction - def compute_loss(self, targets: List[Dict[str, Tensor]], head_outputs: Dict[str, Tensor], - matched_idxs: List[Tensor]) -> Tensor: - losses = [] - - cls_logits = head_outputs['cls_logits'] - + def compute_loss(self, targets: List[Dict[str, Tensor]], cls_logits: Tensor, matched_idxs: List[Tensor]) -> Tensor: + # Match original targets with anchors + cls_targets = [] for targets_per_image, cls_logits_per_image, matched_idxs_per_image in zip(targets, cls_logits, matched_idxs): foreground_idxs_per_image = matched_idxs_per_image >= 0 - num_foreground = foreground_idxs_per_image.sum() gt_classes_target = torch.zeros((cls_logits_per_image.size(0), ), dtype=targets_per_image['labels'].dtype, device=targets_per_image['labels'].device) gt_classes_target[foreground_idxs_per_image] = \ 
targets_per_image['labels'][matched_idxs_per_image[foreground_idxs_per_image]] - classification_loss = F.cross_entropy(cls_logits_per_image, gt_classes_target, reduction='none') - # Hard Negative Sampling - background_idxs_per_image = torch.logical_not(foreground_idxs_per_image) - num_background = matched_idxs_per_image.size(0) - num_foreground - num_negative = torch.min((self.neg_to_pos_ratio * num_foreground).to(dtype=num_background.dtype), - num_background) + cls_targets.append(gt_classes_target) - foreground_loss = classification_loss[foreground_idxs_per_image] - background_loss = classification_loss[background_idxs_per_image].topk(num_negative, sorted=False)[0] + cls_targets = torch.stack(cls_targets) - losses.append((foreground_loss.sum() + background_loss.sum()) / max(1, num_foreground)) + # Calculate loss + num_classes = cls_logits.size(-1) + cls_loss = F.cross_entropy( + cls_logits.view(-1, num_classes), + cls_targets.view(-1), + reduction='none' + ).view(cls_targets.size()) - return _sum(losses) / len(targets) + # Hard Negative Sampling + foreground_idxs = cls_targets > 0 + num_negative = self.neg_to_pos_ratio * foreground_idxs.sum(1, keepdim=True) + num_negative[num_negative < self.neg_to_pos_ratio] = self.neg_to_pos_ratio + negative_loss = cls_loss.clone() + negative_loss[foreground_idxs] = -float('inf') # use -inf to detect positive values that creeped in the sample + values, idx = negative_loss.sort(1, descending=True) + background_idxs = torch.logical_and(idx.sort(1)[1] < num_negative, torch.isfinite(values)) + loss = (cls_loss[foreground_idxs].sum() + cls_loss[background_idxs].sum()) / max(1, foreground_idxs.sum()) + return loss + + +class SSDRegressionHead(SSDScoringHead): + + __annotations__ = { + 'box_coder': det_utils.BoxCoder, + } -class SSDRegressionHead(SSDScoringHead, RetinaNetRegressionHead): # TODO: Refactor to avoid multiple inheritance def __init__(self, in_channels: List[int], num_anchors: List[int], box_coder: det_utils.BoxCoder): bbox_reg = nn.ModuleList() for channels, anchors in zip(in_channels, num_anchors): bbox_reg.append(nn.Conv2d(channels, 4 * anchors, kernel_size=3, padding=1)) - _xavier_init(bbox_reg) - SSDScoringHead.__init__(self, bbox_reg, 4) + super().__init__(bbox_reg, 4) self.box_coder = box_coder - self._l1_loss = torch.nn.functional.smooth_l1_loss # TODO: Discuss/refactor this workaround + + def compute_loss(self, targets: List[Dict[str, Tensor]], bbox_regression: Tensor, anchors: List[Tensor], + matched_idxs: List[Tensor]) -> Tensor: + losses = [] + for targets_per_image, bbox_regression_per_image, anchors_per_image, matched_idxs_per_image in \ + zip(targets, bbox_regression, anchors, matched_idxs): + # determine only the foreground indices, ignore the rest + foreground_idxs_per_image = torch.where(matched_idxs_per_image >= 0)[0] + num_foreground = foreground_idxs_per_image.numel() + + # select only the foreground boxes + matched_gt_boxes_per_image = targets_per_image['boxes'][matched_idxs_per_image[foreground_idxs_per_image]] + bbox_regression_per_image = bbox_regression_per_image[foreground_idxs_per_image, :] + anchors_per_image = anchors_per_image[foreground_idxs_per_image, :] + + # compute the regression targets + target_regression = self.box_coder.encode_single(matched_gt_boxes_per_image, anchors_per_image) + + # compute the loss + losses.append(torch.nn.functional.smooth_l1_loss( + bbox_regression_per_image, + target_regression, + reduction='sum' + ) / max(1, num_foreground)) + + return _sum(losses) / max(1, len(targets)) class 
SSDFeatureExtractor(nn.Module): @@ -136,7 +189,12 @@ def __init__(self, aspect_ratios: List[List[int]]): self.aspect_ratios = aspect_ratios -class SSD(RetinaNet): +class SSD(nn.Module): + __annotations__ = { + 'box_coder': det_utils.BoxCoder, + 'proposal_matcher': det_utils.Matcher, + } + def __init__(self, backbone: SSDFeatureExtractor, size: int, num_classes: int, image_mean: Optional[List[float]] = None, image_std: Optional[List[float]] = None, score_thresh: float = 0.01, @@ -145,7 +203,7 @@ def __init__(self, backbone: SSDFeatureExtractor, size: int, num_classes: int, iou_thresh: float = 0.5, topk_candidates: int = 400, positive_fraction: float = 0.25): - nn.Module.__init__(self) + super().__init__() # Use dummy data to retrieve the feature map sizes to avoid hard-coding their values device = next(backbone.parameters()).device @@ -183,14 +241,142 @@ def __init__(self, backbone: SSDFeatureExtractor, size: int, num_classes: int, # used only on torchscript mode self._has_warned = False - def _anchors_per_level(self, features: List[Tensor], HWA: int): - # TODO: Discuss/refactor this workaround - num_anchors_per_level = [x.size(2) * x.size(3) * anchors for x, anchors in zip(features, self.num_anchors)] - HW = 0 - for v in num_anchors_per_level: - HW += v - A = HWA // HW - return [hw * A for hw in num_anchors_per_level] + @torch.jit.unused + def eager_outputs(self, losses: Dict[str, Tensor], + detections: List[Dict[str, Tensor]]) -> Tuple[Dict[str, Tensor], List[Dict[str, Tensor]]]: + if self.training: + return losses + + return detections + + def forward(self, images: List[Tensor], + targets: Optional[List[Dict[str, Tensor]]] = None) -> Tuple[Dict[str, Tensor], List[Dict[str, Tensor]]]: + if self.training and targets is None: + raise ValueError("In training mode, targets should be passed") + + if self.training: + assert targets is not None + for target in targets: + boxes = target["boxes"] + if isinstance(boxes, torch.Tensor): + if len(boxes.shape) != 2 or boxes.shape[-1] != 4: + raise ValueError("Expected target boxes to be a tensor" + "of shape [N, 4], got {:}.".format( + boxes.shape)) + else: + raise ValueError("Expected target boxes to be of type " + "Tensor, got {:}.".format(type(boxes))) + + # get the original image sizes + original_image_sizes: List[Tuple[int, int]] = [] + for img in images: + val = img.shape[-2:] + assert len(val) == 2 + original_image_sizes.append((val[0], val[1])) + + # transform the input + images, targets = self.transform(images, targets) + + # Check for degenerate boxes + if targets is not None: + for target_idx, target in enumerate(targets): + boxes = target["boxes"] + degenerate_boxes = boxes[:, 2:] <= boxes[:, :2] + if degenerate_boxes.any(): + # print the first degenerate box + bb_idx = torch.where(degenerate_boxes.any(dim=1))[0][0] + degen_bb: List[float] = boxes[bb_idx].tolist() + raise ValueError("All bounding boxes should have positive height and width." + " Found invalid box {} for target at index {}." 
+ .format(degen_bb, target_idx)) + + # get the features from the backbone + features = self.backbone(images.tensors) + if isinstance(features, torch.Tensor): + features = OrderedDict([('0', features)]) + + features = list(features.values()) + + # compute the retinanet heads outputs using the features + head_outputs = self.head(features) + + # create the set of anchors + anchors = self.anchor_generator(images, features) + + losses = {} + detections: List[Dict[str, Tensor]] = [] + if self.training: + assert targets is not None + + matched_idxs = [] + for anchors_per_image, targets_per_image in zip(anchors, targets): + if targets_per_image['boxes'].numel() == 0: + matched_idxs.append(torch.full((anchors_per_image.size(0),), -1, dtype=torch.int64, + device=anchors_per_image.device)) + continue + + match_quality_matrix = box_ops.box_iou(targets_per_image['boxes'], anchors_per_image) + matched_idxs.append(self.proposal_matcher(match_quality_matrix)) + + losses = self.head.compute_loss(targets, head_outputs, anchors, matched_idxs) + else: + detections = self.postprocess_detections(head_outputs, anchors, images.image_sizes) + detections = self.transform.postprocess(detections, images.image_sizes, original_image_sizes) + + if torch.jit.is_scripting(): + if not self._has_warned: + warnings.warn("RetinaNet always returns a (Losses, Detections) tuple in scripting") + self._has_warned = True + return losses, detections + return self.eager_outputs(losses, detections) + + def postprocess_detections(self, head_outputs: Dict[str, Tensor], image_anchors: List[Tensor], + image_shapes: List[Tuple[int, int]]) -> List[Dict[str, Tensor]]: + bbox_regression = head_outputs['bbox_regression'] + pred_scores = F.softmax(head_outputs['cls_logits'], dim=-1) + + num_classes = pred_scores.size(-1) + device = pred_scores.device + + detections: List[Dict[str, Tensor]] = [] + + for boxes, scores, anchors, image_shape in zip(bbox_regression, pred_scores, image_anchors, image_shapes): + boxes = self.box_coder.decode_single(boxes, anchors) + boxes = box_ops.clip_boxes_to_image(boxes, image_shape) + + image_boxes = [] + image_scores = [] + image_labels = [] + for label in range(1, num_classes): + score = scores[:, label] + + keep_idxs = score > self.score_thresh + score = score[keep_idxs] + box = boxes[keep_idxs] + + # keep only topk scoring predictions + num_topk = min(self.topk_candidates, score.size(0)) + score, idxs = score.topk(num_topk) + box = box[idxs] + + image_boxes.append(box) + image_scores.append(score) + image_labels.append(torch.full_like(score, fill_value=label, dtype=torch.int64, device=device)) + + image_boxes = torch.cat(image_boxes, dim=0) + image_scores = torch.cat(image_scores, dim=0) + image_labels = torch.cat(image_labels, dim=0) + + # non-maximum suppression + keep = box_ops.batched_nms(image_boxes, image_scores, image_labels, self.nms_thresh) + keep = keep[:self.detections_per_img] + + detections.append({ + 'boxes': image_boxes[keep], + 'scores': image_scores[keep], + 'labels': image_labels[keep], + }) + return detections class SSDFeatureExtractorVGG(SSDFeatureExtractor): From dc5b7d5f74549fffdf726526546d0d6f69362768 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Fri, 16 Apr 2021 23:01:43 +0100 Subject: [PATCH 53/92] SSD should fix the input size. 
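
As background for the change below: a minimal sketch of the two resizing modes the
transform now has to support, following the patch's (width, height) convention for
fixed_size. The helper name is an assumption for illustration only; it is not the
library function.

    import torch
    import torch.nn.functional as F
    from typing import Optional, Tuple

    def resize_image(image: torch.Tensor, min_size: float, max_size: float,
                     fixed_size: Optional[Tuple[int, int]] = None) -> torch.Tensor:
        # image is a single CHW tensor
        if fixed_size is not None:
            # SSD-style: always interpolate to the exact training resolution, e.g. 300x300
            return F.interpolate(image[None], size=[fixed_size[1], fixed_size[0]],
                                 mode='bilinear', align_corners=False)[0]
        # Faster R-CNN / RetinaNet style: scale the short side up to min_size without
        # letting the long side exceed max_size
        im_shape = torch.tensor(image.shape[-2:])
        scale = torch.min(min_size / im_shape.min().to(torch.float32),
                          max_size / im_shape.max().to(torch.float32))
        return F.interpolate(image[None], scale_factor=scale.item(), mode='bilinear',
                             recompute_scale_factor=True, align_corners=False)[0]

When fixed_size is given, the min_size/max_size pair is effectively ignored, which is
why the SSD constructor only passes min(size) and max(size) to satisfy the existing
transform signature.
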
--- torchvision/models/detection/ssd.py | 12 +++---- torchvision/models/detection/transform.py | 42 +++++++++++++---------- 2 files changed, 30 insertions(+), 24 deletions(-) diff --git a/torchvision/models/detection/ssd.py b/torchvision/models/detection/ssd.py index a7643fd3a3c..321468f063e 100644 --- a/torchvision/models/detection/ssd.py +++ b/torchvision/models/detection/ssd.py @@ -195,7 +195,7 @@ class SSD(nn.Module): 'proposal_matcher': det_utils.Matcher, } - def __init__(self, backbone: SSDFeatureExtractor, size: int, num_classes: int, + def __init__(self, backbone: SSDFeatureExtractor, size: Tuple[int, int], num_classes: int, image_mean: Optional[List[float]] = None, image_std: Optional[List[float]] = None, score_thresh: float = 0.01, nms_thresh: float = 0.45, @@ -207,7 +207,7 @@ def __init__(self, backbone: SSDFeatureExtractor, size: int, num_classes: int, # Use dummy data to retrieve the feature map sizes to avoid hard-coding their values device = next(backbone.parameters()).device - tmp_img = torch.zeros((1, 3, size, size), device=device) + tmp_img = torch.zeros((1, 3, size[1], size[0]), device=device) tmp_sizes = [x.size() for x in backbone(tmp_img).values()] out_channels = [x[1] for x in tmp_sizes] @@ -229,9 +229,9 @@ def __init__(self, backbone: SSDFeatureExtractor, size: int, num_classes: int, image_mean = [0.485, 0.456, 0.406] if image_std is None: image_std = [0.229, 0.224, 0.225] - self.transform = GeneralizedRCNNTransform(size, size, image_mean, image_std, + self.transform = GeneralizedRCNNTransform(min(size), max(size), image_mean, image_std, # TODO: Discuss/refactor these workarounds - size_divisible=1, exceed_max_size=True) + size_divisible=1, fixed_size=size) self.score_thresh = score_thresh self.nms_thresh = nms_thresh @@ -492,7 +492,7 @@ def ssd300_vgg16(pretrained: bool = False, progress: bool = True, num_classes: i pretrained_backbone = False backbone = _vgg_extractor("vgg16", False, pretrained_backbone, trainable_backbone_layers) - model = SSD(backbone, 300, num_classes, **kwargs) + model = SSD(backbone, (300, 300), num_classes, **kwargs) if pretrained: weights_name = 'ssd300_vgg16_coco' if model_urls.get(weights_name, None) is None: @@ -604,7 +604,7 @@ def ssd512_resnet50(pretrained: bool = False, progress: bool = True, num_classes pretrained_backbone = False backbone = _resnet_extractor("resnet50", pretrained_backbone, trainable_backbone_layers) - model = SSD(backbone, 512, num_classes, **kwargs) + model = SSD(backbone, (512, 512), num_classes, **kwargs) if pretrained: weights_name = 'ssd512_resnet50_coco' if model_urls.get(weights_name, None) is None: diff --git a/torchvision/models/detection/transform.py b/torchvision/models/detection/transform.py index 0b616041e39..3218380699b 100644 --- a/torchvision/models/detection/transform.py +++ b/torchvision/models/detection/transform.py @@ -23,34 +23,40 @@ def _fake_cast_onnx(v): return v -def _resize_image_and_masks(image, self_min_size, self_max_size, target, exceed_max_size): - # type: (Tensor, float, float, Optional[Dict[str, Tensor]], bool) -> Tuple[Tensor, Optional[Dict[str, Tensor]]] +def _resize_image_and_masks(image: Tensor, self_min_size: float, self_max_size: float, + target: Optional[Dict[str, Tensor]], + fixed_size: Optional[Tuple[int, int]]) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: if torchvision._is_tracing(): im_shape = _get_shape_onnx(image) else: im_shape = torch.tensor(image.shape[-2:]) - min_size = torch.min(im_shape).to(dtype=torch.float32) - max_size = 
torch.max(im_shape).to(dtype=torch.float32) - scale = self_min_size / min_size - if not exceed_max_size: - scale = torch.min(scale, self_max_size / max_size) - - if torchvision._is_tracing(): - scale_factor = _fake_cast_onnx(scale) + size: Optional[List[int]] = None + scale_factor: Optional[float] = None + recompute_scale_factor: Optional[bool] = None + if fixed_size is not None: + size = [fixed_size[1], fixed_size[0]] else: - scale_factor = scale.item() + min_size = torch.min(im_shape).to(dtype=torch.float32) + max_size = torch.max(im_shape).to(dtype=torch.float32) + scale = torch.min(self_min_size / min_size, self_max_size / max_size) + + if torchvision._is_tracing(): + scale_factor = _fake_cast_onnx(scale) + else: + scale_factor = scale.item() + recompute_scale_factor = True - image = torch.nn.functional.interpolate( - image[None], scale_factor=scale_factor, mode='bilinear', recompute_scale_factor=True, - align_corners=False)[0] + image = torch.nn.functional.interpolate(image[None], size=size, scale_factor=scale_factor, mode='bilinear', + recompute_scale_factor=recompute_scale_factor, align_corners=False)[0] if target is None: return image, target if "masks" in target: mask = target["masks"] - mask = F.interpolate(mask[:, None].float(), scale_factor=scale_factor, recompute_scale_factor=True)[:, 0].byte() + mask = F.interpolate(mask[:, None].float(), size=size, scale_factor=scale_factor, + recompute_scale_factor=recompute_scale_factor)[:, 0].byte() target["masks"] = mask return image, target @@ -67,7 +73,7 @@ class GeneralizedRCNNTransform(nn.Module): It returns a ImageList for the inputs, and a List[Dict[Tensor]] for the targets """ - def __init__(self, min_size, max_size, image_mean, image_std, size_divisible=32, exceed_max_size=False): + def __init__(self, min_size, max_size, image_mean, image_std, size_divisible=32, fixed_size=None): super(GeneralizedRCNNTransform, self).__init__() if not isinstance(min_size, (list, tuple)): min_size = (min_size,) @@ -76,7 +82,7 @@ def __init__(self, min_size, max_size, image_mean, image_std, size_divisible=32, self.image_mean = image_mean self.image_std = image_std self.size_divisible = size_divisible - self.exceed_max_size = exceed_max_size + self.fixed_size = fixed_size def forward(self, images, # type: List[Tensor] @@ -148,7 +154,7 @@ def resize(self, image, target): else: # FIXME assume for now that testing uses the largest scale size = float(self.min_size[-1]) - image, target = _resize_image_and_masks(image, size, float(self.max_size), target, self.exceed_max_size) + image, target = _resize_image_and_masks(image, size, float(self.max_size), target, self.fixed_size) if target is None: return image, target From 82f8ddbac1425e88b1f28ed238353f8507eef5d6 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Sat, 17 Apr 2021 01:54:26 +0100 Subject: [PATCH 54/92] Fixing messages and comments. 
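
One detail from the SSD constructor in the previous patch worth spelling out: the
feature-map channel counts are no longer hard-coded but probed with a dummy forward
pass. A self-contained sketch of that trick follows; the function name and the
no_grad guard are additions for illustration.

    import torch
    from torch import nn

    def probe_out_channels(backbone: nn.Module, size=(300, 300)):
        # Push a zero image of the training resolution through the backbone and
        # read off the channel dimension of every feature map it returns.
        device = next(backbone.parameters()).device
        tmp_img = torch.zeros((1, 3, size[1], size[0]), device=device)
        with torch.no_grad():
            tmp_sizes = [x.size() for x in backbone(tmp_img).values()]
        return [s[1] for s in tmp_sizes]

The resulting list feeds SSDHead, so any SSDFeatureExtractor that returns an
OrderedDict of feature maps works without changes to the head.
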
--- torchvision/models/detection/ssd.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torchvision/models/detection/ssd.py b/torchvision/models/detection/ssd.py index 321468f063e..48c6c75b25e 100644 --- a/torchvision/models/detection/ssd.py +++ b/torchvision/models/detection/ssd.py @@ -297,7 +297,7 @@ def forward(self, images: List[Tensor], features = list(features.values()) - # compute the retinanet heads outputs using the features + # compute the ssd heads outputs using the features head_outputs = self.head(features) # create the set of anchors @@ -325,7 +325,7 @@ def forward(self, images: List[Tensor], if torch.jit.is_scripting(): if not self._has_warned: - warnings.warn("RetinaNet always returns a (Losses, Detections) tuple in scripting") + warnings.warn("SSD always returns a (Losses, Detections) tuple in scripting") self._has_warned = True return losses, detections return self.eager_outputs(losses, detections) From 2cbd58d3c09458f3cf7562e673eb6f2a90822998 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Sat, 17 Apr 2021 15:52:22 +0100 Subject: [PATCH 55/92] Silently ignoring exception if test-only. --- references/detection/train.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/references/detection/train.py b/references/detection/train.py index 83fad36d2cc..491c4c295b7 100644 --- a/references/detection/train.py +++ b/references/detection/train.py @@ -113,9 +113,15 @@ def main(args): if args.resume: checkpoint = torch.load(args.resume, map_location='cpu') model_without_ddp.load_state_dict(checkpoint['model']) - optimizer.load_state_dict(checkpoint['optimizer']) - lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) - args.start_epoch = checkpoint['epoch'] + 1 + try: + optimizer.load_state_dict(checkpoint['optimizer']) + lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) + args.start_epoch = checkpoint['epoch'] + 1 + except Exception as e: + if args.test_only: + pass + else: + raise e if args.test_only: evaluate(model, data_loader_test, device=device) From 52940d484dba784533af21e836953031185660c4 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Sun, 18 Apr 2021 18:33:11 +0100 Subject: [PATCH 56/92] Update comments. 
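
Unrelated to the comment tweaks here, but the training-script change in the previous
patch is easy to miss. Condensed into a sketch (the function wrapper is illustrative;
the names follow references/detection/train.py), the resume logic now degrades
gracefully when a checkpoint is only being evaluated:

    import torch

    def load_checkpoint(path, model, optimizer, lr_scheduler, args):
        checkpoint = torch.load(path, map_location='cpu')
        model.load_state_dict(checkpoint['model'])
        try:
            # optimizer/scheduler state is optional when we only evaluate the weights
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1
        except Exception:
            if not args.test_only:
                raise

A run started with --test-only and --resume therefore no longer aborts on checkpoints
that were saved without training state.
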
--- torchvision/models/detection/ssd.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/torchvision/models/detection/ssd.py b/torchvision/models/detection/ssd.py index 48c6c75b25e..2ebe9c486bb 100644 --- a/torchvision/models/detection/ssd.py +++ b/torchvision/models/detection/ssd.py @@ -104,11 +104,12 @@ def __init__(self, in_channels: List[int], num_anchors: List[int], num_classes: cls_logits = nn.ModuleList() for channels, anchors in zip(in_channels, num_anchors): cls_logits.append(nn.Conv2d(channels, num_classes * anchors, kernel_size=3, padding=1)) + # _xavier_init(cls_logits) super().__init__(cls_logits, num_classes) self.neg_to_pos_ratio = (1.0 - positive_fraction) / positive_fraction def compute_loss(self, targets: List[Dict[str, Tensor]], cls_logits: Tensor, matched_idxs: List[Tensor]) -> Tensor: - # Match original targets with anchors + # Match original targets with default boxes cls_targets = [] for targets_per_image, cls_logits_per_image, matched_idxs_per_image in zip(targets, cls_logits, matched_idxs): foreground_idxs_per_image = matched_idxs_per_image >= 0 @@ -153,6 +154,7 @@ def __init__(self, in_channels: List[int], num_anchors: List[int], box_coder: de bbox_reg = nn.ModuleList() for channels, anchors in zip(in_channels, num_anchors): bbox_reg.append(nn.Conv2d(channels, 4 * anchors, kernel_size=3, padding=1)) + # _xavier_init(bbox_reg) super().__init__(bbox_reg, 4) self.box_coder = box_coder @@ -283,7 +285,6 @@ def forward(self, images: List[Tensor], boxes = target["boxes"] degenerate_boxes = boxes[:, 2:] <= boxes[:, :2] if degenerate_boxes.any(): - # print the first degenerate box bb_idx = torch.where(degenerate_boxes.any(dim=1))[0][0] degen_bb: List[float] = boxes[bb_idx].tolist() raise ValueError("All bounding boxes should have positive height and width." From db432f637810a26570ab2c82c923a9b672332af9 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Mon, 19 Apr 2021 12:12:53 +0100 Subject: [PATCH 57/92] Update regression loss. --- torchvision/models/detection/ssd.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/torchvision/models/detection/ssd.py b/torchvision/models/detection/ssd.py index 2ebe9c486bb..1a69c9b3ec4 100644 --- a/torchvision/models/detection/ssd.py +++ b/torchvision/models/detection/ssd.py @@ -160,6 +160,7 @@ def __init__(self, in_channels: List[int], num_anchors: List[int], box_coder: de def compute_loss(self, targets: List[Dict[str, Tensor]], bbox_regression: Tensor, anchors: List[Tensor], matched_idxs: List[Tensor]) -> Tensor: + N = 0 losses = [] for targets_per_image, bbox_regression_per_image, anchors_per_image, matched_idxs_per_image in \ zip(targets, bbox_regression, anchors, matched_idxs): @@ -176,13 +177,14 @@ def compute_loss(self, targets: List[Dict[str, Tensor]], bbox_regression: Tensor target_regression = self.box_coder.encode_single(matched_gt_boxes_per_image, anchors_per_image) # compute the loss + N += num_foreground losses.append(torch.nn.functional.smooth_l1_loss( bbox_regression_per_image, target_regression, reduction='sum' - ) / max(1, num_foreground)) + )) - return _sum(losses) / max(1, len(targets)) + return _sum(losses) / max(1, N) class SSDFeatureExtractor(nn.Module): From ff6ba4a5e077c085b640592c646d1fb347edb140 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Thu, 22 Apr 2021 10:19:23 +0100 Subject: [PATCH 58/92] Restore Xavier init everywhere, update the negative sampling method, change the clipping approach. 
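
Before the diff, a sketch of what "change the clipping approach" amounts to: instead
of clamping each width/height pair as it is generated, the default boxes stay in
normalized (cx, cy, w, h) form, are clamped to the unit square in one shot, and only
then converted to corner coordinates and scaled to pixels. The helper below is
illustrative; generation of the wh pairs is omitted.

    import torch

    def scale_default_boxes(default_boxes: torch.Tensor, image_size, clip: bool = True) -> torch.Tensor:
        # default_boxes: (N, 4) tensor of normalized (cx, cy, w, h) rows
        if clip:
            default_boxes = default_boxes.clamp(min=0, max=1)
        # convert to (x1, y1, x2, y2) corner format
        boxes = torch.cat([default_boxes[:, :2] - 0.5 * default_boxes[:, 2:],
                           default_boxes[:, :2] + 0.5 * default_boxes[:, 2:]], dim=-1)
        # image_size is (height, width): x coordinates scale by width, y by height
        boxes[:, 0::2] *= image_size[1]
        boxes[:, 1::2] *= image_size[0]
        return boxes
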
--- torchvision/models/detection/anchor_utils.py | 19 +++++++++++-------- torchvision/models/detection/ssd.py | 9 +++++---- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/torchvision/models/detection/anchor_utils.py b/torchvision/models/detection/anchor_utils.py index f50fd0ea336..bb80a9b8b54 100644 --- a/torchvision/models/detection/anchor_utils.py +++ b/torchvision/models/detection/anchor_utils.py @@ -163,9 +163,10 @@ def forward(self, image_list: ImageList, feature_maps: List[Tensor]) -> List[Ten class DBoxGenerator(nn.Module): def __init__(self, aspect_ratios: List[List[int]], min_ratio: float = 0.15, max_ratio: float = 0.9, - clip: bool = False): + clip: bool = True): super().__init__() self.aspect_ratios = aspect_ratios + self.clip = clip num_outputs = len(aspect_ratios) # Estimation of default boxes scales @@ -180,18 +181,17 @@ def __init__(self, aspect_ratios: List[List[int]], min_ratio: float = 0.15, max_ self.scales = [c / 100 for c in centiles] self._wh_pairs = [] - clamp01 = (lambda x: max(min(x, 1.0), 0.0)) if clip else (lambda x: x) for k in range(num_outputs): # Adding the 2 default width-height pairs for aspect ratio 1 and scale s'k - s_k = clamp01(self.scales[k]) - s_prime_k = clamp01(math.sqrt(self.scales[k] * self.scales[k + 1])) + s_k = self.scales[k] + s_prime_k = math.sqrt(self.scales[k] * self.scales[k + 1]) wh_pairs = [(s_k, s_k), (s_prime_k, s_prime_k)] # Adding 2 pairs for each aspect ratio of the feature map k for ar in self.aspect_ratios[k]: sq_ar = math.sqrt(ar) - w = clamp01(self.scales[k] * sq_ar) - h = clamp01(self.scales[k] / sq_ar) + w = self.scales[k] * sq_ar + h = self.scales[k] / sq_ar wh_pairs.extend([(w, h), (h, w)]) self._wh_pairs.append(wh_pairs) @@ -216,12 +216,15 @@ def forward(self, image_list: ImageList, feature_maps: List[Tensor]) -> List[Ten cy = (j + 0.5) / f_k[0] for i in range(f_k[1]): cx = (i + 0.5) / f_k[1] - default_boxes.extend([[cx - 0.5 * w, cy - 0.5 * h, cx + 0.5 * w, cy + 0.5 * h] - for w, h in self._wh_pairs[k]]) + default_boxes.extend([[cx, cy, w, h] for w, h in self._wh_pairs[k]]) dboxes = [] for _ in image_list.image_sizes: dboxes_in_image = torch.tensor(default_boxes, dtype=dtype, device=device) + if self.clip: + dboxes_in_image.clamp_(min=0, max=1) + dboxes_in_image = torch.cat([dboxes_in_image[:, :2] - 0.5 * dboxes_in_image[:, 2:], + dboxes_in_image[:, :2] + 0.5 * dboxes_in_image[:, 2:]], -1) dboxes_in_image[:, 0::2] *= image_size[1] dboxes_in_image[:, 1::2] *= image_size[0] dboxes.append(dboxes_in_image) diff --git a/torchvision/models/detection/ssd.py b/torchvision/models/detection/ssd.py index 1a69c9b3ec4..36a2d8fcf40 100644 --- a/torchvision/models/detection/ssd.py +++ b/torchvision/models/detection/ssd.py @@ -104,7 +104,7 @@ def __init__(self, in_channels: List[int], num_anchors: List[int], num_classes: cls_logits = nn.ModuleList() for channels, anchors in zip(in_channels, num_anchors): cls_logits.append(nn.Conv2d(channels, num_classes * anchors, kernel_size=3, padding=1)) - # _xavier_init(cls_logits) + _xavier_init(cls_logits) super().__init__(cls_logits, num_classes) self.neg_to_pos_ratio = (1.0 - positive_fraction) / positive_fraction @@ -134,11 +134,12 @@ def compute_loss(self, targets: List[Dict[str, Tensor]], cls_logits: Tensor, mat # Hard Negative Sampling foreground_idxs = cls_targets > 0 num_negative = self.neg_to_pos_ratio * foreground_idxs.sum(1, keepdim=True) - num_negative[num_negative < self.neg_to_pos_ratio] = self.neg_to_pos_ratio + # num_negative[num_negative < self.neg_to_pos_ratio] = 
self.neg_to_pos_ratio negative_loss = cls_loss.clone() negative_loss[foreground_idxs] = -float('inf') # use -inf to detect positive values that creeped in the sample values, idx = negative_loss.sort(1, descending=True) - background_idxs = torch.logical_and(idx.sort(1)[1] < num_negative, torch.isfinite(values)) + # background_idxs = torch.logical_and(idx.sort(1)[1] < num_negative, torch.isfinite(values)) + background_idxs = idx.sort(1)[1] < num_negative loss = (cls_loss[foreground_idxs].sum() + cls_loss[background_idxs].sum()) / max(1, foreground_idxs.sum()) return loss @@ -154,7 +155,7 @@ def __init__(self, in_channels: List[int], num_anchors: List[int], box_coder: de bbox_reg = nn.ModuleList() for channels, anchors in zip(in_channels, num_anchors): bbox_reg.append(nn.Conv2d(channels, 4 * anchors, kernel_size=3, padding=1)) - # _xavier_init(bbox_reg) + _xavier_init(bbox_reg) super().__init__(bbox_reg, 4) self.box_coder = box_coder From 88bd38fedbd9ce41647c7d9188df833f804d566d Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Fri, 23 Apr 2021 20:15:00 +0100 Subject: [PATCH 59/92] Fixing tests. --- .../ModelTester.test_ssd300_vgg16_expect.pkl | Bin 6925 -> 6925 bytes ...odelTester.test_ssd512_resnet50_expect.pkl | Bin 6925 -> 6925 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/test/expect/ModelTester.test_ssd300_vgg16_expect.pkl b/test/expect/ModelTester.test_ssd300_vgg16_expect.pkl index 4c79e50d862797e661a02d688a275f19410748db..9bb57cb49737eb985ec470ed1969595091c97d0c 100644 GIT binary patch literal 6925 zcmd5>X;>83wgs645m6Bpfe=9jL_kEu=}@(oq7jinK|z$Ug=Xlc8Js6TKw1!yD5yvb zmpC97P{dgRMid0W83i?J)EH;IM!gQn>u%FYf)!uvURi{qXsZ*zp zr<?At8Zw0t z@j^koq)TXmSe9rTn3y1pj@Jo{6NDrRqr(Cd1fqCxT;O~M9f_Ky#~pw0CkIK5Ddsv$Lb4lcPj)mYU2P8k;E5ayL^;kaYEO zvv4zTQ%{pGhlYxyqXnS}La``b$0IIG5GM$mAPh~=k#w{2mgpu(x_e9X zW@*YB5Gsz7HMvKGtVI#(5gpGsV)!TrXNi7FKHN#t|D(LoZ}J1=^2Qkod6NW* zskh{_#9&)bH?6KAC7FAKswyh->e=ayGgj&T$&Iu7qXQL}<4W3p+{BFZ+wA!0I%e_a zFl#T_&35iAVIu^)SdX#0*qZ1ate4vk))HCFW}hiyPlW|cHMoG)W^G}%Z#J>mlsp#r zU;`5*E}%-~TL%lRRbonN z-%V$GyQH(@i_)0ptTg6(H%+~rQGgE_PHb+01#ePU) zq3ua*(1s+oB{7Lz^G#wK#wM{K$0XKlk;H_hE16x=N;WTkCCd+9$ug@}u$qG9tkr)x z^RQXQ&X_G@)!mk{u+vM~?k|_H?V`o(^^HZWwQ&(URPeK7kEfp1?x0;#v8qcy{hm9J}~U9Q!OWjtvsTvD?mZ zEc`$$+q5l~akFEY&_9+HPmN{SU1Qn#$71%=H8CqVCuVaNh}nSYV)p2%h&{V0VvDbc z*s)p>>%LjU&P)}tt4zeqbVcltx`_FxirApXF|49ChRs+L!#q~UF!#6^c9UY5nr#fb zIwXcQo5e8hVl*=^iDpIq(X6Me*1V2l*$q)_eMuC%J}ipe=oZDcJcwj}Z;oWsaw1t_ z_ekd0AY|U(2-)j=A=BF+WV_;oESwjzVe;Bf$aeG+G6#Jjb1aBp4r?OVk%1H^KIuOoWWYr`noPF_T9=p9{9@~>RkLicaV_VM&*p+4hyL3#z?rs*aDnA+e2w2)= z0qfFJz)qYBW66nOETAZqjdKrWreB1z_F{0ZAf`JZh(+53v28B`+4dg-+4nVp>~DJmnZcSs*0wy5 zt)Caj}=R6hPf;C`eWGr{toRedk4wO{}+_MTIJ>cx;-6r zJKDu^ea*6?}*BHM^y4U^Z%!D zrC!JUf7e#>I$~$KO1~02%lmiZ9eEvXrM}YN8Fu6;?T)C-SNfH<5aHwOIT!Qi_JEkDR9B2L70g@4qXFCDQ6vxlqRM`4~GVZ(##qk%TP7r@_ z60S!}(Y-N7p%)b(MRUY+UY_gYBqUvHDSG^Kks@ZGmx+SspwPooq&Zb8H2Pk4Lf#T7 z+RcL$oXv$9`1#h)ikSbE{_xxVjMm3W@wmxM5#MPm#h5sM#j%sl0!-Q`g}&w?1;23F zbGo^5E{yD@@V%9y;E4D1gE&mq=18WndFyJjRF&d%tCqqp9d?k*S`kL3-#7A6Z=|$8 zNQ#v1Llv=}3n$?vq$n<)q0nd1f1@*|Qt(%dKE`P@_AixUcXp)0ZoZrXUwPlrC2P|Rq*w8SfS==JDJiM?v%YMHg+xLGtDe9b*mJX&!;Ikq4gH<)W6Bi ziFHQ2`5r#&;RTx8JqAjRRSo zpyZxXJaE55cQSI2->OaHAEqK=b}lu<<)WFd=2C{FV4ijp|E+fhUY^{^W&Xnjswrpq zw!Lof)hg$Vwx6Sm+Rgkmho8vX;T^dzYUdh#^l`wdiccQ80JrA0P@d*mm@ies_B%r$ zemeuAcT+JaRDyT8#>mgO%;m(|!(P9PzkBu~9hH$SDJh`O}+(6+9Rsl4$Uit8_iu;*X|zsf-G7aY1cS8#exW_Xa>z#o4*6p6*t z;oi`~nOH}_<@+6crYyE=V=B^YC!@!PT%6C6LiB8+;`xtr8%9kI1)MS8oA;aYh_C;( zl0Uz-oO`4z#=Nf^`TIXF#=;+n8b^Fa_g%}V$|{Zc4=%{iGsHWOFobU#4$+`*xr-xG 
z=+W)d{EX3?seFkShF_VB*TN8J&sry3}FMW zvRfS`Z*S5r=wVb|A6$50jmv(|X>L;u{~^Z&d)9c-jW7@7&7Xm;b_tm1D1rGu{GgNe zJspdQLVnE~vTh$lx@Fz*Zd)OF-?WCU^#Shf?p(6*XykvfEFtRw)1WumnD(Aa#L?_@ z+CNf)SK=Vj`*kHM+cL;JeH)d($fgxbA5x%J9o_dDi-T*EQRwW3bQ5T2zo1?O|md}?sVsl|{+B`=MgWh~aZI657>-&c&;F=3!D;|>W*8Ivh(=TwEUbZw|9W(MR@Rj4-d-Zawe8@6b= zsEUZPuec$V9@zOq6MxQoHq46#g54O5p<73wB6I}w)9v8&OqCX=u~bWB=k(DINtku5j@G;)+)nzIE)`CPy6;trUYiKB1Gd<=B?|%PreNKM9=IfY%fEDR zLPgJ5s*ub?gSjbY`7S~BiqWv>k&LE;&0J~W8u->$^3DD9Vao|H_}xfoZPXP-l(dSn`Cr>PuOsQU;f^Zv{x7LF>gQ6z&1lVUe*me z^~9*Vb(HRPm1579SG3UD8ctJ2!G<4;$^Rsbb3aC#H`a2Fy>C;#{&v2wWH+a>ULSM6 zYv5NI8>8CC8tz}OpvgAl(67fPGB9{f$BW8na8FZ2R^Fv?57l91eV5xesUJ2?F5$(8 zL=+ovgN`1}r|r>i=!tbNIurSXtWV6OVon|3bpgh>7~+WWbTpGWtT$_t^l=|-Da|7G z-T|$eKau9{TK>_uYN+bloA|>nczM=^?yd}j^@5FrhnjS}`XGO&ZVg54n@UzGvN3T( z4@dGx;K`K%vhRa1N*=bA?%CGRe6&*e!vUyItKr=z+Tc551M<3ZleW(bfM>BGN~0Db zNIMMsN1otTZdibRhQ++cQ!V7Ur2tZUWGz~U!)=*daA-OvR~_fm1GCWBYL9M9vuQ|` z4?I3UNu}-%5Z2tFFn@qoc|$!7Drkwm399zJr^=8qxH~YMG^?k==;~q0oFjx~wHeH^ z*HQ9?2`H&QMXolbG-8nRs}&xcs^lsiCtJRRSawSj z?+>p;rdBZx6gQJz&J$XZ-UZ!r?4X+Gjv?n}K&isvESZ91UG|dGHFr#${GLkfqOjjWHW%!n$?7a1XNU%?9N~|iR|CqxBqP3_IvgE>;9^KUDegqr;DRf zA0AIljrUKfKTn(I9~B@Bi3=JZ=pW;69=SAhCiUZazmrxgej?(n5-FH^D$5WSMJx-7 z7AXeAM92iqePd%nLZkVnLxCR(?2*!)aR`T ze`aWef6V;1m3)!1rc2Vh+|+s&-Eo>Z_nEZU=jE%rUg#QF!>(?ti^5~Bm`R=)%6)$)Dp#iD&oao1j^vSKe=E!@i-pY36* zD)+F(CVSY0rRA*By_`)MQ_kj@ma}^8a`sTOoSj~|o4JSYX7?BFW~bhivCmt|Slw4; z>`ZMLD?C!hzDg-$4Z<>ZM^MHhns>4IH>GU&#ZsnlxRgbgmNIchDckZ%DJ!2>%Bme? z`QTEfC!;qirEE!a2@5$;!W1$}m|sc>V^$?>vsnpC8(zYy`j)VPKNqtF9mVX+`^C)R zY%zNxDP~ohikWw0F)Ir$W^RZHGXBV*v!-|;V z3khp!ldyr`OPJYTB}}_n!osdd*yHmOmRBKRlZqv5YrceyUnS#-kT4G`36qYIu#fvn zSi6#h#Xc%zU*9fdMYjss^RtCaTvN!ps|wk}1BI-yq>x2!FJ$ps3t3lYAqy20GQW!j z?8?~!*0;KVmF+BGMHvOGIIVysj3{6QeF|8EVgXC+&S%EW`E1XXd{$VS&mNu0XLGCb znbN_0rY+8AgIDA;g~j>o;N*N3HzuELG|6XnBl6j&xAT~PQy!aEo5yr2^4QaYJm!+0 z$2d_Qiw?lu?|FY@Xz>(7QQoTWJn^v^|HtTA#yq$K^12*j>7R ztjSO|=g8?_h20153%i~$pGn%kh25`nsC+7ur{4LXx7Ght-<$8xD*vzB|D$&G*1vc8 zkK+HQ^1a*dUH(t;{vH1x#ebjt`|AIlKL4x#|1Ew0f%A|2e`U{~-R^(ezW+8}Z~Omj z{6E^>pRJ#KeKL6Z#T$WaGb^WmwLVRH-}=q7OOk6kmy38IdsK(K>P zx>qw{*X$;6-E57d231=5dM+ZT*a@o648mEd3N4Em5BajTec=$8Z#d6=S8a-$?Q;bx zSWo zkTV}$$CPQr`fxmRb`|U}Zl$)9sn3_^>4OeKbr^KT;iI+nWZRkojf)ES@Nb#;psyDm7-?et@pwEQZ;EU6Qw2** zVsMG6kfUlEIx{R0mU4m;7Pw%kpq=bDFN4~pp@Og)@KX!HlDjFGZ=8%3dRaa8 zyjrsWu?DrAs4^JGu0Eqlod&e#xi0FvlE}PGi~5D6!_+T@r2CYcsL#$yUnOxl79iUZrerbvWlLVruJ0^!Nuu zT)&%1m7#MfabXOSy`o`&3!<`it~2GiSHsCH}w!dA>f5xOJ{=wvA3)Ps}UwkSh1n%k3UIEQhjKCp@NLsDftj@G8q;LJdL-%?9T?FmqC zv&HO1>8SPb=<#QBY(LVedd)SgOd^#rLTX=PhP3s|Y5WFz1UxxNb6m4Y-`@wrPd1X% z&UhS%g`mx=oW2ZHpre}H1f^~Bbau-WfXot^q3S0jTb?EM)9i;>C5cbzO@hKY%0NIEIk zk;iMk^(}dZoaNdlDWL6c3uk4f4CAN}YT0gxfE!ITDxj6NK^;M|{$cJMrt-aMhP59Q zB8zAUr;V|jUemM77LX3pMeBVt4A5_-U_*OI*Y?Ny{Xv2=`@ASlSg2_N zB^w(w%$)}nZfU148jPSjx{q*s|X(CLw+n{upGkkxT!#N}#;_g2*q}0P|^h4MwWl z>?7Pqd)0A%@T?wvuk^W1zUp;c&iW2&vQ)%~n%A^V(+HbVbrD-C#DS3ww5cr#cPGev zQ2U(H?|IWx4Ly`+N$Ap!t+aW@r=-KK(sbKO%3L#!zD?Xq>ECRjvH>lW6;?{CTJ5kj zpoN-}J+Qc15suk`(2(KJdX<0^8zV7r`yJ{ynuc=~YPc|;j}Ye=2#icI%r^}~UYcQT zN(`o}*~5ETn#@;I1rt_`1XZZeCl9CM&~gX7+NJ|;kq{2X7Px;l5uDiq4 z9GyZ_?g>e^E|Y?WSCC+~9sxV=OHiZ5jSxAQp#KeWcVQa#w# zI>S(D3RXXEr8*N&Trchqr6p%6r@{$0Rx2PQQ64iqo~K|emJh1s+*a81l#Mm2xY0Z6 zIqe60S`w{6cCFFq*po~aF=-fmR)`bfAJNI?B>be9McY$Da7^fgEw_?TX&8(Tr|}V7 z&_MGn%%F6ehdnp>@Y}S44tLAevhGUCvIs$Bu^nz!Cqa761E(jO)4Y2_G4R7J)LLeP z#}Ui0wA>Ciu4mw@OCB)$+!CtYb}-3xf#`wWT(+RT_ED^9b7=EGsDf#tCOh1xD-tz^> z_OgYmMlPvIJu$MYkxrP(*5_t@Ds*sx`qX5aT;YI}ek;j0L5L`$O6q)BOxvr+qS5#f zY532APKg6@%G~fOREP(&A`ls13I8Wv^i!7$Y#SwX 
z5xv|BB~?ynlGztJCm7?rIFwvHK@T7JLfYL<^Nr`wH0wDybWBQTDuU6~u7|OPGJllY z;L6L)p7~|eh73yBR>wU#TSPGxx4DT6+;GosDMg1y;Kw6=xSEuX*bG^$%EtdC_vf^D zr4Ibe^iaLU0!PPKVYz)M&Tn%8ZB9bma4*;zr((&s@z`-U6NB}`Nsz9LK@Y2F<7b9& zk6cQAybW}HPbCe?tfh&wwP;wybhrg4Q}{Sv9J#882rV%gT(*HD8tG<`m~(d@3e(ux z^w|y@ObVX|m&ZOhc{u|1ChiLfwuB}U2S5A_up(1>$@(AWf?r-E@_Nq|S0`fxaPk_>;c zLD29H`peFIF8jx5Ox)@~CswAxqBM+5TBpOly^=y!_+sk!o;bX2Jyq4mBiXNpMtv%T z`L-Bbznp|KJLECK?OkL#^ATmA(?wx{5iHugXuxg(u5Fa`_(V0se3JOrhGKI@|K~0 z$vDgzz8=EGi(!HdUGDo{q@FWoZ=~P%oBXNgw{%(eH>6*@<>IKKGWHjaVM^nE<&{4? r$pW$~-y$Ti(9|-Cem> diff --git a/test/expect/ModelTester.test_ssd512_resnet50_expect.pkl b/test/expect/ModelTester.test_ssd512_resnet50_expect.pkl index 160e49d09b1e4dfcceb2d86a5151102c251e5c23..3316f0015b6c9bc5fc5f2e738700bdaa79578c60 100644 GIT binary patch literal 6925 zcmWIWW@cev;NW1u00Im`42ea_8JT6N`YDMeiFyUuIc`pT3{fbcfho3-p|+4wBZ3uZ zLQ;N3YH=Y`a!Ec=qh5SzNoGzlSA0=wQfX#RN_MksR>;+noSdJNlbT$TnV(n8RmiOwRLE0O z$QxA17r_Q{KyrQ&(8>H6Ko@1OWFXmSXl!n2WMW`pU{ENK!R~#cq);%>o6}pQwNS{L ztF=(LBLifXNJ*h+P@x#utenK8)STi%afn$^ANl0xrNccWf$$K>cuPZL3k!=vNr>@M zP~)XR#>;eqjh8JclnW}9FHO`7@Mh=Gyxqr>$;iL}!T~rVPKJROCF1na0?^Hg6A_8P zAV31soDm!ZyY!_ZP<)!R%x@440|qpU$14gBSIIZJVFk{qEsH$G|J7R)IGyn z|Bm_dolrxZ-u^|kedxzGVgIm%isQ*onEL%r972mDWHq4pg2b8U?2Qf}85oUF1&dQu z`uLM(pt;kWAv8=iObr7=$XX7t*$|q6q2%)tuzCoMsXpN(4_JL7n8uWch#<^I=c{S| z1eAQtsbTV7M`PgSO{Q-8K_&Mvmm*kfro(s zgzH85WO@vCnH3+2BnXRj{~$7SjUax(J&ZI1K834D1SOuTURMCGcbTKzWgbM zBY#4MIozCtu;fqRNCp@fQkiCgnm0;A0%9~ANDBvQr^iwAAmKk64#b6n0Yn*tK8|G z#ZgU~z=zEwM9Ui8B;@c$HOW8-mr0N|0J=%Y;l_{Ry9*Kslc2E`;LXOS163r)tP59; oo+KloZHgdCh!UVa&{Pw{YAFaGBpu+*3Yz@^CTCU#ka~z(0JukoA^-pY literal 6925 zcmeI1eN*^XuM^0Nh7bt2%8F^iAd5m94P3-w_Oz*;buC$YF<%gAbzOD$KELmMWM+!&^v~|uMnL9M;aTB9wfHW$^FS&Iwo=E@pdbwOQ%R?@`A=6fp* zYLuFVrDn+kO|+R-TcQ1DSgDmf^8=JRy2SXzDM>orRNd6%%|ubYx21zNlf@7krH9D}c6M3WS3&^AfKS6W!E z5q9bJTq%TdSqqEI)*5Lfa;YQAwpEti98jq9fD(1ODN6b%_ zDXO+`lF{JplU2Vjt;|gj*kT-X$3+PKPCG8nWdbwPeFt`;G6fDr7C#yPb~;-Tawi!5 zaInan^PHJhf9&^2*}A_nU~Q!?)$knJdew51iMpOeP0q(RWV_IVg)6o1(}; zSxg!|ot$ltEz9o-PY=@M(r>KeJw$xj`9`PuwFhz>>hiwvHk@e7@_D`GPwhFk_IZE%9|;+(Grrs>(JL0R08C6jZ;m&By9I4Fy|vggqF_KS}x zbeumvXM*tCb**X)@f8!6kZ0{)PHkB}US1iCeG0-tTZ!y5mz66Ip z@Vd^7*jr26Vu|9tP<#&zuQ~fVgVg^`yR%G$x?$4EN$YSg7!M(}s7?j1&Qos%60pwS#;LP=- z$v;)Ob3E2-?*w)Cuxp|$KCeTmzoU~=hE^4^bb)yz z*)>fTo5Db2Uc{c9{8#T!kzt%9BfG~a_nI=$5MM}c(18bq67oEE zaQDX}$^Xy;u1pzAzB&VR9j|oAO7Z*N13R|QBmZazxc+J_`Byi<3Db(mtLB5^bEs!h z+XY_x1;wXb2jzSV`P%tlxZmUCJN&@}0oUUrK451D`mta;_}<%dseWKL_~VU_lBaC~ zuN})Kf8heScuyqx#>-&Q#V9(yH(vrfe*PH67v+I(oLxljdk)mrJW2i+JGeEziaam{ zw6=hA*MJFH)H@IWHlJ;z`n`0omqlYIyzv8YcPN-M3XJG`gzBHZ1ez=h$#<%330_U! zqjF?Q4f)(`aNQ?3ZceIv;>*!A-iCAFxXpMD%2cUYRzdBz)q~rQt|C9Ca%m9y_118( zF&Eo^;~{Xz$yTcWqc6axPpu_i-2>`RU>sI9RFy_B(9+Zz1R5*G6IRDf8jeY%KWr#!bFmxRJ|C|Tx{xw2stJNzX6WQ_u z?qBLx#vgY7Qtq0{_o<;}Xby&sfp0qo*!h{Z5rk)N7n;CdYYWfAtKM-q3+e>fSIKyu2o= tH6k_qf$y8Qn%U|zeQp>kEA7}{O~Olez**R6@MQn~D35EN0;`X@{|3cf`nv!C From fad55082e9656e1822ad00e3bdafecf2547daab8 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Mon, 26 Apr 2021 11:01:57 +0100 Subject: [PATCH 60/92] Refactor to move the losses from the Head to the SSD. 
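
The heart of the relocated loss is the hard-negative mining step, isolated below with
the same conventions as the patch: cls_loss is the per-anchor cross-entropy of shape
(batch, num_anchors), cls_targets holds the matched class index per anchor with 0
meaning background, and neg_to_pos_ratio is 3 for the default positive_fraction of
0.25. Only the function wrapper is new.

    import torch

    def hard_negative_mask(cls_loss: torch.Tensor, cls_targets: torch.Tensor,
                           neg_to_pos_ratio: float = 3.0) -> torch.Tensor:
        # positives are anchors matched to a ground-truth object (label > 0)
        foreground = cls_targets > 0
        num_negative = neg_to_pos_ratio * foreground.sum(1, keepdim=True)

        # rank only the negatives: positives are pushed to the end with -inf
        negative_loss = cls_loss.clone()
        negative_loss[foreground] = -float('inf')
        _, idx = negative_loss.sort(1, descending=True)
        rank = idx.sort(1)[1]            # rank of each anchor among the sorted losses
        return rank < num_negative       # keep only the hardest negatives per image

The classification term then sums the loss over the positives plus the selected
negatives and divides by the number of positives, matching the 3:1
negative-to-positive ratio of the SSD paper.
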
--- torchvision/models/detection/ssd.py | 155 ++++++++++++---------------- 1 file changed, 68 insertions(+), 87 deletions(-) diff --git a/torchvision/models/detection/ssd.py b/torchvision/models/detection/ssd.py index 36a2d8fcf40..8470f34061e 100644 --- a/torchvision/models/detection/ssd.py +++ b/torchvision/models/detection/ssd.py @@ -39,19 +39,10 @@ def _xavier_init(conv: nn.Module): class SSDHead(nn.Module): - def __init__(self, in_channels: List[int], num_anchors: List[int], num_classes: int, positive_fraction: float, - box_coder: det_utils.BoxCoder): + def __init__(self, in_channels: List[int], num_anchors: List[int], num_classes: int): super().__init__() - self.classification_head = SSDClassificationHead(in_channels, num_anchors, num_classes, positive_fraction) - self.regression_head = SSDRegressionHead(in_channels, num_anchors, box_coder) - - def compute_loss(self, targets: List[Dict[str, Tensor]], head_outputs: Dict[str, Tensor], anchors: List[Tensor], - matched_idxs: List[Tensor]) -> Dict[str, Tensor]: - return { - 'bbox_regression': self.regression_head.compute_loss(targets, head_outputs['bbox_regression'], anchors, - matched_idxs), - 'classification': self.classification_head.compute_loss(targets, head_outputs['cls_logits'], matched_idxs), - } + self.classification_head = SSDClassificationHead(in_channels, num_anchors, num_classes) + self.regression_head = SSDRegressionHead(in_channels, num_anchors) def forward(self, x: List[Tensor]) -> Dict[str, Tensor]: return { @@ -100,92 +91,21 @@ def forward(self, x: List[Tensor]) -> Tensor: class SSDClassificationHead(SSDScoringHead): - def __init__(self, in_channels: List[int], num_anchors: List[int], num_classes: int, positive_fraction: float): + def __init__(self, in_channels: List[int], num_anchors: List[int], num_classes: int): cls_logits = nn.ModuleList() for channels, anchors in zip(in_channels, num_anchors): cls_logits.append(nn.Conv2d(channels, num_classes * anchors, kernel_size=3, padding=1)) _xavier_init(cls_logits) super().__init__(cls_logits, num_classes) - self.neg_to_pos_ratio = (1.0 - positive_fraction) / positive_fraction - - def compute_loss(self, targets: List[Dict[str, Tensor]], cls_logits: Tensor, matched_idxs: List[Tensor]) -> Tensor: - # Match original targets with default boxes - cls_targets = [] - for targets_per_image, cls_logits_per_image, matched_idxs_per_image in zip(targets, cls_logits, matched_idxs): - foreground_idxs_per_image = matched_idxs_per_image >= 0 - - gt_classes_target = torch.zeros((cls_logits_per_image.size(0), ), dtype=targets_per_image['labels'].dtype, - device=targets_per_image['labels'].device) - gt_classes_target[foreground_idxs_per_image] = \ - targets_per_image['labels'][matched_idxs_per_image[foreground_idxs_per_image]] - - cls_targets.append(gt_classes_target) - - cls_targets = torch.stack(cls_targets) - - # Calculate loss - num_classes = cls_logits.size(-1) - cls_loss = F.cross_entropy( - cls_logits.view(-1, num_classes), - cls_targets.view(-1), - reduction='none' - ).view(cls_targets.size()) - - # Hard Negative Sampling - foreground_idxs = cls_targets > 0 - num_negative = self.neg_to_pos_ratio * foreground_idxs.sum(1, keepdim=True) - # num_negative[num_negative < self.neg_to_pos_ratio] = self.neg_to_pos_ratio - negative_loss = cls_loss.clone() - negative_loss[foreground_idxs] = -float('inf') # use -inf to detect positive values that creeped in the sample - values, idx = negative_loss.sort(1, descending=True) - # background_idxs = torch.logical_and(idx.sort(1)[1] < num_negative, 
torch.isfinite(values)) - background_idxs = idx.sort(1)[1] < num_negative - - loss = (cls_loss[foreground_idxs].sum() + cls_loss[background_idxs].sum()) / max(1, foreground_idxs.sum()) - return loss class SSDRegressionHead(SSDScoringHead): - - __annotations__ = { - 'box_coder': det_utils.BoxCoder, - } - - def __init__(self, in_channels: List[int], num_anchors: List[int], box_coder: det_utils.BoxCoder): + def __init__(self, in_channels: List[int], num_anchors: List[int]): bbox_reg = nn.ModuleList() for channels, anchors in zip(in_channels, num_anchors): bbox_reg.append(nn.Conv2d(channels, 4 * anchors, kernel_size=3, padding=1)) _xavier_init(bbox_reg) super().__init__(bbox_reg, 4) - self.box_coder = box_coder - - def compute_loss(self, targets: List[Dict[str, Tensor]], bbox_regression: Tensor, anchors: List[Tensor], - matched_idxs: List[Tensor]) -> Tensor: - N = 0 - losses = [] - for targets_per_image, bbox_regression_per_image, anchors_per_image, matched_idxs_per_image in \ - zip(targets, bbox_regression, anchors, matched_idxs): - # determine only the foreground indices, ignore the rest - foreground_idxs_per_image = torch.where(matched_idxs_per_image >= 0)[0] - num_foreground = foreground_idxs_per_image.numel() - - # select only the foreground boxes - matched_gt_boxes_per_image = targets_per_image['boxes'][matched_idxs_per_image[foreground_idxs_per_image]] - bbox_regression_per_image = bbox_regression_per_image[foreground_idxs_per_image, :] - anchors_per_image = anchors_per_image[foreground_idxs_per_image, :] - - # compute the regression targets - target_regression = self.box_coder.encode_single(matched_gt_boxes_per_image, anchors_per_image) - - # compute the loss - N += num_foreground - losses.append(torch.nn.functional.smooth_l1_loss( - bbox_regression_per_image, - target_regression, - reduction='sum' - )) - - return _sum(losses) / max(1, N) class SSDFeatureExtractor(nn.Module): @@ -224,7 +144,7 @@ def __init__(self, backbone: SSDFeatureExtractor, size: Tuple[int, int], num_cla # Estimate num of anchors based on aspect ratios: 2 default boxes + 2 * ratios of feaure map. 
self.num_anchors = [2 + 2 * len(r) for r in backbone.aspect_ratios] - self.head = SSDHead(out_channels, self.num_anchors, num_classes, positive_fraction, self.box_coder) + self.head = SSDHead(out_channels, self.num_anchors, num_classes) self.anchor_generator = DBoxGenerator(backbone.aspect_ratios) @@ -242,6 +162,7 @@ def __init__(self, backbone: SSDFeatureExtractor, size: Tuple[int, int], num_cla self.nms_thresh = nms_thresh self.detections_per_img = detections_per_img self.topk_candidates = topk_candidates + self.neg_to_pos_ratio = (1.0 - positive_fraction) / positive_fraction # used only on torchscript mode self._has_warned = False @@ -254,6 +175,66 @@ def eager_outputs(self, losses: Dict[str, Tensor], return detections + def compute_loss(self, targets: List[Dict[str, Tensor]], head_outputs: Dict[str, Tensor], anchors: List[Tensor], + matched_idxs: List[Tensor]) -> Dict[str, Tensor]: + bbox_regression = head_outputs['bbox_regression'] + cls_logits = head_outputs['cls_logits'] + + # Match original targets with default boxes + num_foreground = 0 + bbox_loss = [] + cls_targets = [] + for targets_per_image, bbox_regression_per_image, cls_logits_per_image, anchors_per_image, \ + matched_idxs_per_image in zip(targets, bbox_regression, cls_logits, anchors, matched_idxs): + # produce the matching between boxes and targets + foreground_idxs_per_image = torch.where(matched_idxs_per_image >= 0)[0] + foreground_matched_idxs_per_image = matched_idxs_per_image[foreground_idxs_per_image] + num_foreground += foreground_matched_idxs_per_image.numel() + + # Calculate regression loss + matched_gt_boxes_per_image = targets_per_image['boxes'][foreground_matched_idxs_per_image] + bbox_regression_per_image = bbox_regression_per_image[foreground_idxs_per_image, :] + anchors_per_image = anchors_per_image[foreground_idxs_per_image, :] + target_regression = self.box_coder.encode_single(matched_gt_boxes_per_image, anchors_per_image) + bbox_loss.append(torch.nn.functional.smooth_l1_loss( + bbox_regression_per_image, + target_regression, + reduction='sum' + )) + + # Estimate ground truth for class targets + gt_classes_target = torch.zeros((cls_logits_per_image.size(0), ), dtype=targets_per_image['labels'].dtype, + device=targets_per_image['labels'].device) + gt_classes_target[foreground_idxs_per_image] = \ + targets_per_image['labels'][foreground_matched_idxs_per_image] + cls_targets.append(gt_classes_target) + + cls_targets = torch.stack(cls_targets) + + # Calculate classification loss + num_classes = cls_logits.size(-1) + cls_loss = F.cross_entropy( + cls_logits.view(-1, num_classes), + cls_targets.view(-1), + reduction='none' + ).view(cls_targets.size()) + + # Hard Negative Sampling + foreground_idxs = cls_targets > 0 + num_negative = self.neg_to_pos_ratio * foreground_idxs.sum(1, keepdim=True) + # num_negative[num_negative < self.neg_to_pos_ratio] = self.neg_to_pos_ratio + negative_loss = cls_loss.clone() + negative_loss[foreground_idxs] = -float('inf') # use -inf to detect positive values that creeped in the sample + values, idx = negative_loss.sort(1, descending=True) + # background_idxs = torch.logical_and(idx.sort(1)[1] < num_negative, torch.isfinite(values)) + background_idxs = idx.sort(1)[1] < num_negative + + N = max(1, num_foreground) + return { + 'bbox_regression': _sum(bbox_loss) / N, + 'classification': (cls_loss[foreground_idxs].sum() + cls_loss[background_idxs].sum()) / N, + } + def forward(self, images: List[Tensor], targets: Optional[List[Dict[str, Tensor]]] = None) -> Tuple[Dict[str, Tensor], 
List[Dict[str, Tensor]]]: if self.training and targets is None: @@ -322,7 +303,7 @@ def forward(self, images: List[Tensor], match_quality_matrix = box_ops.box_iou(targets_per_image['boxes'], anchors_per_image) matched_idxs.append(self.proposal_matcher(match_quality_matrix)) - losses = self.head.compute_loss(targets, head_outputs, anchors, matched_idxs) + losses = self.compute_loss(targets, head_outputs, anchors, matched_idxs) else: detections = self.postprocess_detections(head_outputs, anchors, images.image_sizes) detections = self.transform.postprocess(detections, images.image_sizes, original_image_sizes) From 38e6e7244fd6cf66f344cfefe0e3ec15068dbd9e Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Mon, 26 Apr 2021 11:38:38 +0100 Subject: [PATCH 61/92] Removing resnet50 ssd version. --- ...odelTester.test_ssd512_resnet50_expect.pkl | Bin 6925 -> 0 bytes test/test_models.py | 1 - torchvision/models/detection/ssd.py | 115 +----------------- 3 files changed, 1 insertion(+), 115 deletions(-) delete mode 100644 test/expect/ModelTester.test_ssd512_resnet50_expect.pkl diff --git a/test/expect/ModelTester.test_ssd512_resnet50_expect.pkl b/test/expect/ModelTester.test_ssd512_resnet50_expect.pkl deleted file mode 100644 index 3316f0015b6c9bc5fc5f2e738700bdaa79578c60..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6925 zcmWIWW@cev;NW1u00Im`42ea_8JT6N`YDMeiFyUuIc`pT3{fbcfho3-p|+4wBZ3uZ zLQ;N3YH=Y`a!Ec=qh5SzNoGzlSA0=wQfX#RN_MksR>;+noSdJNlbT$TnV(n8RmiOwRLE0O z$QxA17r_Q{KyrQ&(8>H6Ko@1OWFXmSXl!n2WMW`pU{ENK!R~#cq);%>o6}pQwNS{L ztF=(LBLifXNJ*h+P@x#utenK8)STi%afn$^ANl0xrNccWf$$K>cuPZL3k!=vNr>@M zP~)XR#>;eqjh8JclnW}9FHO`7@Mh=Gyxqr>$;iL}!T~rVPKJROCF1na0?^Hg6A_8P zAV31soDm!ZyY!_ZP<)!R%x@440|qpU$14gBSIIZJVFk{qEsH$G|J7R)IGyn z|Bm_dolrxZ-u^|kedxzGVgIm%isQ*onEL%r972mDWHq4pg2b8U?2Qf}85oUF1&dQu z`uLM(pt;kWAv8=iObr7=$XX7t*$|q6q2%)tuzCoMsXpN(4_JL7n8uWch#<^I=c{S| z1eAQtsbTV7M`PgSO{Q-8K_&Mvmm*kfro(s zgzH85WO@vCnH3+2BnXRj{~$7SjUax(J&ZI1K834D1SOuTURMCGcbTKzWgbM zBY#4MIozCtu;fqRNCp@fQkiCgnm0;A0%9~ANDBvQr^iwAAmKk64#b6n0Yn*tK8|G z#ZgU~z=zEwM9Ui8B;@c$HOW8-mr0N|0J=%Y;l_{Ry9*Kslc2E`;LXOS163r)tP59; oo+KloZHgdCh!UVa&{Pw{YAFaGBpu+*3Yz@^CTCU#ka~z(0JukoA^-pY diff --git a/test/test_models.py b/test/test_models.py index ab62720f8f8..c23087a05e9 100644 --- a/test/test_models.py +++ b/test/test_models.py @@ -45,7 +45,6 @@ def get_available_video_models(): "keypointrcnn_resnet50_fpn": lambda x: x[1], "retinanet_resnet50_fpn": lambda x: x[1], "ssd300_vgg16": lambda x: x[1], - "ssd512_resnet50": lambda x: x[1], } diff --git a/torchvision/models/detection/ssd.py b/torchvision/models/detection/ssd.py index 8470f34061e..095bf3fe17f 100644 --- a/torchvision/models/detection/ssd.py +++ b/torchvision/models/detection/ssd.py @@ -15,11 +15,10 @@ from ...ops import boxes as box_ops -__all__ = ['SSD', 'SSDFeatureExtractor', 'ssd300_vgg16', 'ssd512_resnet50'] +__all__ = ['SSD', 'SSDFeatureExtractor', 'ssd300_vgg16'] model_urls = { 'ssd300_vgg16_coco': None, # TODO: Add url with weights - 'ssd512_resnet50_coco': None, } @@ -485,115 +484,3 @@ def ssd300_vgg16(pretrained: bool = False, progress: bool = True, num_classes: i state_dict = load_state_dict_from_url(model_urls[weights_name], progress=progress) model.load_state_dict(state_dict) return model - - -class SSDFeatureExtractorResNet(SSDFeatureExtractor): - def __init__(self, backbone: resnet.ResNet): - aspect_ratios = [[2], [2, 3], [2, 3], [2, 3], [2], [2]] - super().__init__(aspect_ratios) - - self.features = nn.Sequential( - backbone.conv1, - 
backbone.bn1, - backbone.relu, - backbone.maxpool, - backbone.layer1, - backbone.layer2, - backbone.layer3, - backbone.layer4, - ) - - # Patch last block's strides to get valid output sizes - for m in self.features[-1][0].modules(): - if hasattr(m, 'stride'): - m.stride = 1 - - backbone_out_channels = self.features[-1][-1].bn3.num_features - extra = nn.ModuleList([ - nn.Sequential( - nn.Conv2d(backbone_out_channels, 256, kernel_size=1, bias=False), - nn.BatchNorm2d(256), - nn.ReLU(inplace=True), - nn.Conv2d(256, 512, kernel_size=3, padding=1, stride=2, bias=False), - nn.BatchNorm2d(512), - nn.ReLU(inplace=True), - ), - nn.Sequential( - nn.Conv2d(512, 256, kernel_size=1, bias=False), - nn.BatchNorm2d(256), - nn.ReLU(inplace=True), - nn.Conv2d(256, 512, kernel_size=3, padding=1, stride=2, bias=False), - nn.BatchNorm2d(512), - nn.ReLU(inplace=True), - ), - nn.Sequential( - nn.Conv2d(512, 128, kernel_size=1, bias=False), - nn.BatchNorm2d(128), - nn.ReLU(inplace=True), - nn.Conv2d(128, 256, kernel_size=3, padding=1, stride=2, bias=False), - nn.BatchNorm2d(256), - nn.ReLU(inplace=True), - ), - nn.Sequential( - nn.Conv2d(256, 128, kernel_size=1, bias=False), - nn.BatchNorm2d(128), - nn.ReLU(inplace=True), - nn.Conv2d(128, 256, kernel_size=3, bias=False), - nn.BatchNorm2d(256), - nn.ReLU(inplace=True), - ), - nn.Sequential( - nn.Conv2d(256, 128, kernel_size=1, bias=False), - nn.BatchNorm2d(128), - nn.ReLU(inplace=True), - nn.Conv2d(128, 256, kernel_size=2, bias=False), - nn.ReLU(inplace=True), - ) - ]) - _xavier_init(extra) - self.extra = extra - - def forward(self, x: Tensor) -> Dict[str, Tensor]: - x = self.features(x) - output = [x] - - for block in self.extra: - x = block(x) - output.append(x) - - return OrderedDict([(str(i), v) for i, v in enumerate(output)]) - - -def _resnet_extractor(backbone_name: str, pretrained: bool, trainable_layers: int): - backbone = resnet.__dict__[backbone_name](pretrained=pretrained) - - # select layers that wont be frozen - assert 0 <= trainable_layers <= 5 - layers_to_train = ['layer4', 'layer3', 'layer2', 'layer1', 'conv1'][:trainable_layers] - if trainable_layers == 5: - layers_to_train.append('bn1') - for name, parameter in backbone.named_parameters(): - if all([not name.startswith(layer) for layer in layers_to_train]): - parameter.requires_grad_(False) - - return SSDFeatureExtractorResNet(backbone) - - -def ssd512_resnet50(pretrained: bool = False, progress: bool = True, num_classes: int = 91, - pretrained_backbone: bool = True, trainable_backbone_layers: Optional[int] = None, **kwargs: Any): - trainable_backbone_layers = _validate_trainable_layers( - pretrained or pretrained_backbone, trainable_backbone_layers, 5, 3) - - if pretrained: - # no need to download the backbone if pretrained is set - pretrained_backbone = False - - backbone = _resnet_extractor("resnet50", pretrained_backbone, trainable_backbone_layers) - model = SSD(backbone, (512, 512), num_classes, **kwargs) - if pretrained: - weights_name = 'ssd512_resnet50_coco' - if model_urls.get(weights_name, None) is None: - raise ValueError("No checkpoint is available for model {}".format(weights_name)) - state_dict = load_state_dict_from_url(model_urls[weights_name], progress=progress) - model.load_state_dict(state_dict) - return model From 30de463bf3e9e1969c7b7bc83c874c5ea9e81fe8 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Mon, 26 Apr 2021 13:05:37 +0100 Subject: [PATCH 62/92] Adding support for best performing backbone and its config. 
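Note on the classification loss that PATCH 60 above folds into SSD.compute_loss: not all negatives are used; for every image only the hardest ones are kept, with a budget proportional to the number of matched anchors, and the selection is done with a double argsort instead of a Python loop. A self-contained sketch of that selection with made-up numbers (the 3:1 budget is the ratio used in the SSD paper; tensor names are illustrative, not the ones in the patch):

import torch

# Toy example: 8 anchors in one image, anchor 1 is the only matched (positive) anchor.
cls_loss = torch.tensor([[0.2, 3.0, 0.1, 2.5, 0.05, 1.0, 0.8, 0.3]])
foreground = torch.tensor([[False, True, False, False, False, False, False, False]])

neg_to_pos_ratio = 3.0                                              # e.g. the 3:1 ratio from the SSD paper
num_negative = neg_to_pos_ratio * foreground.sum(1, keepdim=True)   # budget: 3 negatives for 1 positive

negative_loss = cls_loss.clone()
negative_loss[foreground] = -float('inf')            # positives can never be picked as negatives
idx = negative_loss.sort(1, descending=True)[1]      # idx[r] = anchor occupying rank r
rank = idx.sort(1)[1]                                # rank of every anchor in that ordering
background = rank < num_negative                     # hardest `num_negative` negatives per image

print(background)
# tensor([[False, False, False,  True, False,  True,  True, False]])  -> anchors 3, 5 and 6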
--- .../ModelTester.test_ssd300_vgg16_expect.pkl | Bin 6925 -> 6925 bytes torchvision/models/detection/anchor_utils.py | 9 +++-- torchvision/models/detection/ssd.py | 31 ++++++++++++++---- 3 files changed, 31 insertions(+), 9 deletions(-) diff --git a/test/expect/ModelTester.test_ssd300_vgg16_expect.pkl b/test/expect/ModelTester.test_ssd300_vgg16_expect.pkl index 9bb57cb49737eb985ec470ed1969595091c97d0c..f96b9055ce7380d0909629ca6c2d67d760da9729 100644 GIT binary patch literal 6925 zcmbVR30RG3`#)MlrKluSl1dBhLxt|nb5n6bqNLKEwxiQotJ88SN-344=Cfo=GuFWj zhU^AYGgQ_O5tEQ@YGShUo%0@#q5t1^{jdLXUC(vD_x8J&=REIo-uLzAsH(A8{ra)~ z#~8@cV?_yK5)vhG4zW?9DEri8t~b$Pg?uq`Rp#?q+5A4%!Rm5^XkL1pkgpOW;>jKD zBgCRau23^V5Em^@L4?Fvldo!R9jqaD5Xvp1;^X*gN*7IUE-y+nTPf7! zs|V}KZJb;k$GN$?k9QgG;_T+?#_t=fDwoEjiuoGT%~VDFezQ0h90QJ8A-_Mzn$xF{ zuUQlm!{c(}Vnm6&6rtu!L2R5LE_QNaj7XEOWfQ>H7V!rJ@O6UwDiVm{3FIm3Cde~N zP)q0)(bd__#nF{NFhPS;FX9iH#p%y6DB$aHGz<9pMF|Ql0}+340N;?o;zmWsafSRL zO016+`SDWXKNV#3WkKT{UA}P{#~-SMANCP`xB}j|n1MGD@l6BxBg9em{v3^dXwoz` zu@8%-nEqc~IAhj;FBZ<>Qv;Rdc$5DBEMoWr8La=e3?4ShVA~lPM4ggB{C6_=ra}ff zTp65Cl0i<43@m2Jz{64oa|X*`q>2pE-{t`4c@C)7=0H+)4n&mXKv-@Lblc^Cg zxStIR+p?i(LpD?tXM^^VY^a`-4c)V{;n9?AFr6Tm*=0jgR~9rp%mUW$S%7D=K&LJX z64uMFOR`{Dcot;N%YyA1S@5De6GolQ1odN?uzp@9EceNTzAvR9X_rFd5h+~wK?=N0 zQaH9)3KS_i&K9m7#t202uIs+^W zGQjIzI$XV#4o#=i!S_HqR0OAkY-TznxTV9q{b^uVod(OZ)4<+84gTPy!N{>`a7iZ( z#BCDz>!busk4oUuehC!qlfdW|5(q1iz~N#ElyfEEJ4^zzUyEULj~MJ)#Blk#7=D#c z|AS(XY!-vIPz-}OVt7753>zKAaDKlC@+XQwMMDJEH-*rAR0t~$2|@pW5Jqno!mMxQ zvSmV;5iOMGB80Q@DKHg+xuFp9`v}40PXSzR6aagd0Lm%_FrruheToFoKS==oF2}H+^$sE`y>@!9ZChg?^2;AG8KZxroy4YsgT~ugYpg@ zJZbnp&8ZHE_XIu~;;liD2E(GLo;rdiAsCsc>&#Pp3 z(U}Zpcja?mGDK`o2HD1BI24@>dQr*H5tt03!O5^#Ga0V#O9FHGoV_~<;x;6KM}89U zl9S+^QxfD_C&9=;NibqdB533%f}JE0o`ogC#+iw*(k~JEn#-?mCqTf(1Snsf0HHAn zpxAbaFLheX+vAeh*TN;Y#v}QaPo)_5WA?YyMPD zDa9MxE02*Md1}{NuegUQ2KA?Ql$tBmENVyP6o=wdJ1VDml&AibQa$DS_liStDWy2n zj#BEc^A&#@r+40zr}|MJ{b@XkNA*-r<7xJaNAakhQtD4BwWB;BJ=Ie`8rSw?{K38Y-W%s@{i1dD&X4-j z{!o9)_qL~b(R`_%`q4ZoE|t?ds2{D9QYxo;>i6n5#i5kqQ$Lyq&5z%{&an)p2{g6)l)g`C(WDkbX}-j z?|f;Vl=ki`jZgh-KCT;$L-DDe%4t06PjRU|jn{i!6z9{C$O9|9`7Ats}yVsl*%ep3;EZH==T{~x&VC>_8eR(-V? z-p}0$MRx1S^V!uz{dOliLC+DlzIP*A+_$rP7QG^luDLkiygF9c{!D&|vmG})h(>)IsaiW79n@cu^mcD7yRo07?n=X3uO?ykU0(*jVUrk_YFlA&NC>u?W#Bxg ze4O;J0vy(4hZ}CKAu8)~QEKarH3Cbt(vcFamS|j%H6Men$jBeB1Z?kzzh${&l3;GM7BEw!{?=w~)mXj<7wY{uo6T zp+`s+;O8ZH^5hv}$Tq@V(?_6lUOu{iHwImejo7Mxg`!7QDf!FW6NR_*alal9 zXO9x%Cc%6LqngFX^i@O2()&&%apxm)@cdYuf8zsLqq&LsJ$CNV!Tag;M0Q&fqaM4m z9ma=YP)i$omMn?!pPZbJcb^tBwj;t%k-#1eT=`ooDLUzilTOCs_S4}^{Z0>7;f>Tj z%(YpP9d>8fpvnOsvVfC_d)L*Ius2&t`vfNpN=w1X(e7w)0kJvMlXXKwQS&!PviZ_?u{sD`$KZSY<}%*)s#AcwkHzTfx_VKY-B*{%T@) zADK7t=?ui&kt%4Xla3vQ8JKJJo~ZW^$IZsxO#T`nD^Yq-vB#@$#(Ou0Zv57jskbYx zh#c?G#r8+>IO41=qx&I8orzJ@9*Q$-^hoCbPZVvq%@(HyW6GU!68+i+n>*{g$hIWh zH{~Ha;o*FoUhtGOPEA6cnZs~W^F^Zbd^eNdlU+p&))4PwWKHE%bk}}HDo?WIaagRC zc5~5Lrc1V42BXkGxfd4NTVZTe4SA#1NwUh~adzDY;#gqgwYWy1+YR65n$S|Mn zdHxpp<+%Y?B|T^G)webg(@HJ0T-{DskE|H{MjgPLX;EbC+6uJERMu9YKMQ-L7G&aM z`ERC18S}*3aNKeu4qKlt#v8F&*w3SzgxVLPq}U7pzPp0K`ZeAHTAw4`>CZ4cIw zd5g?&PkcO9N86xhStaiL@fuU_=zV92-n~FPdR~g#50~I=qhd6>G>vVX?u#{&O=Q8- zrKtDF1N&}ok>_27op0?i_n;~1$u}XjoQYVQ^o-0648TZ>eWZ9D7YE+FPdX!)VNbX! 
zWB=^-PfWcY9wFrUkLf5mzntM~k16(G6_y;2#U<|Zn4C+@#kk*iDWhK>6oc3QScTF= zYLI(Zb5GRgAsDw@wSmj?FP-Vkrv^EbQ^3Q6f4u%*KQd15hSE3!+(L z(0t`JvbTE?8NJvElSg!rClW52)y`)xa9)C3(nemn))3znPQY?{UVt zH+K--Y#y@zb(uso#NwOfi?GwC0*6MFlTU)GAwKxcldStmv2R(d?zyqJ zs&p%(GdFrkxG(pT#~S^y@?1IQ7pzkDzpk7#?b63BLyC#b)et6YTQ^wDTw;?3?GSBahy^Tln7F*1A$iwi%lX0c7J2o8nojqSufIZu+Nm9*GvSaETCYJZqTcUnVPmZwjmzqjr;@o;)R^ z);5vQw_)grH^}I6J~le5FuhzVJxuP{?IvoEuMjJt@_w#9XBoB)IVSQnaeK!jPUq zeE-%LvpHjt8}C9+HI?At_*|l6T!7zqjlqtQ<@ioxI>rRd!LIrtxcg8HmL_>%@`X|+ zrXrr|JQHU9nl^|TeD=j0?A|#A>&tY=mPd>5)T(6E2@c0v!$#t5nT%^~)EHmM-ldGa z-^+5;o~wtyt(}YGE{ho1lJY_F|Ar-GiBAv?Z%AhByPjq+x{7tmI_KO|^r;Hl7Y{{4 zNiNxcX%8{VyF^w@a$)2z9Ud|3yY;a)HXU1rcdmV4_`Y^_ILBfP^3PeK*ZK+^{Mr@M zStU%2j?7iqEgDA3x_ua4fj^CNm_6+-L37KaWJO#!u2;QB#OkB4-KCb4+s(vo=LWL4 zG!h@!1!3Wt#i*Xo#n%H|u&phOB!=m=IBfWn*2? zJ|`HTW+mbvi!$8jwFvi2&c)meWj{~falqeyosO4WTZvd4jc0@(h(mZDIyK&A_HWjd z8RUMeBgWn{L+`OOaA&FqDxQB__dYn3tIc95=GUKpbmhs>?-`Offl&T!z?;DTGOVniPWt1H6-OTW^bg=`=*M7+|p!AD$G0(s}a6C`Q0gC6)UrJ~gp+3{&z7wLeFlc_W}M_fps^ GZT|xrTrQaa literal 6925 zcmd5>X;>83wgs645m6Bpfe=9jL_kEu=}@(oq7jinK|z$Ug=Xlc8Js6TKw1!yD5yvb zmpC97P{dgRMid0W83i?J)EH;IM!gQn>u%FYf)!uvURi{qXsZ*zp zr<?At8Zw0t z@j^koq)TXmSe9rTn3y1pj@Jo{6NDrRqr(Cd1fqCxT;O~M9f_Ky#~pw0CkIK5Ddsv$Lb4lcPj)mYU2P8k;E5ayL^;kaYEO zvv4zTQ%{pGhlYxyqXnS}La``b$0IIG5GM$mAPh~=k#w{2mgpu(x_e9X zW@*YB5Gsz7HMvKGtVI#(5gpGsV)!TrXNi7FKHN#t|D(LoZ}J1=^2Qkod6NW* zskh{_#9&)bH?6KAC7FAKswyh->e=ayGgj&T$&Iu7qXQL}<4W3p+{BFZ+wA!0I%e_a zFl#T_&35iAVIu^)SdX#0*qZ1ate4vk))HCFW}hiyPlW|cHMoG)W^G}%Z#J>mlsp#r zU;`5*E}%-~TL%lRRbonN z-%V$GyQH(@i_)0ptTg6(H%+~rQGgE_PHb+01#ePU) zq3ua*(1s+oB{7Lz^G#wK#wM{K$0XKlk;H_hE16x=N;WTkCCd+9$ug@}u$qG9tkr)x z^RQXQ&X_G@)!mk{u+vM~?k|_H?V`o(^^HZWwQ&(URPeK7kEfp1?x0;#v8qcy{hm9J}~U9Q!OWjtvsTvD?mZ zEc`$$+q5l~akFEY&_9+HPmN{SU1Qn#$71%=H8CqVCuVaNh}nSYV)p2%h&{V0VvDbc z*s)p>>%LjU&P)}tt4zeqbVcltx`_FxirApXF|49ChRs+L!#q~UF!#6^c9UY5nr#fb zIwXcQo5e8hVl*=^iDpIq(X6Me*1V2l*$q)_eMuC%J}ipe=oZDcJcwj}Z;oWsaw1t_ z_ekd0AY|U(2-)j=A=BF+WV_;oESwjzVe;Bf$aeG+G6#Jjb1aBp4r?OVk%1H^KIuOoWWYr`noPF_T9=p9{9@~>RkLicaV_VM&*p+4hyL3#z?rs*aDnA+e2w2)= z0qfFJz)qYBW66nOETAZqjdKrWreB1z_F{0ZAf`JZh(+53v28B`+4dg-+4nVp>~DJmnZcSs*0wy5 zt)Caj}=R6hPf;C`eWGr{toRedk4wO{}+_MTIJ>cx;-6r zJKDu^ea*6?}*BHM^y4U^Z%!D zrC!JUf7e#>I$~$KO1~02%lmiZ9eEvXrM}YN8Fu6;?T)C-SNfH<5aHwOIT!Qi_JEkDR9B2L70g@4qXFCDQ6vxlqRM`4~GVZ(##qk%TP7r@_ z60S!}(Y-N7p%)b(MRUY+UY_gYBqUvHDSG^Kks@ZGmx+SspwPooq&Zb8H2Pk4Lf#T7 z+RcL$oXv$9`1#h)ikSbE{_xxVjMm3W@wmxM5#MPm#h5sM#j%sl0!-Q`g}&w?1;23F zbGo^5E{yD@@V%9y;E4D1gE&mq=18WndFyJjRF&d%tCqqp9d?k*S`kL3-#7A6Z=|$8 zNQ#v1Llv=}3n$?vq$n<)q0nd1f1@*|Qt(%dKE`P@_AixUcXp)0ZoZrXUwPlrC2P|Rq*w8SfS==JDJiM?v%YMHg+xLGtDe9b*mJX&!;Ikq4gH<)W6Bi ziFHQ2`5r#&;RTx8JqAjRRSo zpyZxXJaE55cQSI2->OaHAEqK=b}lu<<)WFd=2C{FV4ijp|E+fhUY^{^W&Xnjswrpq zw!Lof)hg$Vwx6Sm+Rgkmho8vX;T^dzYUdh#^l`wdiccQ80JrA0P@d*mm@ies_B%r$ zemeuAcT+JaRDyT8#>mgO%;m(|!(P9PzkBu~9hH$SDJh`O}+(6+9Rsl4$Uit8_iu;*X|zsf-G7aY1cS8#exW_Xa>z#o4*6p6*t z;oi`~nOH}_<@+6crYyE=V=B^YC!@!PT%6C6LiB8+;`xtr8%9kI1)MS8oA;aYh_C;( zl0Uz-oO`4z#=Nf^`TIXF#=;+n8b^Fa_g%}V$|{Zc4=%{iGsHWOFobU#4$+`*xr-xG z=+W)d{EX3?seFkShF_VB*TN8J&sry3}FMW zvRfS`Z*S5r=wVb|A6$50jmv(|X>L;u{~^Z&d)9c-jW7@7&7Xm;b_tm1D1rGu{GgNe zJspdQLVnE~vTh$lx@Fz*Zd)OF-?WCU^#Shf?p(6*XykvfEFtRw)1WumnD(Aa#L?_@ z+CNf)SK=Vj`*kHM+cL;JeH)d($fgxbA5x%J9o_dDi-T*EQRwW3bQ5T2zo1?O|md}?sVsl|{+B`=MgWh~aZI657>-&c&;F=3!D;|>W*8Ivh(=TwEUbZw|9W(MR@Rj4-d-Zawe8@6b= zsEUZPuec$V9@zOq6MxQoHq46#g54O5p<73wB6I}w)9v8&OqCX=u~bWB=k(DINtku5j@G;)+)nzIE)`CPy6;trUYiKB1Gd<=B?|%PreNKM9=IfY%fEDR zLPgJ5s*ub?gSjbY`7S~BiqWv>k&LE;&0J~W8u->$^3DD9Vao|H_}xfoZPXP-l(dSn`Cr>PuOsQU;f^Zv{x7LF>gQ6z&1lVUe*me 
z^~9*Vb(HRPm1579SG3UD8ctJ2!G<4;$^Rsbb3aC#H`a2Fy>C;#{&v2wWH+a>ULSM6 zYv5NI8>8CC8tz}OpvgAl(67fPGB9{f$BW8na8FZ2R^Fv?57l91eV5xesUJ2?F5$(8 zL=+ovgN`1}r|r>i=!tbNIurSXtWV6OVon|3bpgh>7~+WWbTpGWtT$_t^l=|-Da|7G z-T|$eKau9{TK>_uYN+bloA|>nczM=^?yd}j^@5FrhnjS}`XGO&ZVg54n@UzGvN3T( z4@dGx;K`K%vhRa1N*=bA?%CGRe6&*e!vUyItKr=z+Tc551M<3ZleW(bfM>BGN~0Db zNIMMsN1otTZdibRhQ++cQ!V7Ur2tZUWGz~U!)=*daA-OvR~_fm1GCWBYL9M9vuQ|` z4?I3UNu}-%5Z2tFFn@qoc|$!7Drkwm399zJr^=8qxH~YMG^?k==;~q0oFjx~wHeH^ z*HQ9?2`H&QMXolbG-8nRs}&xcs^lsiCtJRRSawSj z?+>p;rdBZx6gQJz&J$XZ-UZ!r?4X+Gjv?n}K&isvESZ91UG|dGHFr#${GLkfqOjjWHW%! List[Ten class DBoxGenerator(nn.Module): def __init__(self, aspect_ratios: List[List[int]], min_ratio: float = 0.15, max_ratio: float = 0.9, - clip: bool = True): + steps: Optional[List[int]] = None, clip: bool = True): super().__init__() + if steps is not None: + assert len(aspect_ratios) == len(steps) self.aspect_ratios = aspect_ratios + self.steps = steps self.clip = clip num_outputs = len(aspect_ratios) @@ -213,9 +216,9 @@ def forward(self, image_list: ImageList, feature_maps: List[Tensor]) -> List[Ten for k, f_k in enumerate(grid_sizes): # Now add the default boxes for each width-height pair for j in range(f_k[0]): - cy = (j + 0.5) / f_k[0] + cy = (j + 0.5) / (float(f_k[0]) if self.steps is None else image_size[1] / self.steps[k]) for i in range(f_k[1]): - cx = (i + 0.5) / f_k[1] + cx = (i + 0.5) / (float(f_k[1]) if self.steps is None else image_size[0] / self.steps[k]) default_boxes.extend([[cx, cy, w, h] for w, h in self._wh_pairs[k]]) dboxes = [] diff --git a/torchvision/models/detection/ssd.py b/torchvision/models/detection/ssd.py index 095bf3fe17f..5788b815f14 100644 --- a/torchvision/models/detection/ssd.py +++ b/torchvision/models/detection/ssd.py @@ -10,7 +10,7 @@ from .anchor_utils import DBoxGenerator from .backbone_utils import _validate_trainable_layers from .transform import GeneralizedRCNNTransform -from .. import vgg, resnet +from .. import vgg from ..utils import load_state_dict_from_url from ...ops import boxes as box_ops @@ -21,6 +21,12 @@ 'ssd300_vgg16_coco': None, # TODO: Add url with weights } +backbone_urls = { + # We port the features of a VGG16 backbone trained by amdegroot because unlike one on TorchVision, it uses the same + # input standardization method as the paper. 
Ref: https://s3.amazonaws.com/amdegroot-models/vgg16_reducedfc.pth + 'vgg16_features': 'https://download.pytorch.org/models/vgg16_features-amdegroot.pth' # TODO: upload +} + def _sum(x: List[Tensor]) -> Tensor: res = x[0] @@ -121,6 +127,7 @@ class SSD(nn.Module): def __init__(self, backbone: SSDFeatureExtractor, size: Tuple[int, int], num_classes: int, image_mean: Optional[List[float]] = None, image_std: Optional[List[float]] = None, + dbox_steps: Optional[List[int]] = None, score_thresh: float = 0.01, nms_thresh: float = 0.45, detections_per_img: int = 200, @@ -136,6 +143,8 @@ def __init__(self, backbone: SSDFeatureExtractor, size: Tuple[int, int], num_cla out_channels = [x[1] for x in tmp_sizes] assert len(out_channels) == len(backbone.aspect_ratios) + if dbox_steps is not None: + assert len(out_channels) == len(dbox_steps) self.backbone = backbone @@ -145,7 +154,7 @@ def __init__(self, backbone: SSDFeatureExtractor, size: Tuple[int, int], num_cla self.num_anchors = [2 + 2 * len(r) for r in backbone.aspect_ratios] self.head = SSDHead(out_channels, self.num_anchors, num_classes) - self.anchor_generator = DBoxGenerator(backbone.aspect_ratios) + self.anchor_generator = DBoxGenerator(backbone.aspect_ratios, steps=dbox_steps) self.proposal_matcher = det_utils.DBoxMatcher(iou_thresh) @@ -448,8 +457,17 @@ def forward(self, x: Tensor) -> Dict[str, Tensor]: return OrderedDict([(str(i), v) for i, v in enumerate(output)]) -def _vgg_extractor(backbone_name: str, highres: bool, pretrained: bool, trainable_layers: int): - backbone = vgg.__dict__[backbone_name](pretrained=pretrained).features +def _vgg_extractor(backbone_name: str, highres: bool, progress: bool, pretrained: bool, trainable_layers: int): + if backbone_name in backbone_urls: + # Use custom backbones more appropriate for SSD + arch = backbone_name.split('_')[0] + backbone = vgg.__dict__[arch](pretrained=False, progress=progress).features + if pretrained: + state_dict = load_state_dict_from_url(backbone_urls[backbone_name], progress=progress) + backbone.load_state_dict(state_dict) + else: + # Use standard backbones from TorchVision + backbone = vgg.__dict__[backbone_name](pretrained=pretrained, progress=progress).features # Gather the indices of maxpools. These are the locations of output blocks. stage_indices = [i for i, b in enumerate(backbone) if isinstance(b, nn.MaxPool2d)] @@ -475,8 +493,9 @@ def ssd300_vgg16(pretrained: bool = False, progress: bool = True, num_classes: i # no need to download the backbone if pretrained is set pretrained_backbone = False - backbone = _vgg_extractor("vgg16", False, pretrained_backbone, trainable_backbone_layers) - model = SSD(backbone, (300, 300), num_classes, **kwargs) + backbone = _vgg_extractor("vgg16_features", False, progress, pretrained_backbone, trainable_backbone_layers) + model = SSD(backbone, (300, 300), num_classes, image_mean=[123., 117., 104.], image_std=[1., 1., 1.], + dbox_steps=[8, 16, 32, 64, 100, 300], **kwargs) if pretrained: weights_name = 'ssd300_vgg16_coco' if model_urls.get(weights_name, None) is None: From cdcbbcdaf64f0750bd82f23a87801b6c09a533b9 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Mon, 26 Apr 2021 13:16:58 +0100 Subject: [PATCH 63/92] Refactor and clean up the API. 
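The previous patch (PATCH 62) lets the default-box generator take explicit steps, so box centers follow the paper's fixed strides instead of being derived from the actual feature-map size; for the SSD300 configuration the two formulas differ only slightly. A small sketch of the center computation for the conv4_3 map (38x38 map, stride 8, 300-pixel input, as configured in the ssd300_vgg16 builder above):

import torch

image_size, feat_size, step = 300, 38, 8       # conv4_3 settings from the ssd300 configuration
i = torch.arange(feat_size)

cx_from_fmap = (i + 0.5) / feat_size            # previous behaviour: divide by the feature-map size
cx_from_step = (i + 0.5) / (image_size / step)  # behaviour when steps are provided

print(cx_from_fmap[:3])   # tensor([0.0132, 0.0395, 0.0658])
print(cx_from_step[:3])   # tensor([0.0133, 0.0400, 0.0667])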
--- torchvision/models/detection/ssd.py | 33 ++++++++++------------------- 1 file changed, 11 insertions(+), 22 deletions(-) diff --git a/torchvision/models/detection/ssd.py b/torchvision/models/detection/ssd.py index 5788b815f14..cdcf44f3884 100644 --- a/torchvision/models/detection/ssd.py +++ b/torchvision/models/detection/ssd.py @@ -15,7 +15,7 @@ from ...ops import boxes as box_ops -__all__ = ['SSD', 'SSDFeatureExtractor', 'ssd300_vgg16'] +__all__ = ['SSD', 'ssd300_vgg16'] model_urls = { 'ssd300_vgg16_coco': None, # TODO: Add url with weights @@ -113,21 +113,14 @@ def __init__(self, in_channels: List[int], num_anchors: List[int]): super().__init__(bbox_reg, 4) -class SSDFeatureExtractor(nn.Module): - def __init__(self, aspect_ratios: List[List[int]]): - super().__init__() - self.aspect_ratios = aspect_ratios - - class SSD(nn.Module): __annotations__ = { 'box_coder': det_utils.BoxCoder, 'proposal_matcher': det_utils.Matcher, } - def __init__(self, backbone: SSDFeatureExtractor, size: Tuple[int, int], num_classes: int, + def __init__(self, backbone: nn.Module, anchor_generator: DBoxGenerator, size: Tuple[int, int], num_classes: int, image_mean: Optional[List[float]] = None, image_std: Optional[List[float]] = None, - dbox_steps: Optional[List[int]] = None, score_thresh: float = 0.01, nms_thresh: float = 0.45, detections_per_img: int = 200, @@ -142,20 +135,18 @@ def __init__(self, backbone: SSDFeatureExtractor, size: Tuple[int, int], num_cla tmp_sizes = [x.size() for x in backbone(tmp_img).values()] out_channels = [x[1] for x in tmp_sizes] - assert len(out_channels) == len(backbone.aspect_ratios) - if dbox_steps is not None: - assert len(out_channels) == len(dbox_steps) + assert len(out_channels) == len(anchor_generator.aspect_ratios) self.backbone = backbone + self.anchor_generator = anchor_generator + self.box_coder = det_utils.BoxCoder(weights=(10., 10., 5., 5.)) # Estimate num of anchors based on aspect ratios: 2 default boxes + 2 * ratios of feaure map. 
- self.num_anchors = [2 + 2 * len(r) for r in backbone.aspect_ratios] + self.num_anchors = [2 + 2 * len(r) for r in anchor_generator.aspect_ratios] self.head = SSDHead(out_channels, self.num_anchors, num_classes) - self.anchor_generator = DBoxGenerator(backbone.aspect_ratios, steps=dbox_steps) - self.proposal_matcher = det_utils.DBoxMatcher(iou_thresh) if image_mean is None: @@ -372,12 +363,9 @@ def postprocess_detections(self, head_outputs: Dict[str, Tensor], image_anchors: return detections -class SSDFeatureExtractorVGG(SSDFeatureExtractor): +class SSDFeatureExtractorVGG(nn.Module): def __init__(self, backbone: nn.Module, highres: bool): - aspect_ratios = [[2], [2, 3], [2, 3], [2, 3], [2], [2]] - if highres: - aspect_ratios.append([2]) - super().__init__(aspect_ratios) + super().__init__() _, _, maxpool3_pos, maxpool4_pos, _ = (i for i, layer in enumerate(backbone) if isinstance(layer, nn.MaxPool2d)) @@ -494,8 +482,9 @@ def ssd300_vgg16(pretrained: bool = False, progress: bool = True, num_classes: i pretrained_backbone = False backbone = _vgg_extractor("vgg16_features", False, progress, pretrained_backbone, trainable_backbone_layers) - model = SSD(backbone, (300, 300), num_classes, image_mean=[123., 117., 104.], image_std=[1., 1., 1.], - dbox_steps=[8, 16, 32, 64, 100, 300], **kwargs) + anchor_generator = DBoxGenerator([[2], [2, 3], [2, 3], [2, 3], [2], [2]], steps=[8, 16, 32, 64, 100, 300]) + model = SSD(backbone, anchor_generator, (300, 300), num_classes, + image_mean=[123., 117., 104.], image_std=[1., 1., 1.], **kwargs) if pretrained: weights_name = 'ssd300_vgg16_coco' if model_urls.get(weights_name, None) is None: From efebeb54f96bd8b3dfa0b6c870058df1fd2c2536 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Mon, 26 Apr 2021 13:24:33 +0100 Subject: [PATCH 64/92] Fix lint --- torchvision/models/detection/ssd.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/torchvision/models/detection/ssd.py b/torchvision/models/detection/ssd.py index cdcf44f3884..3a289089d48 100644 --- a/torchvision/models/detection/ssd.py +++ b/torchvision/models/detection/ssd.py @@ -14,7 +14,6 @@ from ..utils import load_state_dict_from_url from ...ops import boxes as box_ops - __all__ = ['SSD', 'ssd300_vgg16'] model_urls = { @@ -183,8 +182,8 @@ def compute_loss(self, targets: List[Dict[str, Tensor]], head_outputs: Dict[str, num_foreground = 0 bbox_loss = [] cls_targets = [] - for targets_per_image, bbox_regression_per_image, cls_logits_per_image, anchors_per_image, \ - matched_idxs_per_image in zip(targets, bbox_regression, cls_logits, anchors, matched_idxs): + for (targets_per_image, bbox_regression_per_image, cls_logits_per_image, anchors_per_image, + matched_idxs_per_image) in zip(targets, bbox_regression, cls_logits, anchors, matched_idxs): # produce the matching between boxes and targets foreground_idxs_per_image = torch.where(matched_idxs_per_image >= 0)[0] foreground_matched_idxs_per_image = matched_idxs_per_image[foreground_idxs_per_image] From 90e7b676fc5eb9ff70d84c2656cddf02500fc3db Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Mon, 26 Apr 2021 13:30:10 +0100 Subject: [PATCH 65/92] Update todos and comments. 
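After the refactor in PATCH 63 above, the anchor generator is built by the caller and injected into SSD instead of being created internally. Mirroring the ssd300_vgg16 builder at this point in the series, a model could be assembled roughly as below (module paths and names are those of this branch; DBoxGenerator and DBoxMatcher are renamed in a later patch, so treat this as a sketch rather than a stable API):

from torchvision.models.detection.anchor_utils import DBoxGenerator
from torchvision.models.detection.ssd import SSD, _vgg_extractor

backbone = _vgg_extractor("vgg16_features", highres=False, progress=True,
                          pretrained=False, trainable_layers=3)
anchor_generator = DBoxGenerator([[2], [2, 3], [2, 3], [2, 3], [2], [2]],
                                 steps=[8, 16, 32, 64, 100, 300])
model = SSD(backbone, anchor_generator, size=(300, 300), num_classes=91,
            image_mean=[123., 117., 104.], image_std=[1., 1., 1.])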
--- torchvision/models/detection/ssd.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/torchvision/models/detection/ssd.py b/torchvision/models/detection/ssd.py index 3a289089d48..a3b95697f72 100644 --- a/torchvision/models/detection/ssd.py +++ b/torchvision/models/detection/ssd.py @@ -21,9 +21,9 @@ } backbone_urls = { - # We port the features of a VGG16 backbone trained by amdegroot because unlike one on TorchVision, it uses the same - # input standardization method as the paper. Ref: https://s3.amazonaws.com/amdegroot-models/vgg16_reducedfc.pth - 'vgg16_features': 'https://download.pytorch.org/models/vgg16_features-amdegroot.pth' # TODO: upload + # We port the features of a VGG16 backbone trained by amdegroot because unlike the one on TorchVision, it uses the + # same input standardization method as the paper. Ref: https://s3.amazonaws.com/amdegroot-models/vgg16_reducedfc.pth + 'vgg16_features': 'https://download.pytorch.org/models/vgg16_features-amdegroot.pth' } @@ -153,7 +153,6 @@ def __init__(self, backbone: nn.Module, anchor_generator: DBoxGenerator, size: T if image_std is None: image_std = [0.229, 0.224, 0.225] self.transform = GeneralizedRCNNTransform(min(size), max(size), image_mean, image_std, - # TODO: Discuss/refactor these workarounds size_divisible=1, fixed_size=size) self.score_thresh = score_thresh From 8ec186e0b36743ca76371a19ec0d338ed875380a Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Mon, 26 Apr 2021 20:26:48 +0100 Subject: [PATCH 66/92] Adding RandomHorizontalFlip and RandomIoUCrop transforms. --- torchvision/models/detection/transform.py | 91 ++++++++++++++++++++++- 1 file changed, 87 insertions(+), 4 deletions(-) diff --git a/torchvision/models/detection/transform.py b/torchvision/models/detection/transform.py index 3218380699b..6d404372ce1 100644 --- a/torchvision/models/detection/transform.py +++ b/torchvision/models/detection/transform.py @@ -1,8 +1,10 @@ import math import torch -from torch import nn, Tensor -from torch.nn import functional as F import torchvision + +from torch import nn, Tensor +from torchvision.transforms import functional as F +from torchvision.transforms import transforms as T from typing import List, Tuple, Dict, Optional from .image_list import ImageList @@ -55,8 +57,8 @@ def _resize_image_and_masks(image: Tensor, self_min_size: float, self_max_size: if "masks" in target: mask = target["masks"] - mask = F.interpolate(mask[:, None].float(), size=size, scale_factor=scale_factor, - recompute_scale_factor=recompute_scale_factor)[:, 0].byte() + mask = torch.nn.functional.interpolate(mask[:, None].float(), size=size, scale_factor=scale_factor, + recompute_scale_factor=recompute_scale_factor)[:, 0].byte() target["masks"] = mask return image, target @@ -288,3 +290,84 @@ def resize_boxes(boxes, original_size, new_size): ymin = ymin * ratio_height ymax = ymax * ratio_height return torch.stack((xmin, ymin, xmax, ymax), dim=1) + + +class RandomHorizontalFlip(T.RandomHorizontalFlip): + def forward(self, img: Tensor, + target: Optional[Dict[str, Tensor]] = None) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: + if torch.rand(1) < self.p: + img = F.hflip(img) + if target is not None: + width, _ = F._get_image_size(img) + target["boxes"][:, [0, 2]] = width - target["boxes"][:, [2, 0]] + return img, target + + +class RandomIoUCrop(nn.Module): + def __init__(self, min_scale: float = 0.3, max_scale: float = 1.0, min_aspect_ratio: float = 0.5, + max_aspect_ratio: float = 2.0, sampler_options: Optional[List[float]] = None, 
trials: int = 40): + super().__init__() + # Configuration similar to https://github.com/weiliu89/caffe/blob/ssd/examples/ssd/ssd_coco.py#L89-L174 + self.min_scale = min_scale + self.max_scale = max_scale + self.min_aspect_ratio = min_aspect_ratio + self.max_aspect_ratio = max_aspect_ratio + if sampler_options is None: + sampler_options = [0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0] + self.options = sampler_options + self.trials = trials + + def forward(self, img: Tensor, + target: Optional[Dict[str, Tensor]] = None) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: + assert target is not None + orig_w, orig_h = F._get_image_size(img) + + while True: + # sample an option + idx = int(torch.randint(low=0, high=len(self.options), size=(1,))) + min_jaccard_overlap = self.options[idx] + if min_jaccard_overlap >= 1.0: # a value larger than 1 encodes the leave as-is option + return img, target + + for _ in range(self.trials): + # check the aspect ratio limitations + r = self.min_scale + (self.max_scale - self.min_scale) * torch.rand(2) + new_w = int(orig_w * r[0]) + new_h = int(orig_h * r[1]) + aspect_ratio = new_w / new_h + if not (self.min_aspect_ratio <= aspect_ratio <= self.max_aspect_ratio): + continue + + # check for 0 area crops + r = torch.rand(2) + left = int((orig_w - new_w) * r[0]) + top = int((orig_h - new_h) * r[1]) + right = left + new_w + bottom = top + new_h + if left == right or top == bottom: + continue + + # check for any valid boxes with centers within the crop area + cx = 0.5 * (target["boxes"][:, 0] + target["boxes"][:, 2]) + cy = 0.5 * (target["boxes"][:, 1] + target["boxes"][:, 3]) + is_within_crop_area = (left < cx) & (cx < right) & (top < cy) & (cy < bottom) + if not is_within_crop_area.any(): + continue + + # check at least 1 box with jaccard limitations + boxes = target["boxes"][is_within_crop_area] + ious = torchvision.ops.boxes.box_iou(boxes, torch.tensor([[left, top, right, bottom]], + dtype=boxes.dtype, device=boxes.device)) + if ious.max() < min_jaccard_overlap: + continue + + # keep only valid boxes and perform cropping + target["boxes"] = boxes + target["labels"] = target["labels"][is_within_crop_area] + target["boxes"][:, 0::2] -= left + target["boxes"][:, 1::2] -= top + target["boxes"][:, 0::2].clamp_(min=0, max=new_w) + target["boxes"][:, 1::2].clamp_(min=0, max=new_h) + img = F.crop(img, top, left, new_h, new_w) + + return img, target From ebb7f903e6802d2b80ff84a4e32b85dec56a01a5 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Mon, 26 Apr 2021 20:55:30 +0100 Subject: [PATCH 67/92] Adding necessary checks to our tranforms. --- torchvision/models/detection/transform.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/torchvision/models/detection/transform.py b/torchvision/models/detection/transform.py index 6d404372ce1..836491a6f51 100644 --- a/torchvision/models/detection/transform.py +++ b/torchvision/models/detection/transform.py @@ -319,7 +319,15 @@ def __init__(self, min_scale: float = 0.3, max_scale: float = 1.0, min_aspect_ra def forward(self, img: Tensor, target: Optional[Dict[str, Tensor]] = None) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: - assert target is not None + if target is None: + raise ValueError("The targets can't be None for this transform.") + + if isinstance(img, torch.Tensor): + if img.ndimension() not in {2, 3}: + raise ValueError('image should be 2/3 dimensional. 
Got {} dimensions.'.format(img.ndimension())) + elif img.ndimension() == 2: + img = img.unsqueeze(0) + orig_w, orig_h = F._get_image_size(img) while True: From 92552de3dd470642bd7394239ea22988667021a8 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Tue, 27 Apr 2021 13:30:20 +0100 Subject: [PATCH 68/92] Adding RandomZoomOut. --- torchvision/models/detection/transform.py | 51 +++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/torchvision/models/detection/transform.py b/torchvision/models/detection/transform.py index 836491a6f51..afb0f56de6f 100644 --- a/torchvision/models/detection/transform.py +++ b/torchvision/models/detection/transform.py @@ -379,3 +379,54 @@ def forward(self, img: Tensor, img = F.crop(img, top, left, new_h, new_w) return img, target + + +class RandomZoomOut(nn.Module): + def __init__(self, fill: Optional[List[int]] = None, side_range: Tuple[float, float] = (1., 4.), p=0.5): + super().__init__() + if fill is None: + fill = [0, 0, 0] + self.fill = fill + self.side_range = side_range + if side_range[0] < 1. or side_range[0] > side_range[1]: + raise ValueError("Invalid canvas side range provided {}.".format(side_range)) + self.p = p + + @torch.jit.unused + def _get_fill_value(self, is_pil): + # type: (bool) -> int + # We fake the type to make it work on JIT + return tuple(self.fill) if is_pil else 0 + + def forward(self, img: Tensor, + target: Optional[Dict[str, Tensor]] = None) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: + if torch.rand(1) < self.p: + return img, target + + orig_w, orig_h = F._get_image_size(img) + + r = self.side_range[0] + torch.rand(1) * (self.side_range[1] - self.side_range[0]) + canvas_width = int(orig_w * r) + canvas_height = int(orig_h * r) + + r = torch.rand(2) + left = int((canvas_width - orig_w) * r[0]) + top = int((canvas_height - orig_h) * r[1]) + right = canvas_width - (left + orig_w) + bottom = canvas_height - (top + orig_h) + + if torch.jit.is_scripting(): + fill = 0 + else: + fill = self._get_fill_value(F._is_pil_image(img)) + + img = F.pad(img, [left, top, right, bottom], fill=fill) + if isinstance(img, torch.Tensor): + v = torch.tensor(self.fill, device=img.device, dtype=img.dtype).view(-1, 1, 1) + img[..., :top, :] = img[..., :, :left] = img[..., (top + orig_h):, :] = img[..., :, (left + orig_w):] = v + + if target is not None: + target["boxes"][:, 0::2] += left + target["boxes"][:, 1::2] += top + + return img, target From 9b4b2ce0439a41bf6d2eb21cf6df45f480cacc98 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Tue, 27 Apr 2021 14:59:35 +0100 Subject: [PATCH 69/92] Adding RandomPhotometricDistort. 
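The RandomIoUCrop added above keeps resampling candidate windows until one passes two checks: at least one box center lies strictly inside the window, and the boxes that survive that filter overlap the window by at least the sampled Jaccard threshold (an option of 1.0 means leave the image as-is). A condensed, self-contained sketch of that acceptance test with made-up numbers:

import torch
from torchvision.ops import box_iou

crop = torch.tensor([[50., 40., 250., 220.]])                 # one candidate window (xyxy)
boxes = torch.tensor([[60., 50., 200., 180.],                 # ground-truth boxes (xyxy)
                      [240., 210., 290., 260.]])
min_jaccard_overlap = 0.3                                     # sampled from the options list

# keep only boxes whose center falls strictly inside the candidate window ...
cx = (boxes[:, 0] + boxes[:, 2]) / 2
cy = (boxes[:, 1] + boxes[:, 3]) / 2
inside = (crop[0, 0] < cx) & (cx < crop[0, 2]) & (crop[0, 1] < cy) & (cy < crop[0, 3])

# ... and accept the window only if a surviving box overlaps it enough
accept = bool(inside.any()) and bool(box_iou(boxes[inside], crop).max() >= min_jaccard_overlap)
print(inside, accept)   # tensor([ True, False]) True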
--- torchvision/models/detection/transform.py | 62 ++++++++++++++++++++++- 1 file changed, 60 insertions(+), 2 deletions(-) diff --git a/torchvision/models/detection/transform.py b/torchvision/models/detection/transform.py index afb0f56de6f..8f805b006bc 100644 --- a/torchvision/models/detection/transform.py +++ b/torchvision/models/detection/transform.py @@ -382,7 +382,7 @@ def forward(self, img: Tensor, class RandomZoomOut(nn.Module): - def __init__(self, fill: Optional[List[int]] = None, side_range: Tuple[float, float] = (1., 4.), p=0.5): + def __init__(self, fill: Optional[List[float]] = None, side_range: Tuple[float, float] = (1., 4.), p: float = 0.5): super().__init__() if fill is None: fill = [0, 0, 0] @@ -396,10 +396,16 @@ def __init__(self, fill: Optional[List[int]] = None, side_range: Tuple[float, fl def _get_fill_value(self, is_pil): # type: (bool) -> int # We fake the type to make it work on JIT - return tuple(self.fill) if is_pil else 0 + return tuple(int(x) for x in self.fill) if is_pil else 0 def forward(self, img: Tensor, target: Optional[Dict[str, Tensor]] = None) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: + if isinstance(img, torch.Tensor): + if img.ndimension() not in {2, 3}: + raise ValueError('image should be 2/3 dimensional. Got {} dimensions.'.format(img.ndimension())) + elif img.ndimension() == 2: + img = img.unsqueeze(0) + if torch.rand(1) < self.p: return img, target @@ -430,3 +436,55 @@ def forward(self, img: Tensor, target["boxes"][:, 1::2] += top return img, target + + +class RandomPhotometricDistort(nn.Module): + def __init__(self, contrast: Tuple[float] = (0.5, 1.5), saturation: Tuple[float] = (0.5, 1.5), + hue: Tuple[float] = (-0.05, 0.05), brightness: Tuple[float] = (0.875 , 1.125), p: float = 0.5): + super().__init__() + self._brightness = T.ColorJitter(brightness=brightness) + self._contrast = T.ColorJitter(contrast=contrast) + self._hue = T.ColorJitter(hue=hue) + self._saturation = T.ColorJitter(saturation=saturation) + self.p = p + + def forward(self, img: Tensor, + target: Optional[Dict[str, Tensor]] = None) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: + if isinstance(img, torch.Tensor): + if img.ndimension() not in {2, 3}: + raise ValueError('image should be 2/3 dimensional. Got {} dimensions.'.format(img.ndimension())) + elif img.ndimension() == 2: + img = img.unsqueeze(0) + + r = torch.rand(7) + + if r[0] < self.p: + img = self._brightness(img) + + contrast_before = r[1] < 0.5 + if contrast_before: + if r[2] < self.p: + img = self._contrast(img) + + if r[3] < self.p: + img = self._saturation(img) + + if r[4] < self.p: + img = self._hue(img) + + if not contrast_before: + if r[5] < self.p: + img = self._contrast(img) + + if r[6] < self.p: + channels = F._get_image_num_channels(img) + permutation = torch.randperm(channels) + + is_pil = F._is_pil_image(img) + if is_pil: + img = F.to_tensor(img) + img = img[..., permutation, :, :] + if is_pil: + img = F.to_pil_image(img) + + return img, target From 6f0a61eedb7e12edea11d74a32e0e61d6229bd0d Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Tue, 27 Apr 2021 15:18:33 +0100 Subject: [PATCH 70/92] Moving Detection transforms to references. 
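This patch moves the new augmentations next to the detection reference scripts. Unlike the stock torchvision color transforms they operate on an (image, target) pair, because the geometric ones must update the boxes; the photometric distortion simply passes the target through. A usage sketch after the move, assuming it is run from references/detection where the training scripts import these modules directly (the presets refer to the module as T):

import torch
from PIL import Image
import transforms as T          # references/detection/transforms.py, as used by presets.py

img = Image.new("RGB", (300, 300), color=(128, 64, 32))
target = {"boxes": torch.tensor([[10., 10., 120., 140.]]),
          "labels": torch.tensor([1])}

distort = T.RandomPhotometricDistort(p=0.5)
img, target = distort(img, target)   # colors may be jittered; boxes and labels come back untouched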
--- references/detection/transforms.py | 238 +++++++++++++++++++--- torchvision/models/detection/transform.py | 200 ------------------ 2 files changed, 215 insertions(+), 223 deletions(-) diff --git a/references/detection/transforms.py b/references/detection/transforms.py index 937ae3c07fc..cb666457db5 100644 --- a/references/detection/transforms.py +++ b/references/detection/transforms.py @@ -1,6 +1,10 @@ -import random +import torch +import torchvision +from torch import nn, Tensor from torchvision.transforms import functional as F +from torchvision.transforms import transforms as T +from typing import List, Tuple, Dict, Optional def _flip_coco_person_keypoints(kps, width): @@ -13,37 +17,225 @@ def _flip_coco_person_keypoints(kps, width): return flipped_data -class Compose(object): +class Compose(nn.Module): def __init__(self, transforms): - self.transforms = transforms + super().__init__() + self.transforms = nn.ModuleList(transforms) - def __call__(self, image, target): + def forward(self, image: Tensor, + target: Optional[Dict[str, Tensor]] = None) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: for t in self.transforms: image, target = t(image, target) return image, target -class RandomHorizontalFlip(object): - def __init__(self, prob): - self.prob = prob - - def __call__(self, image, target): - if random.random() < self.prob: - height, width = image.shape[-2:] - image = image.flip(-1) - bbox = target["boxes"] - bbox[:, [0, 2]] = width - bbox[:, [2, 0]] - target["boxes"] = bbox - if "masks" in target: - target["masks"] = target["masks"].flip(-1) - if "keypoints" in target: - keypoints = target["keypoints"] - keypoints = _flip_coco_person_keypoints(keypoints, width) - target["keypoints"] = keypoints +class RandomHorizontalFlip(T.RandomHorizontalFlip): + def forward(self, image: Tensor, + target: Optional[Dict[str, Tensor]] = None) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: + if torch.rand(1) < self.p: + image = F.hflip(image) + if target is not None: + width, _ = F._get_image_size(image) + target["boxes"][:, [0, 2]] = width - target["boxes"][:, [2, 0]] + if "masks" in target: + target["masks"] = target["masks"].flip(-1) + if "keypoints" in target: + keypoints = target["keypoints"] + keypoints = _flip_coco_person_keypoints(keypoints, width) + target["keypoints"] = keypoints return image, target -class ToTensor(object): - def __call__(self, image, target): +class ToTensor(nn.Module): + def forward(self, image: Tensor, + target: Optional[Dict[str, Tensor]] = None) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: image = F.to_tensor(image) return image, target + + +class RandomIoUCrop(nn.Module): + def __init__(self, min_scale: float = 0.3, max_scale: float = 1.0, min_aspect_ratio: float = 0.5, + max_aspect_ratio: float = 2.0, sampler_options: Optional[List[float]] = None, trials: int = 40): + super().__init__() + # Configuration similar to https://github.com/weiliu89/caffe/blob/ssd/examples/ssd/ssd_coco.py#L89-L174 + self.min_scale = min_scale + self.max_scale = max_scale + self.min_aspect_ratio = min_aspect_ratio + self.max_aspect_ratio = max_aspect_ratio + if sampler_options is None: + sampler_options = [0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0] + self.options = sampler_options + self.trials = trials + + def forward(self, image: Tensor, + target: Optional[Dict[str, Tensor]] = None) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: + if target is None: + raise ValueError("The targets can't be None for this transform.") + + if isinstance(image, torch.Tensor): + if image.ndimension() not 
in {2, 3}: + raise ValueError('image should be 2/3 dimensional. Got {} dimensions.'.format(image.ndimension())) + elif image.ndimension() == 2: + image = image.unsqueeze(0) + + orig_w, orig_h = F._get_image_size(image) + + while True: + # sample an option + idx = int(torch.randint(low=0, high=len(self.options), size=(1,))) + min_jaccard_overlap = self.options[idx] + if min_jaccard_overlap >= 1.0: # a value larger than 1 encodes the leave as-is option + return image, target + + for _ in range(self.trials): + # check the aspect ratio limitations + r = self.min_scale + (self.max_scale - self.min_scale) * torch.rand(2) + new_w = int(orig_w * r[0]) + new_h = int(orig_h * r[1]) + aspect_ratio = new_w / new_h + if not (self.min_aspect_ratio <= aspect_ratio <= self.max_aspect_ratio): + continue + + # check for 0 area crops + r = torch.rand(2) + left = int((orig_w - new_w) * r[0]) + top = int((orig_h - new_h) * r[1]) + right = left + new_w + bottom = top + new_h + if left == right or top == bottom: + continue + + # check for any valid boxes with centers within the crop area + cx = 0.5 * (target["boxes"][:, 0] + target["boxes"][:, 2]) + cy = 0.5 * (target["boxes"][:, 1] + target["boxes"][:, 3]) + is_within_crop_area = (left < cx) & (cx < right) & (top < cy) & (cy < bottom) + if not is_within_crop_area.any(): + continue + + # check at least 1 box with jaccard limitations + boxes = target["boxes"][is_within_crop_area] + ious = torchvision.ops.boxes.box_iou(boxes, torch.tensor([[left, top, right, bottom]], + dtype=boxes.dtype, device=boxes.device)) + if ious.max() < min_jaccard_overlap: + continue + + # keep only valid boxes and perform cropping + target["boxes"] = boxes + target["labels"] = target["labels"][is_within_crop_area] + target["boxes"][:, 0::2] -= left + target["boxes"][:, 1::2] -= top + target["boxes"][:, 0::2].clamp_(min=0, max=new_w) + target["boxes"][:, 1::2].clamp_(min=0, max=new_h) + image = F.crop(image, top, left, new_h, new_w) + + return image, target + + +class RandomZoomOut(nn.Module): + def __init__(self, fill: Optional[List[float]] = None, side_range: Tuple[float, float] = (1., 4.), p: float = 0.5): + super().__init__() + if fill is None: + fill = [0, 0, 0] + self.fill = fill + self.side_range = side_range + if side_range[0] < 1. or side_range[0] > side_range[1]: + raise ValueError("Invalid canvas side range provided {}.".format(side_range)) + self.p = p + + @torch.jit.unused + def _get_fill_value(self, is_pil): + # type: (bool) -> int + # We fake the type to make it work on JIT + return tuple(int(x) for x in self.fill) if is_pil else 0 + + def forward(self, image: Tensor, + target: Optional[Dict[str, Tensor]] = None) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: + if isinstance(image, torch.Tensor): + if image.ndimension() not in {2, 3}: + raise ValueError('image should be 2/3 dimensional. 
Got {} dimensions.'.format(image.ndimension())) + elif image.ndimension() == 2: + image = image.unsqueeze(0) + + if torch.rand(1) < self.p: + return image, target + + orig_w, orig_h = F._get_image_size(image) + + r = self.side_range[0] + torch.rand(1) * (self.side_range[1] - self.side_range[0]) + canvas_width = int(orig_w * r) + canvas_height = int(orig_h * r) + + r = torch.rand(2) + left = int((canvas_width - orig_w) * r[0]) + top = int((canvas_height - orig_h) * r[1]) + right = canvas_width - (left + orig_w) + bottom = canvas_height - (top + orig_h) + + if torch.jit.is_scripting(): + fill = 0 + else: + fill = self._get_fill_value(F._is_pil_image(image)) + + image = F.pad(image, [left, top, right, bottom], fill=fill) + if isinstance(image, torch.Tensor): + v = torch.tensor(self.fill, device=image.device, dtype=image.dtype).view(-1, 1, 1) + image[..., :top, :] = image[..., :, :left] = image[..., (top + orig_h):, :] = \ + image[..., :, (left + orig_w):] = v + + if target is not None: + target["boxes"][:, 0::2] += left + target["boxes"][:, 1::2] += top + + return image, target + + +class RandomPhotometricDistort(nn.Module): + def __init__(self, contrast: Tuple[float] = (0.5, 1.5), saturation: Tuple[float] = (0.5, 1.5), + hue: Tuple[float] = (-0.05, 0.05), brightness: Tuple[float] = (0.875 , 1.125), p: float = 0.5): + super().__init__() + self._brightness = T.ColorJitter(brightness=brightness) + self._contrast = T.ColorJitter(contrast=contrast) + self._hue = T.ColorJitter(hue=hue) + self._saturation = T.ColorJitter(saturation=saturation) + self.p = p + + def forward(self, image: Tensor, + target: Optional[Dict[str, Tensor]] = None) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: + if isinstance(image, torch.Tensor): + if image.ndimension() not in {2, 3}: + raise ValueError('image should be 2/3 dimensional. 
Got {} dimensions.'.format(image.ndimension())) + elif image.ndimension() == 2: + image = image.unsqueeze(0) + + r = torch.rand(7) + + if r[0] < self.p: + image = self._brightness(image) + + contrast_before = r[1] < 0.5 + if contrast_before: + if r[2] < self.p: + image = self._contrast(image) + + if r[3] < self.p: + image = self._saturation(image) + + if r[4] < self.p: + image = self._hue(image) + + if not contrast_before: + if r[5] < self.p: + image = self._contrast(image) + + if r[6] < self.p: + channels = F._get_image_num_channels(image) + permutation = torch.randperm(channels) + + is_pil = F._is_pil_image(image) + if is_pil: + image = F.to_tensor(image) + image = image[..., permutation, :, :] + if is_pil: + image = F.to_pil_image(image) + + return image, target diff --git a/torchvision/models/detection/transform.py b/torchvision/models/detection/transform.py index 8f805b006bc..55cfb483847 100644 --- a/torchvision/models/detection/transform.py +++ b/torchvision/models/detection/transform.py @@ -3,8 +3,6 @@ import torchvision from torch import nn, Tensor -from torchvision.transforms import functional as F -from torchvision.transforms import transforms as T from typing import List, Tuple, Dict, Optional from .image_list import ImageList @@ -290,201 +288,3 @@ def resize_boxes(boxes, original_size, new_size): ymin = ymin * ratio_height ymax = ymax * ratio_height return torch.stack((xmin, ymin, xmax, ymax), dim=1) - - -class RandomHorizontalFlip(T.RandomHorizontalFlip): - def forward(self, img: Tensor, - target: Optional[Dict[str, Tensor]] = None) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: - if torch.rand(1) < self.p: - img = F.hflip(img) - if target is not None: - width, _ = F._get_image_size(img) - target["boxes"][:, [0, 2]] = width - target["boxes"][:, [2, 0]] - return img, target - - -class RandomIoUCrop(nn.Module): - def __init__(self, min_scale: float = 0.3, max_scale: float = 1.0, min_aspect_ratio: float = 0.5, - max_aspect_ratio: float = 2.0, sampler_options: Optional[List[float]] = None, trials: int = 40): - super().__init__() - # Configuration similar to https://github.com/weiliu89/caffe/blob/ssd/examples/ssd/ssd_coco.py#L89-L174 - self.min_scale = min_scale - self.max_scale = max_scale - self.min_aspect_ratio = min_aspect_ratio - self.max_aspect_ratio = max_aspect_ratio - if sampler_options is None: - sampler_options = [0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0] - self.options = sampler_options - self.trials = trials - - def forward(self, img: Tensor, - target: Optional[Dict[str, Tensor]] = None) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: - if target is None: - raise ValueError("The targets can't be None for this transform.") - - if isinstance(img, torch.Tensor): - if img.ndimension() not in {2, 3}: - raise ValueError('image should be 2/3 dimensional. 
Got {} dimensions.'.format(img.ndimension())) - elif img.ndimension() == 2: - img = img.unsqueeze(0) - - orig_w, orig_h = F._get_image_size(img) - - while True: - # sample an option - idx = int(torch.randint(low=0, high=len(self.options), size=(1,))) - min_jaccard_overlap = self.options[idx] - if min_jaccard_overlap >= 1.0: # a value larger than 1 encodes the leave as-is option - return img, target - - for _ in range(self.trials): - # check the aspect ratio limitations - r = self.min_scale + (self.max_scale - self.min_scale) * torch.rand(2) - new_w = int(orig_w * r[0]) - new_h = int(orig_h * r[1]) - aspect_ratio = new_w / new_h - if not (self.min_aspect_ratio <= aspect_ratio <= self.max_aspect_ratio): - continue - - # check for 0 area crops - r = torch.rand(2) - left = int((orig_w - new_w) * r[0]) - top = int((orig_h - new_h) * r[1]) - right = left + new_w - bottom = top + new_h - if left == right or top == bottom: - continue - - # check for any valid boxes with centers within the crop area - cx = 0.5 * (target["boxes"][:, 0] + target["boxes"][:, 2]) - cy = 0.5 * (target["boxes"][:, 1] + target["boxes"][:, 3]) - is_within_crop_area = (left < cx) & (cx < right) & (top < cy) & (cy < bottom) - if not is_within_crop_area.any(): - continue - - # check at least 1 box with jaccard limitations - boxes = target["boxes"][is_within_crop_area] - ious = torchvision.ops.boxes.box_iou(boxes, torch.tensor([[left, top, right, bottom]], - dtype=boxes.dtype, device=boxes.device)) - if ious.max() < min_jaccard_overlap: - continue - - # keep only valid boxes and perform cropping - target["boxes"] = boxes - target["labels"] = target["labels"][is_within_crop_area] - target["boxes"][:, 0::2] -= left - target["boxes"][:, 1::2] -= top - target["boxes"][:, 0::2].clamp_(min=0, max=new_w) - target["boxes"][:, 1::2].clamp_(min=0, max=new_h) - img = F.crop(img, top, left, new_h, new_w) - - return img, target - - -class RandomZoomOut(nn.Module): - def __init__(self, fill: Optional[List[float]] = None, side_range: Tuple[float, float] = (1., 4.), p: float = 0.5): - super().__init__() - if fill is None: - fill = [0, 0, 0] - self.fill = fill - self.side_range = side_range - if side_range[0] < 1. or side_range[0] > side_range[1]: - raise ValueError("Invalid canvas side range provided {}.".format(side_range)) - self.p = p - - @torch.jit.unused - def _get_fill_value(self, is_pil): - # type: (bool) -> int - # We fake the type to make it work on JIT - return tuple(int(x) for x in self.fill) if is_pil else 0 - - def forward(self, img: Tensor, - target: Optional[Dict[str, Tensor]] = None) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: - if isinstance(img, torch.Tensor): - if img.ndimension() not in {2, 3}: - raise ValueError('image should be 2/3 dimensional. 
Got {} dimensions.'.format(img.ndimension())) - elif img.ndimension() == 2: - img = img.unsqueeze(0) - - if torch.rand(1) < self.p: - return img, target - - orig_w, orig_h = F._get_image_size(img) - - r = self.side_range[0] + torch.rand(1) * (self.side_range[1] - self.side_range[0]) - canvas_width = int(orig_w * r) - canvas_height = int(orig_h * r) - - r = torch.rand(2) - left = int((canvas_width - orig_w) * r[0]) - top = int((canvas_height - orig_h) * r[1]) - right = canvas_width - (left + orig_w) - bottom = canvas_height - (top + orig_h) - - if torch.jit.is_scripting(): - fill = 0 - else: - fill = self._get_fill_value(F._is_pil_image(img)) - - img = F.pad(img, [left, top, right, bottom], fill=fill) - if isinstance(img, torch.Tensor): - v = torch.tensor(self.fill, device=img.device, dtype=img.dtype).view(-1, 1, 1) - img[..., :top, :] = img[..., :, :left] = img[..., (top + orig_h):, :] = img[..., :, (left + orig_w):] = v - - if target is not None: - target["boxes"][:, 0::2] += left - target["boxes"][:, 1::2] += top - - return img, target - - -class RandomPhotometricDistort(nn.Module): - def __init__(self, contrast: Tuple[float] = (0.5, 1.5), saturation: Tuple[float] = (0.5, 1.5), - hue: Tuple[float] = (-0.05, 0.05), brightness: Tuple[float] = (0.875 , 1.125), p: float = 0.5): - super().__init__() - self._brightness = T.ColorJitter(brightness=brightness) - self._contrast = T.ColorJitter(contrast=contrast) - self._hue = T.ColorJitter(hue=hue) - self._saturation = T.ColorJitter(saturation=saturation) - self.p = p - - def forward(self, img: Tensor, - target: Optional[Dict[str, Tensor]] = None) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: - if isinstance(img, torch.Tensor): - if img.ndimension() not in {2, 3}: - raise ValueError('image should be 2/3 dimensional. 
Got {} dimensions.'.format(img.ndimension())) - elif img.ndimension() == 2: - img = img.unsqueeze(0) - - r = torch.rand(7) - - if r[0] < self.p: - img = self._brightness(img) - - contrast_before = r[1] < 0.5 - if contrast_before: - if r[2] < self.p: - img = self._contrast(img) - - if r[3] < self.p: - img = self._saturation(img) - - if r[4] < self.p: - img = self._hue(img) - - if not contrast_before: - if r[5] < self.p: - img = self._contrast(img) - - if r[6] < self.p: - channels = F._get_image_num_channels(img) - permutation = torch.randperm(channels) - - is_pil = F._is_pil_image(img) - if is_pil: - img = F.to_tensor(img) - img = img[..., permutation, :, :] - if is_pil: - img = F.to_pil_image(img) - - return img, target From 6ce9bd4acfc2156d9cf38f4c803c30a48a38fc07 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Tue, 27 Apr 2021 15:48:22 +0100 Subject: [PATCH 71/92] Update presets --- references/detection/presets.py | 24 ++++++++++++++++-------- references/detection/train.py | 15 +++++++++++---- references/detection/transforms.py | 8 +++++++- 3 files changed, 34 insertions(+), 13 deletions(-) diff --git a/references/detection/presets.py b/references/detection/presets.py index b0c86ed1265..81803fe9ac4 100644 --- a/references/detection/presets.py +++ b/references/detection/presets.py @@ -2,20 +2,28 @@ class DetectionPresetTrain: - def __init__(self, hflip_prob=0.5): - trans = [T.ToTensor()] - if hflip_prob > 0: - trans.append(T.RandomHorizontalFlip(hflip_prob)) - - self.transforms = T.Compose(trans) + def __init__(self, hflip_prob=0.5, ssd_augmentation=False, mean=(123., 117., 104.), scaling=True): + if ssd_augmentation: + self.transforms = T.Compose([ + T.RandomPhotometricDistort(), + T.RandomZoomOut(fill=list(mean)), + T.RandomIoUCrop(), + T.RandomHorizontalFlip(p=hflip_prob), + T.ToTensor(scaling=scaling), + ]) + else: + self.transforms = T.Compose([ + T.RandomHorizontalFlip(p=hflip_prob), + T.ToTensor(), + ]) def __call__(self, img, target): return self.transforms(img, target) class DetectionPresetEval: - def __init__(self): - self.transforms = T.ToTensor() + def __init__(self, scaling=True): + self.transforms = T.ToTensor(scaling=scaling) def __call__(self, img, target): return self.transforms(img, target) diff --git a/references/detection/train.py b/references/detection/train.py index 491c4c295b7..8b3fdf8ca6a 100644 --- a/references/detection/train.py +++ b/references/detection/train.py @@ -47,8 +47,15 @@ def get_dataset(name, image_set, transform, data_path): return ds, num_classes -def get_transform(train): - return presets.DetectionPresetTrain() if train else presets.DetectionPresetEval() +def get_transform(train, args): + if "ssd" in args.model: + ssd_augmentation = True + scaling = False + else: + ssd_augmentation = False + scaling = True + return presets.DetectionPresetTrain(ssd_augmentation=ssd_augmentation, scaling=scaling) if train \ + else presets.DetectionPresetEval(scaling=scaling) def main(args): @@ -60,8 +67,8 @@ def main(args): # Data loading code print("Loading data") - dataset, num_classes = get_dataset(args.dataset, "train", get_transform(train=True), args.data_path) - dataset_test, _ = get_dataset(args.dataset, "val", get_transform(train=False), args.data_path) + dataset, num_classes = get_dataset(args.dataset, "train", get_transform(True, args), args.data_path) + dataset_test, _ = get_dataset(args.dataset, "val", get_transform(False, args), args.data_path) print("Creating data loaders") if args.distributed: diff --git a/references/detection/transforms.py 
b/references/detection/transforms.py index cb666457db5..a67a426615a 100644 --- a/references/detection/transforms.py +++ b/references/detection/transforms.py @@ -47,9 +47,15 @@ def forward(self, image: Tensor, class ToTensor(nn.Module): + def __init__(self, scaling: bool = True): + super().__init__() + self.scaling = scaling + def forward(self, image: Tensor, target: Optional[Dict[str, Tensor]] = None) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: image = F.to_tensor(image) + if not self.scaling: + image *= 255 return image, target @@ -135,7 +141,7 @@ class RandomZoomOut(nn.Module): def __init__(self, fill: Optional[List[float]] = None, side_range: Tuple[float, float] = (1., 4.), p: float = 0.5): super().__init__() if fill is None: - fill = [0, 0, 0] + fill = [0., 0., 0.] self.fill = fill self.side_range = side_range if side_range[0] < 1. or side_range[0] > side_range[1]: From 60c6f72dd43f77ef8b679468e7dc68fe787e00a0 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Tue, 27 Apr 2021 15:57:15 +0100 Subject: [PATCH 72/92] fix lint --- references/detection/transforms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/references/detection/transforms.py b/references/detection/transforms.py index a67a426615a..a4205c32617 100644 --- a/references/detection/transforms.py +++ b/references/detection/transforms.py @@ -197,7 +197,7 @@ def forward(self, image: Tensor, class RandomPhotometricDistort(nn.Module): def __init__(self, contrast: Tuple[float] = (0.5, 1.5), saturation: Tuple[float] = (0.5, 1.5), - hue: Tuple[float] = (-0.05, 0.05), brightness: Tuple[float] = (0.875 , 1.125), p: float = 0.5): + hue: Tuple[float] = (-0.05, 0.05), brightness: Tuple[float] = (0.875, 1.125), p: float = 0.5): super().__init__() self._brightness = T.ColorJitter(brightness=brightness) self._contrast = T.ColorJitter(contrast=contrast) From ff83c2df06849fb07c9f8074121c25e80b1f1b82 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Tue, 27 Apr 2021 16:06:51 +0100 Subject: [PATCH 73/92] leave compose and object --- references/detection/transforms.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/references/detection/transforms.py b/references/detection/transforms.py index a4205c32617..c0bd5bf3ac2 100644 --- a/references/detection/transforms.py +++ b/references/detection/transforms.py @@ -17,13 +17,12 @@ def _flip_coco_person_keypoints(kps, width): return flipped_data -class Compose(nn.Module): +class Compose(object): def __init__(self, transforms): super().__init__() - self.transforms = nn.ModuleList(transforms) + self.transforms = transforms - def forward(self, image: Tensor, - target: Optional[Dict[str, Tensor]] = None) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: + def __call__(self, image, target): for t in self.transforms: image, target = t(image, target) return image, target From 2423a2a28baa65bbc3998c8a9c4fa6f43d804bee Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Tue, 27 Apr 2021 17:14:19 +0100 Subject: [PATCH 74/92] Adding scaling for completeness. 
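The scaling flag exists because the ported VGG16 weights expect Caffe-style inputs: raw 0-255 pixels with only the per-channel means subtracted. ToTensor(scaling=False) therefore rescales the tensor back to the 0-255 range, and ssd300_vgg16 sets image_mean=[123., 117., 104.] with image_std=[1., 1., 1.] so that the model's internal transform performs exactly that mean subtraction. A small sketch contrasting this with the default ImageNet-style normalization for a single pixel (values are illustrative):

import torch

pixel = torch.tensor([130., 120., 110.])        # one RGB pixel in the 0-255 range

# default detection preprocessing: ToTensor() scales to [0, 1], then ImageNet statistics
imagenet = ((pixel / 255.) - torch.tensor([0.485, 0.456, 0.406])) / torch.tensor([0.229, 0.224, 0.225])

# SSD recipe in this series: ToTensor(scaling=False) keeps 0-255, the model then
# subtracts the pixel means with a unit std, matching the Caffe-trained backbone
ssd = (pixel - torch.tensor([123., 117., 104.])) / torch.tensor([1., 1., 1.])

print(imagenet)   # tensor([0.1083, 0.0651, 0.1128])
print(ssd)        # tensor([7., 3., 6.])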
--- references/detection/presets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/references/detection/presets.py b/references/detection/presets.py index 81803fe9ac4..a22e1b2b616 100644 --- a/references/detection/presets.py +++ b/references/detection/presets.py @@ -14,7 +14,7 @@ def __init__(self, hflip_prob=0.5, ssd_augmentation=False, mean=(123., 117., 104 else: self.transforms = T.Compose([ T.RandomHorizontalFlip(p=hflip_prob), - T.ToTensor(), + T.ToTensor(scaling=scaling), ]) def __call__(self, img, target): From 017c63441e55d90bef39128bbe69bfc2c9bc8f60 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Tue, 27 Apr 2021 18:13:13 +0100 Subject: [PATCH 75/92] Adding params in the repr --- torchvision/models/detection/anchor_utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/torchvision/models/detection/anchor_utils.py b/torchvision/models/detection/anchor_utils.py index 5402d510459..9cae20ad584 100644 --- a/torchvision/models/detection/anchor_utils.py +++ b/torchvision/models/detection/anchor_utils.py @@ -202,7 +202,9 @@ def __init__(self, aspect_ratios: List[List[int]], min_ratio: float = 0.15, max_ def __repr__(self) -> str: s = self.__class__.__name__ + '(' s += 'aspect_ratios={aspect_ratios}' + s += ', clip={clip}' s += ', scales={scales}' + s += ', steps={steps}' s += ')' return s.format(**self.__dict__) From 3669795953919f5f915befb0c811cf356cdf514a Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Tue, 27 Apr 2021 21:33:27 +0100 Subject: [PATCH 76/92] Remove unnecessary import. --- torchvision/models/detection/anchor_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/torchvision/models/detection/anchor_utils.py b/torchvision/models/detection/anchor_utils.py index 9cae20ad584..cf365b223ce 100644 --- a/torchvision/models/detection/anchor_utils.py +++ b/torchvision/models/detection/anchor_utils.py @@ -1,5 +1,4 @@ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. -import itertools import math import torch from torch import nn, Tensor From 0f581d39936c149798a9bc783b4457b8f0614949 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Wed, 28 Apr 2021 11:00:30 +0100 Subject: [PATCH 77/92] minor refactoring --- torchvision/models/detection/ssd.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/torchvision/models/detection/ssd.py b/torchvision/models/detection/ssd.py index a3b95697f72..723c85e696e 100644 --- a/torchvision/models/detection/ssd.py +++ b/torchvision/models/detection/ssd.py @@ -27,13 +27,6 @@ } -def _sum(x: List[Tensor]) -> Tensor: - res = x[0] - for i in x[1:]: - res = res + i - return res - - def _xavier_init(conv: nn.Module): for layer in conv.modules(): if isinstance(layer, nn.Conv2d): @@ -206,6 +199,7 @@ def compute_loss(self, targets: List[Dict[str, Tensor]], head_outputs: Dict[str, targets_per_image['labels'][foreground_matched_idxs_per_image] cls_targets.append(gt_classes_target) + bbox_loss = torch.stack(bbox_loss) cls_targets = torch.stack(cls_targets) # Calculate classification loss @@ -228,7 +222,7 @@ def compute_loss(self, targets: List[Dict[str, Tensor]], head_outputs: Dict[str, N = max(1, num_foreground) return { - 'bbox_regression': _sum(bbox_loss) / N, + 'bbox_regression': bbox_loss.sum() / N, 'classification': (cls_loss[foreground_idxs].sum() + cls_loss[background_idxs].sum()) / N, } From 3fb1e0bf5e361d73632b6a0333fde15a973694d2 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Wed, 28 Apr 2021 13:42:36 +0100 Subject: [PATCH 78/92] Remove unnecessary call. 
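Compose is left as a plain object, presumably because the composed transforms are ordinary callables over (image, target) pairs and nothing needs to be registered as a submodule, so the leftover super().__init__() call removed below serves no purpose. A hedged sketch of how it is driven, mirroring the SSD preset defined earlier in the series (fill values taken from the preset's default mean):

    import transforms as T  # references/detection/transforms.py

    # Build the SSD-style augmentation pipeline by hand.
    ssd_transforms = T.Compose([
        T.RandomPhotometricDistort(),
        T.RandomZoomOut(fill=[123., 117., 104.]),
        T.RandomIoUCrop(),
        T.RandomHorizontalFlip(p=0.5),
        T.ToTensor(),
    ])

    # Every transform consumes and returns an (image, target) pair:
    # image, target = ssd_transforms(image, target)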
--- references/detection/transforms.py | 1 - 1 file changed, 1 deletion(-) diff --git a/references/detection/transforms.py b/references/detection/transforms.py index c0bd5bf3ac2..e71ffab110e 100644 --- a/references/detection/transforms.py +++ b/references/detection/transforms.py @@ -19,7 +19,6 @@ def _flip_coco_person_keypoints(kps, width): class Compose(object): def __init__(self, transforms): - super().__init__() self.transforms = transforms def __call__(self, image, target): From 10848474c5af9f5ac29478e1d4aba2d8e3340f5c Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Wed, 28 Apr 2021 17:23:37 +0100 Subject: [PATCH 79/92] Give better names to DBox* classes --- test/test_models_detection_anchor_utils.py | 6 +++--- torchvision/models/detection/_utils.py | 2 +- torchvision/models/detection/anchor_utils.py | 2 +- torchvision/models/detection/ssd.py | 8 ++++---- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/test/test_models_detection_anchor_utils.py b/test/test_models_detection_anchor_utils.py index e1abbc0bbd2..7ff95c45337 100644 --- a/test/test_models_detection_anchor_utils.py +++ b/test/test_models_detection_anchor_utils.py @@ -1,6 +1,6 @@ import torch from common_utils import TestCase -from torchvision.models.detection.anchor_utils import AnchorGenerator, DBoxGenerator +from torchvision.models.detection.anchor_utils import AnchorGenerator, DefaultBoxGenerator from torchvision.models.detection.image_list import ImageList @@ -21,9 +21,9 @@ def _init_test_anchor_generator(self): return anchor_generator - def _init_test_dbox_generator(self): + def _init_test_defaultbox_generator(self): aspect_ratios = [[2]] - dbox_generator = DBoxGenerator(aspect_ratios) + dbox_generator = DefaultBoxGenerator(aspect_ratios) return dbox_generator diff --git a/torchvision/models/detection/_utils.py b/torchvision/models/detection/_utils.py index 4144e1495a1..00616a4b7fb 100644 --- a/torchvision/models/detection/_utils.py +++ b/torchvision/models/detection/_utils.py @@ -345,7 +345,7 @@ def set_low_quality_matches_(self, matches, all_matches, match_quality_matrix): matches[pred_inds_to_update] = all_matches[pred_inds_to_update] -class DBoxMatcher(Matcher): +class SSDMatcher(Matcher): def __init__(self, threshold): super().__init__(threshold, threshold, allow_low_quality_matches=False) diff --git a/torchvision/models/detection/anchor_utils.py b/torchvision/models/detection/anchor_utils.py index a6b72f96439..7a01364abe5 100644 --- a/torchvision/models/detection/anchor_utils.py +++ b/torchvision/models/detection/anchor_utils.py @@ -158,7 +158,7 @@ def forward(self, image_list: ImageList, feature_maps: List[Tensor]) -> List[Ten return anchors -class DBoxGenerator(nn.Module): +class DefaultBoxGenerator(nn.Module): def __init__(self, aspect_ratios: List[List[int]], min_ratio: float = 0.15, max_ratio: float = 0.9, steps: Optional[List[int]] = None, clip: bool = True): diff --git a/torchvision/models/detection/ssd.py b/torchvision/models/detection/ssd.py index 723c85e696e..f5d7481ba91 100644 --- a/torchvision/models/detection/ssd.py +++ b/torchvision/models/detection/ssd.py @@ -7,7 +7,7 @@ from typing import Any, Dict, List, Optional, Tuple from . import _utils as det_utils -from .anchor_utils import DBoxGenerator +from .anchor_utils import DefaultBoxGenerator from .backbone_utils import _validate_trainable_layers from .transform import GeneralizedRCNNTransform from .. 
import vgg @@ -111,7 +111,7 @@ class SSD(nn.Module): 'proposal_matcher': det_utils.Matcher, } - def __init__(self, backbone: nn.Module, anchor_generator: DBoxGenerator, size: Tuple[int, int], num_classes: int, + def __init__(self, backbone: nn.Module, anchor_generator: DefaultBoxGenerator, size: Tuple[int, int], num_classes: int, image_mean: Optional[List[float]] = None, image_std: Optional[List[float]] = None, score_thresh: float = 0.01, nms_thresh: float = 0.45, @@ -139,7 +139,7 @@ def __init__(self, backbone: nn.Module, anchor_generator: DBoxGenerator, size: T self.num_anchors = [2 + 2 * len(r) for r in anchor_generator.aspect_ratios] self.head = SSDHead(out_channels, self.num_anchors, num_classes) - self.proposal_matcher = det_utils.DBoxMatcher(iou_thresh) + self.proposal_matcher = det_utils.SSDMatcher(iou_thresh) if image_mean is None: image_mean = [0.485, 0.456, 0.406] @@ -474,7 +474,7 @@ def ssd300_vgg16(pretrained: bool = False, progress: bool = True, num_classes: i pretrained_backbone = False backbone = _vgg_extractor("vgg16_features", False, progress, pretrained_backbone, trainable_backbone_layers) - anchor_generator = DBoxGenerator([[2], [2, 3], [2, 3], [2, 3], [2], [2]], steps=[8, 16, 32, 64, 100, 300]) + anchor_generator = DefaultBoxGenerator([[2], [2, 3], [2, 3], [2, 3], [2], [2]], steps=[8, 16, 32, 64, 100, 300]) model = SSD(backbone, anchor_generator, (300, 300), num_classes, image_mean=[123., 117., 104.], image_std=[1., 1., 1.], **kwargs) if pretrained: From 57140bb4d3a0c2a06416a4ce9334ae2d1fe29041 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Wed, 28 Apr 2021 17:32:49 +0100 Subject: [PATCH 80/92] Port num_anchors estimation in generator --- torchvision/models/detection/anchor_utils.py | 4 ++++ torchvision/models/detection/ssd.py | 3 +-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/torchvision/models/detection/anchor_utils.py b/torchvision/models/detection/anchor_utils.py index 7a01364abe5..06972950707 100644 --- a/torchvision/models/detection/anchor_utils.py +++ b/torchvision/models/detection/anchor_utils.py @@ -197,6 +197,10 @@ def __init__(self, aspect_ratios: List[List[int]], min_ratio: float = 0.15, max_ self._wh_pairs.append(wh_pairs) + def num_anchors_per_location(self): + # Estimate num of anchors based on aspect ratios: 2 default boxes + 2 * ratios of feaure map. + return [2 + 2 * len(r) for r in self.aspect_ratios] + def __repr__(self) -> str: s = self.__class__.__name__ + '(' s += 'aspect_ratios={aspect_ratios}' diff --git a/torchvision/models/detection/ssd.py b/torchvision/models/detection/ssd.py index f5d7481ba91..705131e640f 100644 --- a/torchvision/models/detection/ssd.py +++ b/torchvision/models/detection/ssd.py @@ -135,8 +135,7 @@ def __init__(self, backbone: nn.Module, anchor_generator: DefaultBoxGenerator, s self.box_coder = det_utils.BoxCoder(weights=(10., 10., 5., 5.)) - # Estimate num of anchors based on aspect ratios: 2 default boxes + 2 * ratios of feaure map. 
- self.num_anchors = [2 + 2 * len(r) for r in anchor_generator.aspect_ratios] + self.num_anchors = self.anchor_generator.num_anchors_per_location() self.head = SSDHead(out_channels, self.num_anchors, num_classes) self.proposal_matcher = det_utils.SSDMatcher(iou_thresh) From 8942dd094835be9d3b6cdfb18040df1e7ad77943 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Wed, 28 Apr 2021 18:02:32 +0100 Subject: [PATCH 81/92] Remove rescaling and fix presets --- references/detection/presets.py | 20 +++++++++++--------- references/detection/train.py | 16 +++++----------- references/detection/transforms.py | 6 ------ torchvision/models/detection/ssd.py | 14 ++++++++++---- 4 files changed, 26 insertions(+), 30 deletions(-) diff --git a/references/detection/presets.py b/references/detection/presets.py index a22e1b2b616..22937cf9576 100644 --- a/references/detection/presets.py +++ b/references/detection/presets.py @@ -2,28 +2,30 @@ class DetectionPresetTrain: - def __init__(self, hflip_prob=0.5, ssd_augmentation=False, mean=(123., 117., 104.), scaling=True): - if ssd_augmentation: + def __init__(self, data_augmentation, hflip_prob=0.5, mean=(123., 117., 104.)): + if data_augmentation == 'hflip': + self.transforms = T.Compose([ + T.RandomHorizontalFlip(p=hflip_prob), + T.ToTensor(), + ]) + elif data_augmentation == 'ssd': self.transforms = T.Compose([ T.RandomPhotometricDistort(), T.RandomZoomOut(fill=list(mean)), T.RandomIoUCrop(), T.RandomHorizontalFlip(p=hflip_prob), - T.ToTensor(scaling=scaling), + T.ToTensor(), ]) else: - self.transforms = T.Compose([ - T.RandomHorizontalFlip(p=hflip_prob), - T.ToTensor(scaling=scaling), - ]) + raise ValueError(f'Unknown data augmentation policy "{data_augmentation}"') def __call__(self, img, target): return self.transforms(img, target) class DetectionPresetEval: - def __init__(self, scaling=True): - self.transforms = T.ToTensor(scaling=scaling) + def __init__(self): + self.transforms = T.ToTensor() def __call__(self, img, target): return self.transforms(img, target) diff --git a/references/detection/train.py b/references/detection/train.py index 8b3fdf8ca6a..a4dd82d19a8 100644 --- a/references/detection/train.py +++ b/references/detection/train.py @@ -47,15 +47,8 @@ def get_dataset(name, image_set, transform, data_path): return ds, num_classes -def get_transform(train, args): - if "ssd" in args.model: - ssd_augmentation = True - scaling = False - else: - ssd_augmentation = False - scaling = True - return presets.DetectionPresetTrain(ssd_augmentation=ssd_augmentation, scaling=scaling) if train \ - else presets.DetectionPresetEval(scaling=scaling) +def get_transform(train, data_augmentation): + return presets.DetectionPresetTrain(data_augmentation) if train else presets.DetectionPresetEval() def main(args): @@ -67,8 +60,8 @@ def main(args): # Data loading code print("Loading data") - dataset, num_classes = get_dataset(args.dataset, "train", get_transform(True, args), args.data_path) - dataset_test, _ = get_dataset(args.dataset, "val", get_transform(False, args), args.data_path) + dataset, num_classes = get_dataset(args.dataset, "train", get_transform(True, args.data_augmentation), args.data_path) + dataset_test, _ = get_dataset(args.dataset, "val", get_transform(False, args.data_augmentation), args.data_path) print("Creating data loaders") if args.distributed: @@ -192,6 +185,7 @@ def main(args): parser.add_argument('--rpn-score-thresh', default=None, type=float, help='rpn score threshold for faster-rcnn') parser.add_argument('--trainable-backbone-layers', 
default=None, type=int, help='number of trainable layers of backbone') + parser.add_argument('--data-augmentation', default="hflip", help='data augmentation policy (default: hflip)') parser.add_argument( "--test-only", dest="test_only", diff --git a/references/detection/transforms.py b/references/detection/transforms.py index e71ffab110e..8e4b8870eaf 100644 --- a/references/detection/transforms.py +++ b/references/detection/transforms.py @@ -45,15 +45,9 @@ def forward(self, image: Tensor, class ToTensor(nn.Module): - def __init__(self, scaling: bool = True): - super().__init__() - self.scaling = scaling - def forward(self, image: Tensor, target: Optional[Dict[str, Tensor]] = None) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: image = F.to_tensor(image) - if not self.scaling: - image *= 255 return image, target diff --git a/torchvision/models/detection/ssd.py b/torchvision/models/detection/ssd.py index 705131e640f..52e85c86188 100644 --- a/torchvision/models/detection/ssd.py +++ b/torchvision/models/detection/ssd.py @@ -355,7 +355,7 @@ def postprocess_detections(self, head_outputs: Dict[str, Tensor], image_anchors: class SSDFeatureExtractorVGG(nn.Module): - def __init__(self, backbone: nn.Module, highres: bool): + def __init__(self, backbone: nn.Module, highres: bool, rescaling: bool): super().__init__() _, _, maxpool3_pos, maxpool4_pos, _ = (i for i, layer in enumerate(backbone) if isinstance(layer, nn.MaxPool2d)) @@ -421,8 +421,13 @@ def __init__(self, backbone: nn.Module, highres: bool): fc, )) self.extra = extra + self.rescaling = rescaling def forward(self, x: Tensor) -> Dict[str, Tensor]: + # Undo the 0-1 scaling of toTensor. Necessary for some backbones. + if self.rescaling: + x *= 255 + # L2 regularization + Rescaling of 1st block's feature map x = self.features(x) rescaled = self.scale_weight.view(1, -1, 1, 1) * F.normalize(x) @@ -436,7 +441,8 @@ def forward(self, x: Tensor) -> Dict[str, Tensor]: return OrderedDict([(str(i), v) for i, v in enumerate(output)]) -def _vgg_extractor(backbone_name: str, highres: bool, progress: bool, pretrained: bool, trainable_layers: int): +def _vgg_extractor(backbone_name: str, highres: bool, progress: bool, pretrained: bool, trainable_layers: int, + rescaling: bool): if backbone_name in backbone_urls: # Use custom backbones more appropriate for SSD arch = backbone_name.split('_')[0] @@ -460,7 +466,7 @@ def _vgg_extractor(backbone_name: str, highres: bool, progress: bool, pretrained for parameter in b.parameters(): parameter.requires_grad_(False) - return SSDFeatureExtractorVGG(backbone, highres) + return SSDFeatureExtractorVGG(backbone, highres, rescaling) def ssd300_vgg16(pretrained: bool = False, progress: bool = True, num_classes: int = 91, @@ -472,7 +478,7 @@ def ssd300_vgg16(pretrained: bool = False, progress: bool = True, num_classes: i # no need to download the backbone if pretrained is set pretrained_backbone = False - backbone = _vgg_extractor("vgg16_features", False, progress, pretrained_backbone, trainable_backbone_layers) + backbone = _vgg_extractor("vgg16_features", False, progress, pretrained_backbone, trainable_backbone_layers, True) anchor_generator = DefaultBoxGenerator([[2], [2, 3], [2, 3], [2, 3], [2], [2]], steps=[8, 16, 32, 64, 100, 300]) model = SSD(backbone, anchor_generator, (300, 300), num_classes, image_mean=[123., 117., 104.], image_std=[1., 1., 1.], **kwargs) From 517c1da4e37c12d9268fc68f40d18239448032af Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Wed, 28 Apr 2021 18:36:18 +0100 Subject: [PATCH 82/92] Add the 
ability to pass a custom head and refactoring. --- .../ModelTester.test_ssd300_vgg16_expect.pkl | Bin 6925 -> 6925 bytes torchvision/models/detection/_utils.py | 26 ++++++++++++++++++ torchvision/models/detection/ssd.py | 25 +++++++++-------- 3 files changed, 40 insertions(+), 11 deletions(-) diff --git a/test/expect/ModelTester.test_ssd300_vgg16_expect.pkl b/test/expect/ModelTester.test_ssd300_vgg16_expect.pkl index f96b9055ce7380d0909629ca6c2d67d760da9729..2e8629d31490b42d98548bfafb99e5e6d5c30ba3 100644 GIT binary patch literal 6925 zcmeHMO-vI(6dowQ%dZuYKM@l&m?)Hj2=M?GZ8WQ~)R+_#L&~zD4Gm@5g%FLA3j~f{ zxXI0v2SYr0_QJ{FoufyOUW`9+-p;=6?6^xQXcLo8GJS8}&-dQE*`MxAMDQqnmP8*EE~yG zR+-7@v&+fU+>D_wX49FOm7qd>eSNWN=8$EU@p+wAa2F+_dT&U2>SQE1u91aW* zja?fZ9SV#Eh6A)J=3`QAX_;2v@9`P5W=icBafngxZ`F8XOrOVesu7zQcm= z1o*Bk6aJh*yQB2{a=bsHR@X$|z1EUtgg}pYE?gIBa4g(Fu7G?x;ZUu;h=i<-mwKrN zrE9?b8i3pG=Cd15d)PAv`eC(x@?EN(2@qXg} ztHYt7%lj$!qViw}*`DMX1H}#!Wyxc@v#62nntfTs)?NPBsT=XqA&6jJT`Q*bB^?(iSxOL#( z_OBa`7|%_ASbR4fMw4;>)>w=w@@~23i&>~YAs6;`%=;VZpKF8jccXGXvW=Vmuz9)p zgA9E{9=@0X%Bw){N Fe*jKD zBgCRau23^V5Em^@L4?Fvldo!R9jqaD5Xvp1;^X*gN*7IUE-y+nTPf7! zs|V}KZJb;k$GN$?k9QgG;_T+?#_t=fDwoEjiuoGT%~VDFezQ0h90QJ8A-_Mzn$xF{ zuUQlm!{c(}Vnm6&6rtu!L2R5LE_QNaj7XEOWfQ>H7V!rJ@O6UwDiVm{3FIm3Cde~N zP)q0)(bd__#nF{NFhPS;FX9iH#p%y6DB$aHGz<9pMF|Ql0}+340N;?o;zmWsafSRL zO016+`SDWXKNV#3WkKT{UA}P{#~-SMANCP`xB}j|n1MGD@l6BxBg9em{v3^dXwoz` zu@8%-nEqc~IAhj;FBZ<>Qv;Rdc$5DBEMoWr8La=e3?4ShVA~lPM4ggB{C6_=ra}ff zTp65Cl0i<43@m2Jz{64oa|X*`q>2pE-{t`4c@C)7=0H+)4n&mXKv-@Lblc^Cg zxStIR+p?i(LpD?tXM^^VY^a`-4c)V{;n9?AFr6Tm*=0jgR~9rp%mUW$S%7D=K&LJX z64uMFOR`{Dcot;N%YyA1S@5De6GolQ1odN?uzp@9EceNTzAvR9X_rFd5h+~wK?=N0 zQaH9)3KS_i&K9m7#t202uIs+^W zGQjIzI$XV#4o#=i!S_HqR0OAkY-TznxTV9q{b^uVod(OZ)4<+84gTPy!N{>`a7iZ( z#BCDz>!busk4oUuehC!qlfdW|5(q1iz~N#ElyfEEJ4^zzUyEULj~MJ)#Blk#7=D#c z|AS(XY!-vIPz-}OVt7753>zKAaDKlC@+XQwMMDJEH-*rAR0t~$2|@pW5Jqno!mMxQ zvSmV;5iOMGB80Q@DKHg+xuFp9`v}40PXSzR6aagd0Lm%_FrruheToFoKS==oF2}H+^$sE`y>@!9ZChg?^2;AG8KZxroy4YsgT~ugYpg@ zJZbnp&8ZHE_XIu~;;liD2E(GLo;rdiAsCsc>&#Pp3 z(U}Zpcja?mGDK`o2HD1BI24@>dQr*H5tt03!O5^#Ga0V#O9FHGoV_~<;x;6KM}89U zl9S+^QxfD_C&9=;NibqdB533%f}JE0o`ogC#+iw*(k~JEn#-?mCqTf(1Snsf0HHAn zpxAbaFLheX+vAeh*TN;Y#v}QaPo)_5WA?YyMPD zDa9MxE02*Md1}{NuegUQ2KA?Ql$tBmENVyP6o=wdJ1VDml&AibQa$DS_liStDWy2n zj#BEc^A&#@r+40zr}|MJ{b@XkNA*-r<7xJaNAakhQtD4BwWB;BJ=Ie`8rSw?{K38Y-W%s@{i1dD&X4-j z{!o9)_qL~b(R`_%`q4ZoE|t?ds2{D9QYxo;>i6n5#i5kqQ$Lyq&5z%{&an)p2{g6)l)g`C(WDkbX}-j z?|f;Vl=ki`jZgh-KCT;$L-DDe%4t06PjRU|jn{i!6z9{C$O9|9`7Ats}yVsl*%ep3;EZH==T{~x&VC>_8eR(-V? 
z-p}0$MRx1S^V!uz{dOliLC+DlzIP*A+_$rP7QG^luDLkiygF9c{!D&|vmG})h(>)IsaiW79n@cu^mcD7yRo07?n=X3uO?ykU0(*jVUrk_YFlA&NC>u?W#Bxg ze4O;J0vy(4hZ}CKAu8)~QEKarH3Cbt(vcFamS|j%H6Men$jBeB1Z?kzzh${&l3;GM7BEw!{?=w~)mXj<7wY{uo6T zp+`s+;O8ZH^5hv}$Tq@V(?_6lUOu{iHwImejo7Mxg`!7QDf!FW6NR_*alal9 zXO9x%Cc%6LqngFX^i@O2()&&%apxm)@cdYuf8zsLqq&LsJ$CNV!Tag;M0Q&fqaM4m z9ma=YP)i$omMn?!pPZbJcb^tBwj;t%k-#1eT=`ooDLUzilTOCs_S4}^{Z0>7;f>Tj z%(YpP9d>8fpvnOsvVfC_d)L*Ius2&t`vfNpN=w1X(e7w)0kJvMlXXKwQS&!PviZ_?u{sD`$KZSY<}%*)s#AcwkHzTfx_VKY-B*{%T@) zADK7t=?ui&kt%4Xla3vQ8JKJJo~ZW^$IZsxO#T`nD^Yq-vB#@$#(Ou0Zv57jskbYx zh#c?G#r8+>IO41=qx&I8orzJ@9*Q$-^hoCbPZVvq%@(HyW6GU!68+i+n>*{g$hIWh zH{~Ha;o*FoUhtGOPEA6cnZs~W^F^Zbd^eNdlU+p&))4PwWKHE%bk}}HDo?WIaagRC zc5~5Lrc1V42BXkGxfd4NTVZTe4SA#1NwUh~adzDY;#gqgwYWy1+YR65n$S|Mn zdHxpp<+%Y?B|T^G)webg(@HJ0T-{DskE|H{MjgPLX;EbC+6uJERMu9YKMQ-L7G&aM z`ERC18S}*3aNKeu4qKlt#v8F&*w3SzgxVLPq}U7pzPp0K`ZeAHTAw4`>CZ4cIw zd5g?&PkcO9N86xhStaiL@fuU_=zV92-n~FPdR~g#50~I=qhd6>G>vVX?u#{&O=Q8- zrKtDF1N&}ok>_27op0?i_n;~1$u}XjoQYVQ^o-0648TZ>eWZ9D7YE+FPdX!)VNbX! zWB=^-PfWcY9wFrUkLf5mzntM~k16(G6_y;2#U<|Zn4C+@#kk*iDWhK>6oc3QScTF= zYLI(Zb5GRgAsDw@wSmj?FP-Vkrv^EbQ^3Q6f4u%*KQd15hSE3!+(L z(0t`JvbTE?8NJvElSg!rClW52)y`)xa9)C3(nemn))3znPQY?{UVt zH+K--Y#y@zb(uso#NwOfi?GwC0*6MFlTU)GAwKxcldStmv2R(d?zyqJ zs&p%(GdFrkxG(pT#~S^y@?1IQ7pzkDzpk7#?b63BLyC#b)et6YTQ^wDTw;?3?GSBahy^Tln7F*1A$iwi%lX0c7J2o8nojqSufIZu+Nm9*GvSaETCYJZqTcUnVPmZwjmzqjr;@o;)R^ z);5vQw_)grH^}I6J~le5FuhzVJxuP{?IvoEuMjJt@_w#9XBoB)IVSQnaeK!jPUq zeE-%LvpHjt8}C9+HI?At_*|l6T!7zqjlqtQ<@ioxI>rRd!LIrtxcg8HmL_>%@`X|+ zrXrr|JQHU9nl^|TeD=j0?A|#A>&tY=mPd>5)T(6E2@c0v!$#t5nT%^~)EHmM-ldGa z-^+5;o~wtyt(}YGE{ho1lJY_F|Ar-GiBAv?Z%AhByPjq+x{7tmI_KO|^r;Hl7Y{{4 zNiNxcX%8{VyF^w@a$)2z9Ud|3yY;a)HXU1rcdmV4_`Y^_ILBfP^3PeK*ZK+^{Mr@M zStU%2j?7iqEgDA3x_ua4fj^CNm_6+-L37KaWJO#!u2;QB#OkB4-KCb4+s(vo=LWL4 zG!h@!1!3Wt#i*Xo#n%H|u&phOB!=m=IBfWn*2? zJ|`HTW+mbvi!$8jwFvi2&c)meWj{~falqeyosO4WTZvd4jc0@(h(mZDIyK&A_HWjd z8RUMeBgWn{L+`OOaA&FqDxQB__dYn3tIc95=GUKpbmhs>?-`Offl&T!z?;DTGOVniPWt1H6-OTW^bg=`=*M7+|p!AD$G0(s}a6C`Q0gC6)UrJ~gp+3{&z7wLeFlc_W}M_fps^ GZT|xrTrQaa diff --git a/torchvision/models/detection/_utils.py b/torchvision/models/detection/_utils.py index 00616a4b7fb..d2307c08389 100644 --- a/torchvision/models/detection/_utils.py +++ b/torchvision/models/detection/_utils.py @@ -378,3 +378,29 @@ def overwrite_eps(model, eps): for module in model.modules(): if isinstance(module, FrozenBatchNorm2d): module.eps = eps + + +def retrieve_out_channels(model, size): + """ + This method retrieves the number of output channels of a specific model. + + Args: + model (nn.Module): The model for which we estimate the out_channels. + size (Tuple[int, int]): The size (wxh) of the input. + + Returns: + out_channels (List[int]): A list of the output channels of the model. 
+ """ + in_training = model.training + model.eval() + + with torch.no_grad(): + # Use dummy data to retrieve the feature map sizes to avoid hard-coding their values + device = next(model.parameters()).device + tmp_img = torch.zeros((1, 3, size[1], size[0]), device=device) + out_channels = [x.size(1) for x in model(tmp_img).values()] + + if in_training: + model.train() + + return out_channels diff --git a/torchvision/models/detection/ssd.py b/torchvision/models/detection/ssd.py index 52e85c86188..527dd2aa4b5 100644 --- a/torchvision/models/detection/ssd.py +++ b/torchvision/models/detection/ssd.py @@ -111,8 +111,10 @@ class SSD(nn.Module): 'proposal_matcher': det_utils.Matcher, } - def __init__(self, backbone: nn.Module, anchor_generator: DefaultBoxGenerator, size: Tuple[int, int], num_classes: int, + def __init__(self, backbone: nn.Module, anchor_generator: DefaultBoxGenerator, + size: Tuple[int, int], num_classes: int, image_mean: Optional[List[float]] = None, image_std: Optional[List[float]] = None, + head: Optional[nn.Module] = None, score_thresh: float = 0.01, nms_thresh: float = 0.45, detections_per_img: int = 200, @@ -121,22 +123,23 @@ def __init__(self, backbone: nn.Module, anchor_generator: DefaultBoxGenerator, s positive_fraction: float = 0.25): super().__init__() - # Use dummy data to retrieve the feature map sizes to avoid hard-coding their values - device = next(backbone.parameters()).device - tmp_img = torch.zeros((1, 3, size[1], size[0]), device=device) - tmp_sizes = [x.size() for x in backbone(tmp_img).values()] - out_channels = [x[1] for x in tmp_sizes] - - assert len(out_channels) == len(anchor_generator.aspect_ratios) - self.backbone = backbone self.anchor_generator = anchor_generator self.box_coder = det_utils.BoxCoder(weights=(10., 10., 5., 5.)) - self.num_anchors = self.anchor_generator.num_anchors_per_location() - self.head = SSDHead(out_channels, self.num_anchors, num_classes) + if head is None: + if hasattr(backbone, 'out_channels'): + out_channels = backbone.out_channels + else: + out_channels = det_utils.retrieve_out_channels(backbone, size) + + assert len(out_channels) == len(anchor_generator.aspect_ratios) + + num_anchors = self.anchor_generator.num_anchors_per_location() + head = SSDHead(out_channels, num_anchors, num_classes) + self.head = head self.proposal_matcher = det_utils.SSDMatcher(iou_thresh) From 2deb51e08b6b0538666bb4af79aeaacd8f653647 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Wed, 28 Apr 2021 18:40:25 +0100 Subject: [PATCH 83/92] fix lint --- references/detection/train.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/references/detection/train.py b/references/detection/train.py index a4dd82d19a8..26708ebb21e 100644 --- a/references/detection/train.py +++ b/references/detection/train.py @@ -60,7 +60,8 @@ def main(args): # Data loading code print("Loading data") - dataset, num_classes = get_dataset(args.dataset, "train", get_transform(True, args.data_augmentation), args.data_path) + dataset, num_classes = get_dataset(args.dataset, "train", get_transform(True, args.data_augmentation), + args.data_path) dataset_test, _ = get_dataset(args.dataset, "val", get_transform(False, args.data_augmentation), args.data_path) print("Creating data loaders") From 02a2af5e76eb3582d70ab541f3e9ce09e326d50f Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Wed, 28 Apr 2021 18:43:33 +0100 Subject: [PATCH 84/92] Fix unit-test --- test/test_models_detection_anchor_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff 
--git a/test/test_models_detection_anchor_utils.py b/test/test_models_detection_anchor_utils.py index 7ff95c45337..a1937fb01cc 100644 --- a/test/test_models_detection_anchor_utils.py +++ b/test/test_models_detection_anchor_utils.py @@ -65,13 +65,13 @@ def test_anchor_generator(self): self.assertEqual(anchors[0], anchors_output) self.assertEqual(anchors[1], anchors_output) - def test_dbox_generator(self): + def test_defaultbox_generator(self): images = torch.zeros(2, 3, 15, 15) features = [torch.zeros(2, 8, 1, 1)] image_shapes = [i.shape[-2:] for i in images] images = ImageList(images, image_shapes) - model = self._init_test_dbox_generator() + model = self._init_test_defaultbox_generator() model.eval() dboxes = model(images, features) From 2befe43bd32186a6f073ff19a7128d3eb79e1d2d Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Wed, 28 Apr 2021 19:24:12 +0100 Subject: [PATCH 85/92] Update todos. --- torchvision/models/detection/anchor_utils.py | 1 + torchvision/models/detection/ssd.py | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/torchvision/models/detection/anchor_utils.py b/torchvision/models/detection/anchor_utils.py index 06972950707..841b72e40fd 100644 --- a/torchvision/models/detection/anchor_utils.py +++ b/torchvision/models/detection/anchor_utils.py @@ -162,6 +162,7 @@ class DefaultBoxGenerator(nn.Module): def __init__(self, aspect_ratios: List[List[int]], min_ratio: float = 0.15, max_ratio: float = 0.9, steps: Optional[List[int]] = None, clip: bool = True): + # TODO: Add documentation super().__init__() if steps is not None: assert len(aspect_ratios) == len(steps) diff --git a/torchvision/models/detection/ssd.py b/torchvision/models/detection/ssd.py index 527dd2aa4b5..ae8a1ff7370 100644 --- a/torchvision/models/detection/ssd.py +++ b/torchvision/models/detection/ssd.py @@ -17,7 +17,7 @@ __all__ = ['SSD', 'ssd300_vgg16'] model_urls = { - 'ssd300_vgg16_coco': None, # TODO: Add url with weights + 'ssd300_vgg16_coco': None, # TODO: Add url with weights + add the model on the documentation and references readme } backbone_urls = { @@ -121,6 +121,7 @@ def __init__(self, backbone: nn.Module, anchor_generator: DefaultBoxGenerator, iou_thresh: float = 0.5, topk_candidates: int = 400, positive_fraction: float = 0.25): + # TODO: Add documentation super().__init__() self.backbone = backbone From a167edc8faeeae1e0f765c725b70c73d9d160d5a Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Wed, 28 Apr 2021 21:10:29 +0100 Subject: [PATCH 86/92] Change mean values. 
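Most of this commit is the refreshed expected-output pickle below; the one-line ssd.py change presumably switches image_mean from the 0-255 scale values used so far to their 0-1 equivalents, since the 255 rescaling now happens inside the VGG feature extractor and the transform therefore normalizes 0-1 inputs. A quick sanity check of that conversion (illustrative, not copied from the hunk):

    # Convert the old 0-255 channel means to the 0-1 scale seen after ToTensor.
    old_means = (123., 117., 104.)
    new_means = [round(m / 255., 5) for m in old_means]
    print(new_means)  # [0.48235, 0.45882, 0.40784]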
--- .../ModelTester.test_ssd300_vgg16_expect.pkl | Bin 6925 -> 6925 bytes torchvision/models/detection/ssd.py | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/test/expect/ModelTester.test_ssd300_vgg16_expect.pkl b/test/expect/ModelTester.test_ssd300_vgg16_expect.pkl index 2e8629d31490b42d98548bfafb99e5e6d5c30ba3..0bfa91467d36228a5679c02e9441d3dcdeb4def6 100644 GIT binary patch [base85-encoded data for the refreshed expected-output pickle omitted]
Date: Thu, 29 Apr 2021 09:52:40 +0100 Subject: [PATCH 87/92] Change the default parameter of SSD to train the full VGG16 and remove the catch of exception for eval only.
--- references/detection/train.py | 12 +++--------- torchvision/models/detection/ssd.py | 2 +- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/references/detection/train.py b/references/detection/train.py index 26708ebb21e..81f170e73d0 100644 --- a/references/detection/train.py +++ b/references/detection/train.py @@ -114,15 +114,9 @@ def main(args): if args.resume: checkpoint = torch.load(args.resume, map_location='cpu') model_without_ddp.load_state_dict(checkpoint['model']) - try: - optimizer.load_state_dict(checkpoint['optimizer']) - lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) - args.start_epoch = checkpoint['epoch'] + 1 - except Exception as e: - if args.test_only: - pass - else: - raise e + optimizer.load_state_dict(checkpoint['optimizer']) + lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) + args.start_epoch = checkpoint['epoch'] + 1 if args.test_only: evaluate(model, data_loader_test, device=device) diff --git a/torchvision/models/detection/ssd.py b/torchvision/models/detection/ssd.py index de4a8cb06e9..a0f2ad0f1e9 100644 --- a/torchvision/models/detection/ssd.py +++ b/torchvision/models/detection/ssd.py @@ -476,7 +476,7 @@ def _vgg_extractor(backbone_name: str, highres: bool, progress: bool, pretrained def ssd300_vgg16(pretrained: bool = False, progress: bool = True, num_classes: int = 91, pretrained_backbone: bool = True, trainable_backbone_layers: Optional[int] = None, **kwargs: Any): trainable_backbone_layers = _validate_trainable_layers( - pretrained or pretrained_backbone, trainable_backbone_layers, 5, 3) + pretrained or pretrained_backbone, trainable_backbone_layers, 5, 5) if pretrained: # no need to download the backbone if pretrained is set From a62d4e679ed232b053db1d21cdc251c83aaa4bcb Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Thu, 29 Apr 2021 11:36:49 +0100 Subject: [PATCH 88/92] Adding documentation --- torchvision/models/detection/_utils.py | 8 ++- torchvision/models/detection/anchor_utils.py | 15 ++++- torchvision/models/detection/ssd.py | 70 +++++++++++++++++++- 3 files changed, 89 insertions(+), 4 deletions(-) diff --git a/torchvision/models/detection/_utils.py b/torchvision/models/detection/_utils.py index d2307c08389..40281b39b6b 100644 --- a/torchvision/models/detection/_utils.py +++ b/torchvision/models/detection/_utils.py @@ -1,7 +1,7 @@ import math - import torch +from collections import OrderedDict from torch import Tensor from typing import List, Tuple @@ -386,6 +386,7 @@ def retrieve_out_channels(model, size): Args: model (nn.Module): The model for which we estimate the out_channels. + It should return a single Tensor or an OrderedDict[Tensor]. size (Tuple[int, int]): The size (wxh) of the input. 
Returns: @@ -398,7 +399,10 @@ def retrieve_out_channels(model, size): # Use dummy data to retrieve the feature map sizes to avoid hard-coding their values device = next(model.parameters()).device tmp_img = torch.zeros((1, 3, size[1], size[0]), device=device) - out_channels = [x.size(1) for x in model(tmp_img).values()] + features = model(tmp_img) + if isinstance(features, torch.Tensor): + features = OrderedDict([('0', features)]) + out_channels = [x.size(1) for x in features.values()] if in_training: model.train() diff --git a/torchvision/models/detection/anchor_utils.py b/torchvision/models/detection/anchor_utils.py index 841b72e40fd..ffede13e6af 100644 --- a/torchvision/models/detection/anchor_utils.py +++ b/torchvision/models/detection/anchor_utils.py @@ -159,10 +159,23 @@ def forward(self, image_list: ImageList, feature_maps: List[Tensor]) -> List[Ten class DefaultBoxGenerator(nn.Module): + """ + This module generates the default boxes of SSD for a set of feature maps and image sizes. + + Args: + aspect_ratios (List[List[int]]): A list with all the aspect ratios used in each feature map. + min_ratio (float): The minimum scale :math:`\text{s}_{\text{min}}` of the default boxes used in the estimation + of the scales of each feature map. + max_ratio (float): The maximum scale :math:`\text{s}_{\text{max}}` of the default boxes used in the estimation + of the scales of each feature map. + steps (List[int]], optional): It's a hyper-parameter that affects the tiling of defalt boxes. If not provided + it will be estimated from the data. + clip (bool): Whether the standardized values of default boxes should be clipped between 0 and 1. The clipping + is applied while the boxes are encoded in format ``(cx, cy, w, h)``. + """ def __init__(self, aspect_ratios: List[List[int]], min_ratio: float = 0.15, max_ratio: float = 0.9, steps: Optional[List[int]] = None, clip: bool = True): - # TODO: Add documentation super().__init__() if steps is not None: assert len(aspect_ratios) == len(steps) diff --git a/torchvision/models/detection/ssd.py b/torchvision/models/detection/ssd.py index a0f2ad0f1e9..8ea917f5469 100644 --- a/torchvision/models/detection/ssd.py +++ b/torchvision/models/detection/ssd.py @@ -106,6 +106,57 @@ def __init__(self, in_channels: List[int], num_anchors: List[int]): class SSD(nn.Module): + """ + Implements SSD architecture from `"SSD: Single Shot MultiBox Detector" `_. + + The input to the model is expected to be a list of tensors, each of shape [C, H, W], one for each + image, and should be in 0-1 range. Different images can have different sizes but they will be resized + to a fixed size before passing it to the backbone. + + The behavior of the model changes depending if it is in training or evaluation mode. + + During training, the model expects both the input tensors, as well as a targets (list of dictionary), + containing: + - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with + ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``. + - labels (Int64Tensor[N]): the class label for each ground-truth box + + The model returns a Dict[Tensor] during training, containing the classification and regression + losses. + + During inference, the model requires only the input tensors, and returns the post-processed + predictions as a List[Dict[Tensor]], one for each input image. The fields of the Dict are as + follows: + - boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with + ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``. 
+ - labels (Int64Tensor[N]): the predicted labels for each image + - scores (Tensor[N]): the scores for each prediction + + Args: + backbone (nn.Module): the network used to compute the features for the model. + It should contain an out_channels attribute with the list of the output channels of + each feature map. The backbone should return a single Tensor or an OrderedDict[Tensor]. + anchor_generator (DefaultBoxGenerator): module that generates the default boxes for a + set of feature maps. + size (Tuple[int, int]): the width and height to which images will be rescaled before feeding them + to the backbone. + num_classes (int): number of output classes of the model (excluding the background). + image_mean (Tuple[float, float, float]): mean values used for input normalization. + They are generally the mean values of the dataset on which the backbone has been trained + on + image_std (Tuple[float, float, float]): std values used for input normalization. + They are generally the std values of the dataset on which the backbone has been trained on + head: + score_thresh (float): Score threshold used for postprocessing the detections. + nms_thresh (float): NMS threshold used for postprocessing the detections. + detections_per_img (int): Number of best detections to keep after NMS. + iou_thresh (float): minimum IoU between the anchor and the GT box so that they can be + considered as positive during training. + topk_candidates (int): Number of best detections to keep before NMS. + positive_fraction (float): a number between 0 and 1 which indicates the proportion of positive + proposals used during the training of the classification head. It is used to estimate the negative to + positive ratio. + """ __annotations__ = { 'box_coder': det_utils.BoxCoder, 'proposal_matcher': det_utils.Matcher, @@ -121,7 +172,6 @@ def __init__(self, backbone: nn.Module, anchor_generator: DefaultBoxGenerator, iou_thresh: float = 0.5, topk_candidates: int = 400, positive_fraction: float = 0.25): - # TODO: Add documentation super().__init__() self.backbone = backbone @@ -475,6 +525,24 @@ def _vgg_extractor(backbone_name: str, highres: bool, progress: bool, pretrained def ssd300_vgg16(pretrained: bool = False, progress: bool = True, num_classes: int = 91, pretrained_backbone: bool = True, trainable_backbone_layers: Optional[int] = None, **kwargs: Any): + """ + Constructs an SSD model with a VGG16 backbone. See `SSD` for more details. + + Example: + + >>> model = torchvision.models.detection.ssd300_vgg16(pretrained=True) + >>> model.eval() + >>> x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)] + >>> predictions = model(x) + + Args: + pretrained (bool): If True, returns a model pre-trained on COCO train2017 + progress (bool): If True, displays a progress bar of the download to stderr + num_classes (int): number of output classes of the model (including the background) + pretrained_backbone (bool): If True, returns a model with backbone pre-trained on Imagenet + trainable_backbone_layers (int): number of trainable (not frozen) resnet layers starting from final block. + Valid values are between 0 and 5, with 5 meaning all backbone layers are trainable. + """ trainable_backbone_layers = _validate_trainable_layers( pretrained or pretrained_backbone, trainable_backbone_layers, 5, 5) From bc8063ab1f53e6e4572856fd63e4efdb6b737783 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Thu, 29 Apr 2021 13:02:20 +0100 Subject: [PATCH 89/92] Adding weights and updating readmes. 
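With the released weights referenced in this commit, a hedged sketch of the two modes described by the new documentation; the boxes and labels below are dummy values.

    import torch
    import torchvision

    model = torchvision.models.detection.ssd300_vgg16(pretrained=True)

    images = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]

    # Training mode: images plus targets in, a dict of losses out.
    model.train()
    targets = [{'boxes': torch.tensor([[10., 20., 100., 200.]]),
                'labels': torch.tensor([5])} for _ in images]
    loss_dict = model(images, targets)  # keys: 'bbox_regression', 'classification'

    # Eval mode: images in, one dict with 'boxes', 'labels' and 'scores' per image out.
    model.eval()
    detections = model(images)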
--- docs/source/models.rst | 19 ++++++++++++++----- references/detection/README.md | 8 ++++++++ torchvision/models/detection/ssd.py | 2 +- 3 files changed, 23 insertions(+), 6 deletions(-) diff --git a/docs/source/models.rst b/docs/source/models.rst index 09ec450574b..c34afd38b90 100644 --- a/docs/source/models.rst +++ b/docs/source/models.rst @@ -381,17 +381,18 @@ Object Detection, Instance Segmentation and Person Keypoint Detection The models subpackage contains definitions for the following model architectures for detection: -- `Faster R-CNN ResNet-50 FPN `_ -- `Mask R-CNN ResNet-50 FPN `_ +- `Faster R-CNN `_ +- `Mask R-CNN `_ +- `RetinaNet `_ +- `SSD `_ The pre-trained models for detection, instance segmentation and keypoint detection are initialized with the classification models in torchvision. The models expect a list of ``Tensor[C, H, W]``, in the range ``0-1``. -The models internally resize the images so that they have a minimum size -of ``800``. This option can be changed by passing the option ``min_size`` -to the constructor of the models. +The models internally resize the images but the behaviour varies depending +on the model. Check the constructor of the models for more information. For object detection and instance segmentation, the pre-trained @@ -425,6 +426,7 @@ Faster R-CNN ResNet-50 FPN 37.0 - - Faster R-CNN MobileNetV3-Large FPN 32.8 - - Faster R-CNN MobileNetV3-Large 320 FPN 22.8 - - RetinaNet ResNet-50 FPN 36.4 - - +SSD VGG16 25.0 - - Mask R-CNN ResNet-50 FPN 37.9 34.6 - ====================================== ======= ======== =========== @@ -483,6 +485,7 @@ Faster R-CNN ResNet-50 FPN 0.2288 0.0590 Faster R-CNN MobileNetV3-Large FPN 0.1020 0.0415 1.0 Faster R-CNN MobileNetV3-Large 320 FPN 0.0978 0.0376 0.6 RetinaNet ResNet-50 FPN 0.2514 0.0939 4.1 +SSD VGG16 0.2100 0.0760 1.5 Mask R-CNN ResNet-50 FPN 0.2728 0.0903 5.4 Keypoint R-CNN ResNet-50 FPN 0.3789 0.1242 6.8 ====================================== =================== ================== =========== @@ -502,6 +505,12 @@ RetinaNet .. autofunction:: torchvision.models.detection.retinanet_resnet50_fpn +SSD +------------ + +.. 
autofunction:: torchvision.models.detection.ssd300_vgg16 + + Mask R-CNN ---------- diff --git a/references/detection/README.md b/references/detection/README.md index c8eaf46da5f..e4d52869d35 100644 --- a/references/detection/README.md +++ b/references/detection/README.md @@ -48,6 +48,14 @@ python -m torch.distributed.launch --nproc_per_node=8 --use_env train.py\ --lr-steps 16 22 --aspect-ratio-group-factor 3 --lr 0.01 ``` +### SSD VGG16 +``` +python -m torch.distributed.launch --nproc_per_node=8 --use_env train.py\ + --dataset coco --model ssd300_vgg16 --epochs 120\ + --lr-steps 80 110 --aspect-ratio-group-factor 3 --lr 0.002 --batch-size 4\ + --weight-decay 0.0005 --data-augmentation ssd +``` + ### Mask R-CNN ``` diff --git a/torchvision/models/detection/ssd.py b/torchvision/models/detection/ssd.py index 8ea917f5469..be4e2e368ea 100644 --- a/torchvision/models/detection/ssd.py +++ b/torchvision/models/detection/ssd.py @@ -17,7 +17,7 @@ __all__ = ['SSD', 'ssd300_vgg16'] model_urls = { - 'ssd300_vgg16_coco': None, # TODO: Add url with weights + add the model on the documentation and references readme + 'ssd300_vgg16_coco': 'https://download.pytorch.org/models/ssd300_vgg16_coco-d69bfef3.pth', } backbone_urls = { From 476019735618d6c1f81acb24abeea8266ef1698c Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Fri, 30 Apr 2021 09:12:31 +0100 Subject: [PATCH 90/92] Update the model weights with a more performing model. --- docs/source/models.rst | 4 ++-- torchvision/models/detection/ssd.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/models.rst b/docs/source/models.rst index c34afd38b90..c70cd07979f 100644 --- a/docs/source/models.rst +++ b/docs/source/models.rst @@ -426,7 +426,7 @@ Faster R-CNN ResNet-50 FPN 37.0 - - Faster R-CNN MobileNetV3-Large FPN 32.8 - - Faster R-CNN MobileNetV3-Large 320 FPN 22.8 - - RetinaNet ResNet-50 FPN 36.4 - - -SSD VGG16 25.0 - - +SSD VGG16 25.1 - - Mask R-CNN ResNet-50 FPN 37.9 34.6 - ====================================== ======= ======== =========== @@ -485,7 +485,7 @@ Faster R-CNN ResNet-50 FPN 0.2288 0.0590 Faster R-CNN MobileNetV3-Large FPN 0.1020 0.0415 1.0 Faster R-CNN MobileNetV3-Large 320 FPN 0.0978 0.0376 0.6 RetinaNet ResNet-50 FPN 0.2514 0.0939 4.1 -SSD VGG16 0.2100 0.0760 1.5 +SSD VGG16 0.2093 0.0744 1.5 Mask R-CNN ResNet-50 FPN 0.2728 0.0903 5.4 Keypoint R-CNN ResNet-50 FPN 0.3789 0.1242 6.8 ====================================== =================== ================== =========== diff --git a/torchvision/models/detection/ssd.py b/torchvision/models/detection/ssd.py index be4e2e368ea..393a85d80b8 100644 --- a/torchvision/models/detection/ssd.py +++ b/torchvision/models/detection/ssd.py @@ -17,7 +17,7 @@ __all__ = ['SSD', 'ssd300_vgg16'] model_urls = { - 'ssd300_vgg16_coco': 'https://download.pytorch.org/models/ssd300_vgg16_coco-d69bfef3.pth', + 'ssd300_vgg16_coco': 'https://download.pytorch.org/models/ssd300_vgg16_coco-b556d3b4.pth', } backbone_urls = { From 365d1ef66d19a39c3017328c021a51964519a501 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Fri, 30 Apr 2021 16:32:10 +0100 Subject: [PATCH 91/92] Adding doc for head. 
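To make the documented head hook concrete, a hypothetical sketch of assembling an SSD with an explicitly constructed SSDHead on top of a toy single-feature-map backbone; TinyBackbone is made up for illustration and is not part of the patch set.

    import torch
    from collections import OrderedDict
    from torch import nn
    from torchvision.models.detection import _utils as det_utils
    from torchvision.models.detection.anchor_utils import DefaultBoxGenerator
    from torchvision.models.detection.ssd import SSD, SSDHead

    class TinyBackbone(nn.Module):
        # Toy backbone returning a single feature map, only to exercise the head hook.
        def __init__(self):
            super().__init__()
            self.body = nn.Sequential(nn.Conv2d(3, 64, 3, stride=2, padding=1), nn.ReLU())

        def forward(self, x):
            return OrderedDict([('0', self.body(x))])

    backbone = TinyBackbone()
    size = (300, 300)
    anchor_generator = DefaultBoxGenerator([[2]])  # one aspect-ratio list per feature map

    out_channels = det_utils.retrieve_out_channels(backbone, size)  # -> [64]
    num_anchors = anchor_generator.num_anchors_per_location()       # -> [4]
    head = SSDHead(out_channels, num_anchors, 91)

    model = SSD(backbone, anchor_generator, size, num_classes=91, head=head)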
--- torchvision/models/detection/ssd.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/torchvision/models/detection/ssd.py b/torchvision/models/detection/ssd.py index 393a85d80b8..fcb79d6e651 100644 --- a/torchvision/models/detection/ssd.py +++ b/torchvision/models/detection/ssd.py @@ -146,7 +146,8 @@ class SSD(nn.Module): on image_std (Tuple[float, float, float]): std values used for input normalization. They are generally the std values of the dataset on which the backbone has been trained on - head: + head (nn.Module, optional): Module run on top of the backbone features. Defaults to a module containing + a classification and regression module. score_thresh (float): Score threshold used for postprocessing the detections. nms_thresh (float): NMS threshold used for postprocessing the detections. detections_per_img (int): Number of best detections to keep after NMS. From 6c94ff072cd322fe461f2c75e2505c17c57b2fcf Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Fri, 30 Apr 2021 16:53:13 +0100 Subject: [PATCH 92/92] Restore import. --- torchvision/models/detection/anchor_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchvision/models/detection/anchor_utils.py b/torchvision/models/detection/anchor_utils.py index a540565cd0b..8a8b08399ab 100644 --- a/torchvision/models/detection/anchor_utils.py +++ b/torchvision/models/detection/anchor_utils.py @@ -2,7 +2,7 @@ import torch from torch import nn, Tensor -from typing import List +from typing import List, Optional from .image_list import ImageList
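As a closing illustration, a hedged sketch of driving the DefaultBoxGenerator on its own, along the lines of the anchor-utils unit test updated earlier in the series; shapes and steps are illustrative only.

    import torch
    from torchvision.models.detection.anchor_utils import DefaultBoxGenerator
    from torchvision.models.detection.image_list import ImageList

    dbox_gen = DefaultBoxGenerator([[2], [2, 3]], steps=[8, 16])
    print(dbox_gen)  # repr now reports aspect_ratios, clip, scales and steps

    images = torch.zeros(2, 3, 64, 64)
    features = [torch.zeros(2, 8, 8, 8), torch.zeros(2, 16, 4, 4)]
    image_list = ImageList(images, [i.shape[-2:] for i in images])

    # One (num_default_boxes, 4) tensor per input image.
    default_boxes = dbox_gen(image_list, features)
    print(default_boxes[0].shape)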