Commit 5d0a664

Merge branch 'master' into models/mobilenetv3

datumbox authored Jan 3, 2021
2 parents e4d130f + 7b9d30e commit 5d0a664
Showing 40 changed files with 304 additions and 985 deletions.
2 changes: 1 addition & 1 deletion references/detection/group_by_aspect_ratio.py
@@ -26,7 +26,7 @@ class GroupedBatchSampler(BatchSampler):
It enforces that the batch only contain elements from the same group.
It also tries to provide mini-batches that follow an ordering which is
as close as possible to the ordering from the original sampler.
-Arguments:
+Args:
sampler (Sampler): Base sampler.
group_ids (list[int]): If the sampler produces indices in range [0, N),
`group_ids` must be a list of `N` ints which contains the group id of each sample.
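A minimal wiring sketch for this sampler (hedged: `dataset` and `group_ids` are placeholders, and the `batch_size` argument is assumed from the `BatchSampler` interface):

>>> from torch.utils.data import DataLoader, RandomSampler
>>> sampler = RandomSampler(dataset)  # dataset: any map-style Dataset (placeholder)
>>> # group_ids: one int per sample, e.g. 0/1 for landscape/portrait aspect ratios
>>> batch_sampler = GroupedBatchSampler(sampler, group_ids, batch_size=8)
>>> loader = DataLoader(dataset, batch_sampler=batch_sampler)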
8 changes: 5 additions & 3 deletions setup.py
@@ -29,12 +29,14 @@ def get_dist(pkgname):
return None


-version = '0.9.0a0'
+cwd = os.path.dirname(os.path.abspath(__file__))
+
+version_txt = os.path.join(cwd, 'version.txt')
+with open(version_txt, 'r') as f:
+    version = f.readline().strip()
sha = 'Unknown'
package_name = 'torchvision'

-cwd = os.path.dirname(os.path.abspath(__file__))
-
try:
sha = subprocess.check_output(['git', 'rev-parse', 'HEAD'], cwd=cwd).decode('ascii').strip()
except Exception:
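The net effect of this hunk: the version is no longer pinned in setup.py but read from version.txt at build time. A rough sketch of the new resolution (assuming version.txt holds a single line such as the previously hard-coded value):

>>> with open('version.txt') as f:
...     version = f.readline().strip()
>>> version
'0.9.0a0'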
2 changes: 1 addition & 1 deletion torchvision/csrc/io/image/cpu/readjpeg_cpu.cpp
@@ -117,7 +117,7 @@ torch::Tensor decodeJPEG(const torch::Tensor& data, ImageReadMode mode) {
*/
default:
jpeg_destroy_decompress(&cinfo);
TORCH_CHECK(false, "Provided mode not supported");
TORCH_CHECK(false, "The provided mode is not supported for JPEG files");
}

jpeg_calc_output_dimensions(&cinfo);
2 changes: 1 addition & 1 deletion torchvision/csrc/io/image/cpu/readpng_cpu.cpp
@@ -143,7 +143,7 @@ torch::Tensor decodePNG(const torch::Tensor& data, ImageReadMode mode) {
break;
default:
png_destroy_read_struct(&png_ptr, &info_ptr, nullptr);
TORCH_CHECK(false, "Provided mode not supported");
TORCH_CHECK(false, "The provided mode is not supported for PNG files");
}

png_read_update_info(png_ptr, info_ptr);
10 changes: 5 additions & 5 deletions torchvision/csrc/io/image/image_read_mode.h
@@ -2,8 +2,8 @@

/* Should be kept in-sync with Python ImageReadMode enum */
using ImageReadMode = int64_t;
-#define IMAGE_READ_MODE_UNCHANGED 0
-#define IMAGE_READ_MODE_GRAY 1
-#define IMAGE_READ_MODE_GRAY_ALPHA 2
-#define IMAGE_READ_MODE_RGB 3
-#define IMAGE_READ_MODE_RGB_ALPHA 4
+const ImageReadMode IMAGE_READ_MODE_UNCHANGED = 0;
+const ImageReadMode IMAGE_READ_MODE_GRAY = 1;
+const ImageReadMode IMAGE_READ_MODE_GRAY_ALPHA = 2;
+const ImageReadMode IMAGE_READ_MODE_RGB = 3;
+const ImageReadMode IMAGE_READ_MODE_RGB_ALPHA = 4;
4 changes: 2 additions & 2 deletions torchvision/datasets/samplers/clip_sampler.py
@@ -111,7 +111,7 @@ class UniformClipSampler(Sampler):
When the number of unique clips in the video is fewer than num_video_clips_per_video,
repeat the clips until `num_video_clips_per_video` clips are collected
-Arguments:
+Args:
video_clips (VideoClips): video clips to sample from
num_clips_per_video (int): number of clips to be sampled per video
"""
@@ -151,7 +151,7 @@ class RandomClipSampler(Sampler):
"""
Samples at most `max_video_clips_per_video` clips for each video randomly
-Arguments:
+Args:
video_clips (VideoClips): video clips to sample from
max_clips_per_video (int): maximum number of clips to be sampled per video
"""
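A brief usage sketch (hedged: `video_clips` is assumed to be a VideoClips instance, built as in the video_utils.py example below):

>>> train_sampler = RandomClipSampler(video_clips, max_clips_per_video=5)
>>> test_sampler = UniformClipSampler(video_clips, num_clips_per_video=10)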
6 changes: 3 additions & 3 deletions torchvision/datasets/video_utils.py
@@ -88,7 +88,7 @@ class VideoClips(object):
Recreating the clips for different clip lengths is fast, and can be done
with the `compute_clips` method.
-Arguments:
+Args:
video_paths (List[str]): paths to the video files
clip_length_in_frames (int): size of a clip in number of frames
frames_between_clips (int): step (in frames) between each clip
@@ -227,7 +227,7 @@ def compute_clips(self, num_frames, step, frame_rate=None):
Always returns clips of size `num_frames`, meaning that the
last few frames in a video can potentially be dropped.
-Arguments:
+Args:
num_frames (int): number of frames for the clip
step (int): distance between two clips
"""
@@ -285,7 +285,7 @@ def get_clip(self, idx):
"""
Gets a subclip from a list of videos.
-Arguments:
+Args:
idx (int): index of the subclip. Must be between 0 and num_clips().
Returns:
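Pulling the documented parameters together, a minimal sketch (hedged: `video_paths` is a placeholder list of paths, and the four-value return of `get_clip` is assumed from the library, not shown in this excerpt):

>>> video_clips = VideoClips(video_paths, clip_length_in_frames=16, frames_between_clips=1)
>>> video_clips.compute_clips(num_frames=32, step=16)  # re-slice without re-reading metadata
>>> video, audio, info, video_idx = video_clips.get_clip(0)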
49 changes: 27 additions & 22 deletions torchvision/io/image.py
@@ -50,6 +50,15 @@


class ImageReadMode(Enum):
"""
Support for various modes while reading images.
Use `ImageReadMode.UNCHANGED` for loading the image as-is,
`ImageReadMode.GRAY` for converting to grayscale,
`ImageReadMode.GRAY_ALPHA` for grayscale with transparency,
`ImageReadMode.RGB` for RGB and `ImageReadMode.RGB_ALPHA` for
RGB with transparency.
"""
UNCHANGED = 0
GRAY = 1
GRAY_ALPHA = 2
@@ -62,7 +71,7 @@ def read_file(path: str) -> torch.Tensor:
Reads and outputs the bytes contents of a file as a uint8 Tensor
with one dimension.
-Arguments:
+Args:
path (str): the path to the file to be read
Returns:
@@ -77,7 +86,7 @@ def write_file(filename: str, data: torch.Tensor) -> None:
Writes the contents of a uint8 tensor with one dimension to a
file.
-Arguments:
+Args:
filename (str): the path to the file to be written
data (Tensor): the contents to be written to the output file
"""
@@ -90,15 +99,13 @@ def decode_png(input: torch.Tensor, mode: ImageReadMode = ImageReadMode.UNCHANGE
Optionally converts the image to the desired format.
The values of the output tensor are uint8 between 0 and 255.
-Arguments:
+Args:
input (Tensor[1]): a one dimensional uint8 tensor containing
the raw bytes of the PNG image.
mode (ImageReadMode): the read mode used for optionally
-converting the image. Use `ImageReadMode.UNCHANGED` for loading
-the image as-is, `ImageReadMode.GRAY` for converting to grayscale,
-`ImageReadMode.GRAY_ALPHA` for grayscale with transparency,
-`ImageReadMode.RGB` for RGB and `ImageReadMode.RGB_ALPHA` for
-RGB with transparency. Default: `ImageReadMode.UNCHANGED`
+converting the image. Default: `ImageReadMode.UNCHANGED`.
+See `ImageReadMode` class for more information on various
+available modes.
Returns:
output (Tensor[image_channels, image_height, image_width])
@@ -155,13 +162,13 @@ def decode_jpeg(input: torch.Tensor, mode: ImageReadMode = ImageReadMode.UNCHANG
Optionally converts the image to the desired format.
The values of the output tensor are uint8 between 0 and 255.
-Arguments:
+Args:
input (Tensor[1]): a one dimensional uint8 tensor containing
the raw bytes of the JPEG image.
mode (ImageReadMode): the read mode used for optionally
-converting the image. Use `ImageReadMode.UNCHANGED` for loading
-the image as-is, `ImageReadMode.GRAY` for converting to grayscale
-and `ImageReadMode.RGB` for RGB. Default: `ImageReadMode.UNCHANGED`
+converting the image. Default: `ImageReadMode.UNCHANGED`.
+See `ImageReadMode` class for more information on various
+available modes.
Returns:
output (Tensor[image_channels, image_height, image_width])
@@ -229,11 +236,10 @@ def decode_image(input: torch.Tensor, mode: ImageReadMode = ImageReadMode.UNCHAN
a one dimensional uint8 tensor containing the raw bytes of the
PNG or JPEG image.
mode: ImageReadMode
-the read mode used for optionally converting the image. JPEG
-and PNG images have different permitted values. The default
-value is `ImageReadMode.UNCHANGED` and it keeps the image as-is.
-See `decode_jpeg()` and `decode_png()` for more information.
-Default: `ImageReadMode.UNCHANGED`
+the read mode used for optionally converting the image.
+Default: `ImageReadMode.UNCHANGED`.
+See `ImageReadMode` class for more information on various
+available modes.
Returns
-------
@@ -254,11 +260,10 @@ def read_image(path: str, mode: ImageReadMode = ImageReadMode.UNCHANGED) -> torc
path: str
path of the JPEG or PNG image.
mode: ImageReadMode
-the read mode used for optionally converting the image. JPEG
-and PNG images have different permitted values. The default
-value is `ImageReadMode.UNCHANGED` and it keeps the image as-is.
-See `decode_jpeg()` and `decode_png()` for more information.
-Default: `ImageReadMode.UNCHANGED`
+the read mode used for optionally converting the image.
+Default: `ImageReadMode.UNCHANGED`.
+See `ImageReadMode` class for more information on various
+available modes.
Returns
-------
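With the docstrings now deferring to the enum, a short usage sketch of the modes (the file name is a placeholder):

>>> from torchvision.io import read_image, ImageReadMode
>>> img = read_image('photo.jpg', mode=ImageReadMode.RGB)    # uint8 Tensor[3, H, W]
>>> gray = read_image('photo.jpg', mode=ImageReadMode.GRAY)  # uint8 Tensor[1, H, W]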
2 changes: 1 addition & 1 deletion torchvision/models/_utils.py
@@ -18,7 +18,7 @@ class IntermediateLayerGetter(nn.ModuleDict):
assigned to the model. So if `model` is passed, `model.feature1` can
be returned, but not `model.feature1.layer2`.
-Arguments:
+Args:
model (nn.Module): model on which we will extract the features
return_layers (Dict[name, new_name]): a dict containing the names
of the modules for which the activations will be returned as
14 changes: 7 additions & 7 deletions torchvision/models/detection/_utils.py
@@ -15,7 +15,7 @@ class BalancedPositiveNegativeSampler(object):
def __init__(self, batch_size_per_image, positive_fraction):
# type: (int, float) -> None
"""
-Arguments:
+Args:
batch_size_per_image (int): number of elements to be selected per image
positive_fraction (float): percentage of positive elements per batch
"""
@@ -25,7 +25,7 @@ def __call__(self, matched_idxs):
def __call__(self, matched_idxs):
# type: (List[Tensor]) -> Tuple[List[Tensor], List[Tensor]]
"""
-Arguments:
+Args:
matched idxs: list of tensors containing -1, 0 or positive values.
Each tensor corresponds to a specific image.
-1 values are ignored, 0 are considered as negatives and > 0 as
@@ -83,7 +83,7 @@ def encode_boxes(reference_boxes, proposals, weights):
Encode a set of proposals with respect to some
reference boxes
-Arguments:
+Args:
reference_boxes (Tensor): reference boxes
proposals (Tensor): boxes to be encoded
"""
@@ -133,7 +133,7 @@ class BoxCoder(object):
def __init__(self, weights, bbox_xform_clip=math.log(1000. / 16)):
# type: (Tuple[float, float, float, float], float) -> None
"""
-Arguments:
+Args:
weights (4-element tuple)
bbox_xform_clip (float)
"""
@@ -153,7 +153,7 @@ def encode_single(self, reference_boxes, proposals):
Encode a set of proposals with respect to some
reference boxes
-Arguments:
+Args:
reference_boxes (Tensor): reference boxes
proposals (Tensor): boxes to be encoded
"""
@@ -183,7 +183,7 @@ def decode_single(self, rel_codes, boxes):
From a set of original boxes and encoded relative box offsets,
get the decoded boxes.
-Arguments:
+Args:
rel_codes (Tensor): encoded boxes
boxes (Tensor): reference boxes.
"""
@@ -361,7 +361,7 @@ def overwrite_eps(model, eps):
only when the pretrained weights are loaded to maintain compatibility
with previous versions.
-Arguments:
+Args:
model (nn.Module): The model on which we perform the overwrite.
eps (float): The new value of eps.
"""
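A round-trip sketch of the coder (hedged: `reference_boxes` and `proposals` are placeholder Tensor[N, 4] boxes in (x1, y1, x2, y2) format; the weights are illustrative):

>>> coder = BoxCoder(weights=(1.0, 1.0, 1.0, 1.0))
>>> deltas = coder.encode_single(reference_boxes, proposals)  # per-box regression offsets
>>> recovered = coder.decode_single(deltas, proposals)        # approximately reference_boxes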
2 changes: 1 addition & 1 deletion torchvision/models/detection/anchor_utils.py
@@ -22,7 +22,7 @@ class AnchorGenerator(nn.Module):
and AnchorGenerator will output a set of sizes[i] * aspect_ratios[i] anchors
per spatial location for feature map i.
-Arguments:
+Args:
sizes (Tuple[Tuple[int]]):
aspect_ratios (Tuple[Tuple[float]]):
"""
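Following the sizes[i] * aspect_ratios[i] rule above, a single-feature-map generator yielding 5 * 3 = 15 anchors per location might be built as (a sketch mirroring the pattern in torchvision's detection docs):

>>> anchor_generator = AnchorGenerator(sizes=((32, 64, 128, 256, 512),),
>>>                                    aspect_ratios=((0.5, 1.0, 2.0),))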
4 changes: 2 additions & 2 deletions torchvision/models/detection/backbone_utils.py
@@ -14,7 +14,7 @@ class BackboneWithFPN(nn.Module):
Internally, it uses torchvision.models._utils.IntermediateLayerGetter to
extract a submodel that returns the feature maps specified in return_layers.
The same limitations of IntermediateLayerGetter apply here.
-Arguments:
+Args:
backbone (nn.Module)
return_layers (Dict[name, new_name]): a dict containing the names
of the modules for which the activations will be returned as
@@ -73,7 +73,7 @@ def resnet_fpn_backbone(
>>> ('3', torch.Size([1, 256, 2, 2])),
>>> ('pool', torch.Size([1, 256, 1, 1]))]
-Arguments:
+Args:
backbone_name (string): resnet architecture. Possible values are 'ResNet', 'resnet18', 'resnet34', 'resnet50',
'resnet101', 'resnet152', 'resnext50_32x4d', 'resnext101_32x8d', 'wide_resnet50_2', 'wide_resnet101_2'
norm_layer (torchvision.ops): it is recommended to use the default value. For details visit:
8 changes: 4 additions & 4 deletions torchvision/models/detection/faster_rcnn.py
@@ -49,7 +49,7 @@ class FasterRCNN(GeneralizedRCNN):
- labels (Int64Tensor[N]): the predicted labels for each image
- scores (Tensor[N]): the scores of each prediction
-Arguments:
+Args:
backbone (nn.Module): the network used to compute the features for the model.
It should contain an out_channels attribute, which indicates the number of output
channels that each feature map has (and it should be the same for all feature maps).
@@ -239,7 +239,7 @@ class TwoMLPHead(nn.Module):
"""
Standard heads for FPN-based models
-Arguments:
+Args:
in_channels (int): number of input channels
representation_size (int): size of the intermediate representation
"""
@@ -264,7 +264,7 @@ class FastRCNNPredictor(nn.Module):
Standard classification + bounding box regression layers
for Fast R-CNN.
-Arguments:
+Args:
in_channels (int): number of input channels
num_classes (int): number of output classes (including background)
"""
@@ -341,7 +341,7 @@ def fasterrcnn_resnet50_fpn(pretrained=False, progress=True,
>>> # optionally, if you want to export the model to ONNX:
>>> torch.onnx.export(model, x, "faster_rcnn.onnx", opset_version = 11)
-Arguments:
+Args:
pretrained (bool): If True, returns a model pre-trained on COCO train2017
progress (bool): If True, displays a progress bar of the download to stderr
pretrained_backbone (bool): If True, returns a model with backbone pre-trained on Imagenet
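For reference, the standard inference flow (a sketch; it mirrors the docstring example above, with random tensors standing in for real images):

>>> model = fasterrcnn_resnet50_fpn(pretrained=True)
>>> model.eval()
>>> x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
>>> predictions = model(x)  # list of dicts with 'boxes', 'labels', 'scores'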
4 changes: 2 additions & 2 deletions torchvision/models/detection/generalized_rcnn.py
@@ -14,7 +14,7 @@ class GeneralizedRCNN(nn.Module):
"""
Main class for Generalized R-CNN.
-Arguments:
+Args:
backbone (nn.Module):
rpn (nn.Module):
roi_heads (nn.Module): takes the features + the proposals from the RPN and computes
@@ -43,7 +43,7 @@ def eager_outputs(self, losses, detections):
def forward(self, images, targets=None):
# type: (List[Tensor], Optional[List[Dict[str, Tensor]]]) -> Tuple[Dict[str, Tensor], List[Dict[str, Tensor]]]
"""
-Arguments:
+Args:
images (list[Tensor]): images to be processed
targets (list[Dict[Tensor]]): ground-truth boxes present in the image (optional)
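In training mode the forward pass consumes targets and returns a dict of losses; a sketch with placeholder data (reusing the model from the previous example; target keys follow the docstring's ground-truth format):

>>> model.train()
>>> images = [torch.rand(3, 300, 400)]
>>> targets = [{'boxes': torch.tensor([[50., 50., 150., 150.]]),
>>>             'labels': torch.tensor([1])}]
>>> loss_dict = model(images, targets)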
2 changes: 1 addition & 1 deletion torchvision/models/detection/image_list.py
@@ -14,7 +14,7 @@ class ImageList(object):

def __init__(self, tensors: Tensor, image_sizes: List[Tuple[int, int]]):
"""
-Arguments:
+Args:
tensors (tensor)
image_sizes (list[tuple[int, int]])
"""
4 changes: 2 additions & 2 deletions torchvision/models/detection/keypoint_rcnn.py
@@ -44,7 +44,7 @@ class KeypointRCNN(FasterRCNN):
- scores (Tensor[N]): the scores of each prediction
- keypoints (FloatTensor[N, K, 3]): the locations of the predicted keypoints, in [x, y, v] format.
-Arguments:
+Args:
backbone (nn.Module): the network used to compute the features for the model.
It should contain an out_channels attribute, which indicates the number of output
channels that each feature map has (and it should be the same for all feature maps).
@@ -309,7 +309,7 @@ def keypointrcnn_resnet50_fpn(pretrained=False, progress=True,
>>> # optionally, if you want to export the model to ONNX:
>>> torch.onnx.export(model, x, "keypoint_rcnn.onnx", opset_version = 11)
-Arguments:
+Args:
pretrained (bool): If True, returns a model pre-trained on COCO train2017
progress (bool): If True, displays a progress bar of the download to stderr
pretrained_backbone (bool): If True, returns a model with backbone pre-trained on Imagenet
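The keypoint variant follows the same inference flow, with one extra output field (a sketch):

>>> model = keypointrcnn_resnet50_fpn(pretrained=True)
>>> model.eval()
>>> predictions = model([torch.rand(3, 300, 400)])
>>> predictions[0]['keypoints']  # FloatTensor[N, K, 3] in [x, y, v] format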
(The remaining changed files in this commit are not shown.)