diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8bd3e1bd..e13a37ea 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,27 @@
 # Changelog
 
+## 0.11.0 (WIP)
+
+- Feature extractor class changes:
+
+  - `FeatureExtractor` and its built-in subclasses should now be imported like `from spacer.extractors import ...` instead of `from spacer.extract_features import ...`.
+
+  - High-level usage of `FeatureExtractor` instances is the same as before: invoking `__call__()` performs feature extraction on an image. However, subclass implementations should now generally define a `patches_to_features()` method instead of overriding `__call__()`.
+
+  - There is now a `TorchExtractor` class which has details that are specific to PyTorch but not to EfficientNet. So, it's suitable as a starting point for a custom PyTorch extractor that uses another type of network. `EfficientNetExtractor` now inherits from `TorchExtractor`.
+
+  - There are now `CROP_SIZE` and `BATCH_SIZE` class-level variables available.
+
+- Config and test changes:
+
+  - Some former usages of `TEST_BUCKET` have been changed to `CN_FIXTURES_BUCKET`, to more clearly denote test fixtures that are currently only available to CoralNet devs.
+
+  - The remaining usages of `TEST_BUCKET` are now usable by anyone with an AWS account. This can be any S3 bucket that you have read and write access to.
+
+  - `TEST_EXTRACTORS_BUCKET` is now known as `CN_TEST_EXTRACTORS_BUCKET`, again denoting fixtures currently only available to CoralNet devs.
+
+  Related to these changes, more tests are now runnable without needing CoralNet AWS credentials. More tests are runnable in GitHub Actions CI as well (even though CI doesn't use AWS at all).
+
 ## 0.10.0
 
 - AWS credentials can now be obtained through the following methods, in addition to spacer config values as before:
diff --git a/README.md b/README.md
index 4a8ae947..1f4c8369 100644
--- a/README.md
+++ b/README.md
@@ -111,7 +111,7 @@ The `tasks.py` module has four functions which comprise the main interface of py
 
 The first step when analyzing an image, or preparing an image as training data, is extracting [features](https://en.wikipedia.org/wiki/Feature_(computer_vision)) from the image. For this step, you specify a set of points (pixel locations) in the image which you want to analyze. At each point, spacer will crop a square of pixels centered around that location and extract features based on that square.
 
-You'll also need a feature extractor, but spacer does not provide one out of the box. Spacer's `extract_features.py` provides the Python classes `EfficientNetExtractor` for loading EfficientNet extractors in PyTorch format (CoralNet 1.0's default extraction scheme), and `VGG16CaffeExtractor` for loading VGG16 extractors in Caffe format (CoralNet's legacy extraction scheme).
+You'll also need a feature extractor, but spacer does not provide one out of the box. `spacer/extractors` includes the Python classes `EfficientNetExtractor` for loading EfficientNet extractors in PyTorch format (CoralNet 1.0's default extraction scheme), and `VGG16CaffeExtractor` for loading VGG16 extractors in Caffe format (CoralNet's legacy extraction scheme).
 
 You'll either want to match one of these schemes so you can use the provided classes, or you'll have to write your own extractor class which inherits from the base class `FeatureExtractor`. Between the provided classes, the easier one to use will probably be `EfficientNetExtractor`, because Caffe is old software which is more complicated to install.
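+
+If you do write your own extractor class, here's a minimal sketch of the shape it can take (a hypothetical example; `my_network_features` is a placeholder for your own model code, not something spacer provides):
+
+```python
+from spacer.extractors import FeatureExtractor
+
+
+class MyExtractor(FeatureExtractor):
+    # Keys into data_locations for the files this extractor needs.
+    DATA_LOCATION_KEYS = ['weights']
+    # Width and height of the cropped patches; 224 is the default.
+    CROP_SIZE = 224
+
+    @property
+    def feature_dim(self):
+        # Length of each feature vector this extractor produces.
+        return 1280
+
+    def patches_to_features(self, patch_list):
+        # Placeholder: compute one feature vector per cropped patch.
+        features = [my_network_features(patch) for patch in patch_list]
+        # The second return value reports whether the extractor's params
+        # were loaded from a remote source (S3 / URL); none are here.
+        return features, False
+```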
@@ -120,7 +120,7 @@ If you're loading the extractor files remotely (from S3 or from a URL), the file The output of `extract_features()` is a single feature-vector file, which is a JSON file that is deserializable using the `data_classes.ImageFeatures` class. Example usage: ```python -from spacer.extract_features import EfficientNetExtractor +from spacer.extractors import EfficientNetExtractor from spacer.messages import DataLocation, ExtractFeaturesMsg from spacer.tasks import extract_features @@ -369,7 +369,7 @@ This basically does `extract_features` and `classify_features` together in one g Takes an image, a list of pixel locations on that image, a feature extractor, and a classifier. Produces prediction results (scores) for the image points, as posterior probabilities for each class. Example: ```python -from spacer.extract_features import EfficientNetExtractor +from spacer.extractors import EfficientNetExtractor from spacer.messages import DataLocation, ClassifyImageMsg from spacer.tasks import classify_image @@ -406,7 +406,7 @@ for row, col, scores in return_message.scores: If you are using the docker build or local install, you can run the test suite by running `python -m unittest` from the `spacer` directory. -- Expect many tests to be skipped, since most test fixtures aren't set up for public access yet. +- Some tests require Amazon S3 config, Caffe installation, and/or CoralNet infrastructure access. The applicable tests will be skipped if your config doesn't support them. - Run just a single test module with a command like `python -m unittest tests.test_tasks`, or just `python -m tests.test_tasks` (the latter invokes the `if __name__ == '__main__':` part of the module). diff --git a/spacer/caffe_utils.py b/spacer/caffe_utils.py deleted file mode 100644 index 4330e34c..00000000 --- a/spacer/caffe_utils.py +++ /dev/null @@ -1,132 +0,0 @@ -""" -This file contains a set of caffe utility functions copied into this repo for -simplicity. Since support for Caffe will be deprecate, -these are only lightly cleaned up from their original state. -""" - -from __future__ import annotations -from copy import copy -from functools import lru_cache -from typing import Any - -import caffe -import numpy as np - -from spacer import config - - -class Transformer: - """ - Transformer is a class for preprocessing and deprocessing images - according to the vgg16 pre-processing paradigm. - (scaling and mean subtraction.). - """ - - def __init__(self, mean: tuple = (0, 0, 0)) -> None: - self.mean = np.array(mean, dtype=np.float32) - self.scale = 1.0 - - def preprocess(self, im: np.ndarray) -> np.ndarray: - """ - preprocess() emulate the pre-processing occurring - in the vgg16 caffe prototxt. - :param im: numpy array. - :return: normalized numpy array. - """ - im = np.float32(im) - im = im[:, :, ::-1] # change to BGR - im -= self.mean - im *= self.scale - im = im.transpose((2, 0, 1)) - - return im - - def deprocess(self, im: np.ndarray) -> np.ndarray: - """ - inverse of preprocess(). - :param im: normalized numpy array. - :return: original image. - """ - im = im.transpose((1, 2, 0)) - im /= self.scale - im += self.mean - im = im[:, :, ::-1] # change to RGB - - return np.uint8(im) - - -def classify_from_imlist(im_list: list, - net: Any, - transformer: Transformer, - batch_size: int, - scorelayer: str = 'score', - startlayer: str = 'conv1_1') -> list: - """ - classify_from_imlist classifies a list of images and returns - estimated labels and scores. - Only support classification nets (not FCNs). 
- :param im_list: list of images to classify (each stored as a numpy array). - :param net: caffe net object. - :param transformer: transformer object as defined above. - :param batch_size: batch size for the net. - :param scorelayer: name of the score (the last conv) layer. - :param startlayer: name of first convolutional layer. - :return: features list. - """ - with config.log_entry_and_exit('forward pass through net'): - scorelist = [] - for b in range(len(im_list) // batch_size + 1): - for i in range(batch_size): - pos = b * batch_size + i - if pos < len(im_list): - net.blobs['data'].data[i, :, :, :] = \ - transformer.preprocess(im_list[pos]) - net.forward(start=startlayer) - scorelist.extend(list(copy(net.blobs[scorelayer].data). - astype(float))) - - scorelist = scorelist[:len(im_list)] - - return scorelist - - -@lru_cache(maxsize=1) -def load_net(modeldef_path: str, - modelweighs_path: str) -> Any: - """ - load pretrained net. - :param modeldef_path: model path. - :param modelweighs_path: pretrained weights path. - :return: pretrained model. - """ - return caffe.Net(modeldef_path, modelweighs_path, caffe.TEST) - - -def classify_from_patchlist(patchlist: list, - pyparams: dict, - modeldef_path: str, - modelweighs_path: str, - scorelayer: str = 'score', - startlayer: str = 'conv1_1') -> list: - """ - extract features of a list of patches - :param patchlist: a list of patches (cropped images). - :param pyparams: a set of parameters. - :param modeldef_path: model path. - :param modelweighs_path: pretrained weights path. - :param scorelayer: name of the score (the last conv) layer. - :param startlayer: name of first convolutional layer. - :return: a list of features - """ - # Setup caffe - caffe.set_mode_cpu() - net = load_net(modeldef_path, modelweighs_path) - - # Classify - transformer = Transformer(pyparams['im_mean']) - scorelist = classify_from_imlist( - patchlist, net, transformer, pyparams['batch_size'], - scorelayer=scorelayer, startlayer=startlayer - ) - - return scorelist diff --git a/spacer/config.py b/spacer/config.py index 3121f452..630f0fb8 100644 --- a/spacer/config.py +++ b/spacer/config.py @@ -258,21 +258,21 @@ def __exit__(self, exc_type, exc, exc_tb): 'minibatch' ] -# For extractors used in unit tests. -TEST_EXTRACTORS_BUCKET = get_config_value( - 'TEST_EXTRACTORS_BUCKET', default=None) -# For other fixtures used in unit tests. -# -# At least for now, the main reason these bucket names are pulled from -# config is to not expose the bucket names used by the PySpacer core devs. -# However, since these test files are not publicly linked and need to -# live in an S3 bucket with specific filenames (specified by TEST_EXTRACTORS -# and individual tests), the tests are still onerous to set up for anyone -# besides the core devs. This should be addressed sometime. +# Amazon S3 bucket for temporarily storing data during unit tests. +# You'll need write access to this bucket to run the applicable tests. TEST_BUCKET = get_config_value('TEST_BUCKET', default=None) -# A few other fixtures live here. +# A few testing fixtures live here. LOCAL_FIXTURE_DIR = str(APP_DIR / 'tests' / 'fixtures') +# And the rest of the testing fixtures live in these CoralNet-owned +# private buckets. (CoralNet devs should specify the names of the buckets +# in their environment.) +# These tests and fixtures should be reorganized sometime so that anyone can +# run the applicable tests. 
+CN_TEST_EXTRACTORS_BUCKET = get_config_value(
+    'CN_TEST_EXTRACTORS_BUCKET', default=None)
+CN_FIXTURES_BUCKET = get_config_value('CN_FIXTURES_BUCKET', default=None)
+
 STORAGE_TYPES = [
     's3',
     'filesystem',
@@ -315,10 +315,12 @@
     # This is required if you're loading feature extractors from a remote
     # source (S3 or URL).
     'EXTRACTORS_CACHE_DIR',
-    # These are required to run certain unit tests. They're also not really
-    # usable by anyone besides spacer's core devs at the moment.
-    'TEST_EXTRACTORS_BUCKET',
+    # This is required for S3 unit tests.
     'TEST_BUCKET',
+    # These are required to run certain unit tests. They're also only usable
+    # by CoralNet devs at the moment.
+    'CN_TEST_EXTRACTORS_BUCKET',
+    'CN_FIXTURES_BUCKET',
     # These can just be configured as needed, or left as defaults.
     'LOG_DESTINATION',
     'LOG_LEVEL',
diff --git a/spacer/extractors/__init__.py b/spacer/extractors/__init__.py
new file mode 100644
index 00000000..781ae696
--- /dev/null
+++ b/spacer/extractors/__init__.py
@@ -0,0 +1,14 @@
+"""
+Feature extractor classes; a package to allow possible future network extension
+"""
+from .base import DummyExtractor, FeatureExtractor
+from .efficientnet import EfficientNetExtractor
+from .vgg16 import VGG16CaffeExtractor
+
+
+__all__ = [
+    'DummyExtractor',
+    'EfficientNetExtractor',
+    'FeatureExtractor',
+    'VGG16CaffeExtractor',
+]
diff --git a/spacer/extract_features.py b/spacer/extractors/base.py
similarity index 69%
rename from spacer/extract_features.py
rename to spacer/extractors/base.py
index 9d936b30..ec2fd496 100644
--- a/spacer/extract_features.py
+++ b/spacer/extractors/base.py
@@ -17,10 +17,9 @@
 from spacer import config
 from spacer.data_classes import PointFeatures, ImageFeatures
 from spacer.exceptions import ConfigError, HashMismatchError
-from spacer.extract_features_utils import crop_patches
+from spacer.extractors.utils import crop_patches
 from spacer.messages import DataLocation, ExtractFeaturesReturnMsg
 from spacer.storage import storage_factory
-from spacer.torch_utils import extract_feature
 
 
 class FeatureExtractor(abc.ABC):
@@ -28,6 +27,8 @@ class FeatureExtractor(abc.ABC):
     # Subclasses should define their expected data_locations keys here.
     # See __init__() for an explanation of data_locations.
     DATA_LOCATION_KEYS: list[str] = []
+    # Width and height of cropped patches.
+    CROP_SIZE = 224
 
     def __init__(self,
                  data_locations: dict[str, DataLocation],
@@ -49,12 +50,51 @@ def __init__(self,
         # ones you want to check the integrity of.
         self.data_hashes = data_hashes or dict()
 
-    @abc.abstractmethod
     def __call__(self, im: Image, rowcols: list[tuple[int, int]]) \
             -> tuple[ImageFeatures, ExtractFeaturesReturnMsg]:
-        """ Runs the feature extraction """
+        """
+        Runs feature extraction on a single image. rowcols specifies the
+        pixel locations at which to crop square patches, and then features
+        are computed for each patch.
+ """ + start_time = time.time() + + # Crop patches + with config.log_entry_and_exit('cropping of {} patches'.format( + len(rowcols))): + patch_list = crop_patches(im, rowcols, self.CROP_SIZE) + del im + + feats, extractor_loaded_remotely = self.patches_to_features( + patch_list) + + image_features = ImageFeatures( + point_features=[ + PointFeatures(row=rc[0], col=rc[1], data=ft) + for rc, ft in zip(rowcols, feats) + ], + valid_rowcol=True, + feature_dim=len(feats[0]), + npoints=len(feats), + ) + return_msg = ExtractFeaturesReturnMsg( + extractor_loaded_remotely=extractor_loaded_remotely, + runtime=time.time() - start_time, + ) + return image_features, return_msg + + def patches_to_features( + self, patch_list: list[Image]) -> tuple[list, bool]: + """ + Extract features from cropped patches. + :param patch_list: a list of square images of size CROP_SIZE + :return: list of extracted features, one per patch; and a bool + saying whether or not the extractor params were loaded from a + remote source. + """ + raise NotImplementedError @property @abc.abstractmethod @@ -218,8 +258,8 @@ def load_datastream(self, key: str) -> tuple[BytesIO, bool]: class DummyExtractor(FeatureExtractor): """ - This doesn't actually extract any features from the image, - it just returns dummy information. + This doesn't actually extract any features from the image; + it just returns dummy information in the correct format. """ def __init__(self, data_locations: dict[str, DataLocation] = None, @@ -234,14 +274,17 @@ def __init__(self, def __call__(self, im, rowcols): return ImageFeatures( - point_features=[PointFeatures(row=rc[0], - col=rc[1], - data=[random.random() for _ in - range(self.feature_dim)]) - for rc in rowcols], + point_features=[ + PointFeatures( + row=rc[0], + col=rc[1], + data=[random.random() for _ in range(self.feature_dim)], + ) + for rc in rowcols + ], valid_rowcol=True, + feature_dim=self.feature_dim, npoints=len(rowcols), - feature_dim=self.feature_dim ), ExtractFeaturesReturnMsg.example() @property @@ -252,104 +295,3 @@ def serialize(self) -> dict: data = super().serialize() data['feature_dim'] = self.feature_dim return data - - -class VGG16CaffeExtractor(FeatureExtractor): - - # definition should be a Caffe prototxt file, typically .prototxt - # weights should be a Caffe model file, typically .caffemodel - DATA_LOCATION_KEYS = ['definition', 'weights'] - - def __call__(self, im, rowcols): - if not config.HAS_CAFFE: - raise ConfigError( - f"Need Caffe installed to call" - f" {self.__class__.__name__}.") - - # We should only reach this line if it is confirmed caffe is available - from spacer.caffe_utils import classify_from_patchlist - - start_time = time.time() - - # Set caffe parameters - caffe_params = {'im_mean': [128, 128, 128], - 'scaling_method': 'scale', - 'crop_size': 224, - 'batch_size': 10} - - # Crop patches - with config.log_entry_and_exit('cropping of {} patches'.format( - len(rowcols))): - patch_list = crop_patches(im, rowcols, caffe_params['crop_size']) - del im - - # Extract features - definition_filepath, _ = \ - self.load_data_into_filesystem('definition') - weights_filepath, remote_loaded = \ - self.load_data_into_filesystem('weights') - - feats = classify_from_patchlist(patch_list, - caffe_params, - definition_filepath, - weights_filepath, - scorelayer='fc7') - - return \ - ImageFeatures( - point_features=[PointFeatures(row=rc[0], - col=rc[1], - data=ft.tolist()) - for rc, ft in zip(rowcols, feats)], - valid_rowcol=True, - feature_dim=len(feats[0]), - npoints=len(feats) - ), 
ExtractFeaturesReturnMsg( - extractor_loaded_remotely=remote_loaded, - runtime=time.time() - start_time - ) - - @property - def feature_dim(self): - return 4096 - - -class EfficientNetExtractor(FeatureExtractor): - - # weights should be a PyTorch tensor file, typically .pt - DATA_LOCATION_KEYS = ['weights'] - - def __call__(self, im, rowcols): - - start_time = time.time() - - weights_datastream, remote_loaded = self.load_datastream('weights') - - # Set torch parameters - torch_params = {'model_type': 'efficientnet', - 'model_name': 'efficientnet-b0', - 'weights_datastream': weights_datastream, - 'num_class': 1275, - 'crop_size': 224, - 'batch_size': 10} - - # Crop patches - with config.log_entry_and_exit('cropping %s patches' % len(rowcols)): - patch_list = crop_patches(im, rowcols, torch_params['crop_size']) - del im - - # Extract features - feats = extract_feature(patch_list, torch_params) - - return ImageFeatures( - point_features=[PointFeatures(row=rc[0], col=rc[1], data=ft) - for rc, ft in zip(rowcols, feats)], - valid_rowcol=True, feature_dim=len(feats[0]), npoints=len(feats) - ), ExtractFeaturesReturnMsg( - extractor_loaded_remotely=remote_loaded, - runtime=time.time() - start_time - ) - - @property - def feature_dim(self): - return 1280 diff --git a/spacer/models/efficientnet.py b/spacer/extractors/efficientnet.py similarity index 93% rename from spacer/models/efficientnet.py rename to spacer/extractors/efficientnet.py index edc66f9b..07692229 100644 --- a/spacer/models/efficientnet.py +++ b/spacer/extractors/efficientnet.py @@ -1,13 +1,17 @@ """ Adapted from https://github.com/lukemelas/EfficientNet-PyTorch/blob/master/efficientnet_pytorch/model.py + +Note that EfficientNet has since been included as part of torchvision, +although the implementation's at least partly based on lukemelas' work: +https://github.com/pytorch/vision/pull/4293 """ import torch from torch import nn from torch.nn import functional as F -from .effcientnet_utils import ( +from .efficientnet_utils import ( relu_fn, round_filters, round_repeats, @@ -16,6 +20,7 @@ get_model_params, efficientnet_params, ) +from .torch_extractors import TorchExtractor class MBConvBlock(nn.Module): @@ -246,3 +251,20 @@ def _check_model_name_is_valid(cls, model_name): raise ValueError('model_name should be one of: ' + ', '.join( valid_models )) + + +class EfficientNetExtractor(TorchExtractor): + + MODEL_NAME = 'efficientnet-b0' + NUM_CLASSES = 1275 + + @property + def feature_dim(self): + return 1280 + + @classmethod + def untrained_model(cls) -> torch.nn.Module: + return EfficientNet.from_pretrained( + model_name=cls.MODEL_NAME, + num_classes=cls.NUM_CLASSES, + ) diff --git a/spacer/models/effcientnet_utils.py b/spacer/extractors/efficientnet_utils.py similarity index 100% rename from spacer/models/effcientnet_utils.py rename to spacer/extractors/efficientnet_utils.py diff --git a/spacer/extractors/torch_extractors.py b/spacer/extractors/torch_extractors.py new file mode 100644 index 00000000..de8782c0 --- /dev/null +++ b/spacer/extractors/torch_extractors.py @@ -0,0 +1,123 @@ +""" +This file contains a set of pytorch utility functions +""" + +from __future__ import annotations +import abc +from collections import OrderedDict +from io import BytesIO + +import numpy as np +import torch +from torchvision import transforms + +from spacer import config +from spacer.messages import DataLocation +from spacer.storage import storage_factory +from .base import FeatureExtractor + + +def transformation(): + """ + Transform an 
image or numpy array and normalize to [0, 1]
+    :return: transformer which takes in an image and returns a normalized tensor
+    """
+
+    transformer = transforms.Compose([
+        transforms.ToTensor(),
+    ])
+    return transformer
+
+
+class TorchExtractor(FeatureExtractor, abc.ABC):
+
+    # weights should be a PyTorch tensor file, typically .pt
+    DATA_LOCATION_KEYS = ['weights']
+    BATCH_SIZE = 10
+
+    def patches_to_features(self, patch_list):
+
+        # Load pretrained weights
+        weights_datastream, extractor_loaded_remotely = (
+            self.load_datastream('weights'))
+        net = self.load_weights(weights_datastream)
+        net.eval()
+
+        transformer = transformation()
+
+        # Feed forward and extract features
+        batch_size = self.BATCH_SIZE
+        num_batches = int(np.ceil(len(patch_list) / batch_size))
+        feats_list = []
+        with config.log_entry_and_exit('forward pass through net'):
+            for b in range(num_batches):
+                this_batch_size = min(
+                    len(patch_list[b*batch_size:]), batch_size)
+                batch = patch_list[
+                    b*batch_size:(b*batch_size + this_batch_size)]
+                batch = torch.stack([transformer(i) for i in batch])
+                with torch.no_grad():
+                    features = net.extract_features(batch)
+                feats_list.extend(features.tolist())
+
+        # Return the features accumulated across all batches.
+        return feats_list, extractor_loaded_remotely
+
+    @staticmethod
+    def untrained_model() -> torch.nn.Module:
+        """
+        Return a model that's been initialized with the intended
+        parameters, but not trained at all yet.
+        """
+        raise NotImplementedError
+
+    @classmethod
+    def load_weights(cls,
+                     weights_datastream: BytesIO) -> torch.nn.Module:
+        """
+        Load model weights, which may have been saved with DataParallel
+        (as CoralNet 1.0's weights were).
+        Create a new OrderedDict whose keys don't have the `module.` prefix.
+        :param weights_datastream: model weights, already loaded from storage
+        :return: the trained model
+        """
+        # Use GPU if available
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+        model = cls.untrained_model()
+
+        # Load weights
+        state_dicts = torch.load(weights_datastream,
+                                 map_location=device)
+
+        with config.log_entry_and_exit('model initialization'):
+            new_state_dicts = OrderedDict()
+            for k, v in state_dicts['net'].items():
+                # Instead of e.g. `module._conv_stem.weight`,
+                # strip that first part to get `_conv_stem.weight`.
+                # The `module.` prefix comes from saving the weights with
+                # DataParallel.
+                # https://github.com/pytorch/pytorch/issues/9176
+                if k.startswith('module.'):
+                    k = k[7:]
+                new_state_dicts[k] = v
+            model.load_state_dict(new_state_dicts)
+
+        for param in model.parameters():
+            param.requires_grad = False
+        return model
+
+    @classmethod
+    def untrained_instance(cls, data_loc_key='weights'):
+        """
+        Returns an extractor instance that we can use without fixtures,
+        when we only care about valid format and not 'legit'
+        features / predictions (no training epochs are actually run on the
+        extractor).
+        """
+        model = cls.untrained_model()
+        state = dict(net=model.state_dict())
+        with BytesIO() as stream:
+            torch.save(state, stream)
+            storage_factory('memory').store(data_loc_key, stream)
+        weights_loc = DataLocation('memory', data_loc_key)
+        return cls(data_locations=dict(weights=weights_loc))
diff --git a/spacer/extract_features_utils.py b/spacer/extractors/utils.py
similarity index 100%
rename from spacer/extract_features_utils.py
rename to spacer/extractors/utils.py
diff --git a/spacer/extractors/vgg16.py b/spacer/extractors/vgg16.py
new file mode 100644
index 00000000..5198a1ab
--- /dev/null
+++ b/spacer/extractors/vgg16.py
@@ -0,0 +1,128 @@
+"""
+This file contains a set of caffe utility functions copied into this repo for
+simplicity. Since support for Caffe will be deprecated,
+these are only lightly cleaned up from their original state.
+"""
+
+from __future__ import annotations
+from copy import copy
+from functools import lru_cache
+from typing import Any
+
+import numpy as np
+
+from spacer import config
+from spacer.exceptions import ConfigError
+from .base import FeatureExtractor
+
+
+class Transformer:
+    """
+    Transformer is a class for preprocessing and deprocessing images
+    according to the vgg16 pre-processing paradigm
+    (scaling and mean subtraction).
+    """
+
+    def __init__(self, mean: tuple = (0, 0, 0)) -> None:
+        self.mean = np.array(mean, dtype=np.float32)
+        self.scale = 1.0
+
+    def preprocess(self, im: np.ndarray) -> np.ndarray:
+        """
+        preprocess() emulates the pre-processing occurring
+        in the vgg16 caffe prototxt.
+        :param im: numpy array.
+        :return: normalized numpy array.
+        """
+        im = np.float32(im)
+        im = im[:, :, ::-1]  # change to BGR
+        im -= self.mean
+        im *= self.scale
+        im = im.transpose((2, 0, 1))
+
+        return im
+
+    def deprocess(self, im: np.ndarray) -> np.ndarray:
+        """
+        Inverse of preprocess().
+        :param im: normalized numpy array.
+        :return: original image.
+        """
+        im = im.transpose((1, 2, 0))
+        im /= self.scale
+        im += self.mean
+        im = im[:, :, ::-1]  # change to RGB
+
+        return np.uint8(im)
+
+
+class VGG16CaffeExtractor(FeatureExtractor):
+
+    # definition should be a Caffe prototxt file, typically .prototxt
+    # weights should be a Caffe model file, typically .caffemodel
+    DATA_LOCATION_KEYS = ['definition', 'weights']
+
+    BATCH_SIZE = 10
+    # Name of first convolutional layer.
+    START_LAYER = 'conv1_1'
+    # Name of the score (the last conv) layer.
+    SCORE_LAYER = 'fc7'
+
+    def __call__(self, im, rowcols):
+        if not config.HAS_CAFFE:
+            raise ConfigError(
+                f"Need Caffe installed to call"
+                f" {self.__class__.__name__}.")
+
+        return super().__call__(im, rowcols)
+
+    def patches_to_features(self, patch_list):
+        # Load pretrained weights
+        definition_filepath, _ = (
+            self.load_data_into_filesystem('definition'))
+        weights_filepath, extractor_loaded_remotely = (
+            self.load_data_into_filesystem('weights'))
+        net = load_net(definition_filepath, weights_filepath)
+
+        # Extract features.
+        # Although the below code is somewhat network-agnostic, it's only
+        # meant for classification nets (not FCNs).
+ + transformer = Transformer((128, 128, 128)) + + with config.log_entry_and_exit('forward pass through net'): + features = [] + for b in range(len(patch_list) // self.BATCH_SIZE + 1): + for i in range(self.BATCH_SIZE): + pos = b * self.BATCH_SIZE + i + if pos < len(patch_list): + net.blobs['data'].data[i, :, :, :] = \ + transformer.preprocess(patch_list[pos]) + net.forward(start=self.START_LAYER) + features.extend(list( + copy(net.blobs[self.SCORE_LAYER].data).astype(float) + )) + + features = features[:len(patch_list)] + + features = [feat.tolist() for feat in features] + return features, extractor_loaded_remotely + + @property + def feature_dim(self): + return 4096 + + +@lru_cache(maxsize=1) +def load_net(modeldef_path: str, + modelweights_path: str) -> Any: + """ + Load pretrained net. + :param modeldef_path: model path. + :param modelweights_path: pretrained weights path. + :return: pretrained model. + """ + # Should have checked for a Caffe installation before reaching this. + import caffe + caffe.set_mode_cpu() + return caffe.Net(modeldef_path, modelweights_path, caffe.TEST) diff --git a/spacer/messages.py b/spacer/messages.py index 486909c6..28200dfc 100644 --- a/spacer/messages.py +++ b/spacer/messages.py @@ -101,7 +101,7 @@ def __init__(self, @classmethod def example(cls) -> 'ExtractFeaturesMsg': - from spacer.extract_features import EfficientNetExtractor + from spacer.extractors import EfficientNetExtractor return ExtractFeaturesMsg( job_token='123abc', extractor=EfficientNetExtractor( @@ -125,7 +125,7 @@ def serialize(self) -> dict: @classmethod def deserialize(cls, data: dict) -> 'ExtractFeaturesMsg': - from spacer.extract_features import FeatureExtractor + from spacer.extractors import FeatureExtractor return ExtractFeaturesMsg( job_token=data['job_token'], extractor=FeatureExtractor.deserialize(data['extractor']), @@ -433,7 +433,7 @@ def __init__(self, @classmethod def example(cls): - from spacer.extract_features import EfficientNetExtractor + from spacer.extractors import EfficientNetExtractor return ClassifyImageMsg( job_token='my_job', extractor=EfficientNetExtractor( @@ -461,7 +461,7 @@ def serialize(self): @classmethod def deserialize(cls, data: dict) -> 'ClassifyImageMsg': - from spacer.extract_features import FeatureExtractor + from spacer.extractors import FeatureExtractor return ClassifyImageMsg( job_token=data['job_token'], extractor=FeatureExtractor.deserialize(data['extractor']), diff --git a/spacer/models/__init__.py b/spacer/models/__init__.py deleted file mode 100644 index bc5f31d7..00000000 --- a/spacer/models/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -""" -For possible future network extension -""" -from .efficientnet import EfficientNet - - -def get_model(model_type, model_name, num_classes, pretrained=False): - if model_type == 'efficientnet': - model = EfficientNet.from_pretrained(model_name=model_name, - num_classes=num_classes) - else: - raise NotImplementedError(model_name) - return model diff --git a/spacer/tests/common.py b/spacer/tests/common.py index 124abfa4..5a3bf796 100644 --- a/spacer/tests/common.py +++ b/spacer/tests/common.py @@ -8,17 +8,17 @@ # error if the S3 bucket isn't available.) 
TEST_EXTRACTORS = { 'vgg16': dict( - class_path='spacer.extract_features.VGG16CaffeExtractor', + class_path='spacer.extractors.VGG16CaffeExtractor', data_locations=dict( definition=dict( storage_type='s3', key='vgg16_coralnet_ver1.deploy.prototxt', - bucket_name=config.TEST_EXTRACTORS_BUCKET, + bucket_name=config.CN_TEST_EXTRACTORS_BUCKET, ), weights=dict( storage_type='s3', key='vgg16_coralnet_ver1.caffemodel', - bucket_name=config.TEST_EXTRACTORS_BUCKET, + bucket_name=config.CN_TEST_EXTRACTORS_BUCKET, ), ), data_hashes=dict( @@ -29,12 +29,12 @@ ), ), 'efficientnet-b0': dict( - class_path='spacer.extract_features.EfficientNetExtractor', + class_path='spacer.extractors.EfficientNetExtractor', data_locations=dict( weights=dict( storage_type='s3', key='efficientnet_b0_ver1.pt', - bucket_name=config.TEST_EXTRACTORS_BUCKET, + bucket_name=config.CN_TEST_EXTRACTORS_BUCKET, ), ), data_hashes=dict( diff --git a/spacer/tests/decorators.py b/spacer/tests/decorators.py index cea1d0f6..e378159a 100644 --- a/spacer/tests/decorators.py +++ b/spacer/tests/decorators.py @@ -6,10 +6,14 @@ require_caffe = unittest.skipUnless( config.HAS_CAFFE, "Requires Caffe to be installed") -require_test_extractors = unittest.skipUnless( - config.TEST_EXTRACTORS_BUCKET, - "Requires access to the test feature-extractors on S3") +require_cn_test_extractors = unittest.skipUnless( + config.CN_TEST_EXTRACTORS_BUCKET, + "Requires access to the test feature-extractors on CoralNet's S3") -require_test_fixtures = unittest.skipUnless( +require_cn_fixtures = unittest.skipUnless( + config.CN_FIXTURES_BUCKET, + "Requires access to the test fixtures on CoralNet's S3") + +require_s3 = unittest.skipUnless( config.TEST_BUCKET, - "Requires access to the test fixtures on S3") + "Requires write access to an S3 bucket for use in tests") diff --git a/spacer/tests/extractors/__init__.py b/spacer/tests/extractors/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/spacer/tests/test_extract_features.py b/spacer/tests/extractors/test_base.py similarity index 62% rename from spacer/tests/test_extract_features.py rename to spacer/tests/extractors/test_base.py index 17ac6e35..ffbe630d 100644 --- a/spacer/tests/test_extract_features.py +++ b/spacer/tests/extractors/test_base.py @@ -1,3 +1,5 @@ +from contextlib import contextmanager +import hashlib from io import BytesIO import unittest @@ -7,13 +9,14 @@ from spacer import config from spacer.data_classes import ImageFeatures from spacer.exceptions import HashMismatchError -from spacer.extract_features import \ - DummyExtractor, EfficientNetExtractor, FeatureExtractor +from spacer.extractors import ( + DummyExtractor, EfficientNetExtractor, FeatureExtractor) from spacer.messages import ExtractFeaturesReturnMsg, DataLocation from spacer.storage import load_image, storage_factory -from .common import TEST_EXTRACTORS -from .decorators import \ - require_caffe, require_test_extractors, require_test_fixtures +from ..common import TEST_EXTRACTORS +from ..decorators import ( + require_caffe, require_cn_fixtures, require_s3, require_cn_test_extractors) +from ..utils import random_image, temp_s3_filepaths class TestDummyExtractor(unittest.TestCase): @@ -71,11 +74,7 @@ def setUp(self): def do_test_simple(self): - img = load_image(DataLocation( - storage_type='s3', - key='edinburgh3.jpg', - bucket_name=config.TEST_BUCKET, - )) + img = random_image(800, 600) features, return_msg = self.extractor( im=img, rowcols=[(100, 100)], @@ -109,7 +108,7 @@ def do_test_corner_case1(self): img = 
load_image(DataLocation( storage_type='s3', key='kh6dydiix0.jpeg', - bucket_name=config.TEST_BUCKET, + bucket_name=config.CN_FIXTURES_BUCKET, )) features, return_msg = self.extractor( im=img, @@ -133,7 +132,7 @@ def do_test_corner_case2(self): img = load_image(DataLocation( storage_type='s3', key='sfq2mr5qbs.jpeg', - bucket_name=config.TEST_BUCKET, + bucket_name=config.CN_FIXTURES_BUCKET, )) features, return_msg = self.extractor( im=img, @@ -150,52 +149,6 @@ def do_test_corner_case2(self): len(features.point_features[0].data), self.expected_feature_dimension) - def do_test_regression(self, legacy_features_s3_key): - """ - This test runs the extractor on a known image and compares the - results to features extracted with - https://github.com/beijbom/ecs_spacer/releases/tag/1.0 - """ - rowcols = [(20, 265), - (76, 295), - (59, 274), - (151, 62), - (265, 234)] - - img = load_image(DataLocation( - storage_type='s3', - key='08bfc10v7t.png', - bucket_name=config.TEST_BUCKET, - )) - features_new, _ = self.extractor( - im=img, - rowcols=rowcols, - ) - - legacy_feat_loc = DataLocation(storage_type='s3', - key=legacy_features_s3_key, - bucket_name=config.TEST_BUCKET) - features_legacy = ImageFeatures.load(legacy_feat_loc) - - self.assertFalse(features_legacy.valid_rowcol) - self.assertEqual(features_legacy.npoints, len(rowcols)) - self.assertEqual( - features_legacy.feature_dim, - self.expected_feature_dimension) - - self.assertTrue(features_new.valid_rowcol) - self.assertEqual(features_new.npoints, len(rowcols)) - self.assertEqual( - features_new.feature_dim, - self.expected_feature_dimension) - - for pf_new, pf_legacy in zip(features_new.point_features, - features_legacy.point_features): - self.assertTrue(np.allclose(pf_legacy.data, pf_new.data, - atol=1e-5)) - self.assertTrue(pf_legacy.row is None) - self.assertTrue(pf_new.row is not None) - def do_test_image_mode(self, mode): """ Test an image of a particular color mode. 
@@ -235,8 +188,7 @@ def do_test_image_mode(self, mode): @require_caffe -@require_test_extractors -@require_test_fixtures +@require_cn_test_extractors class TestCaffeExtractor(BaseExtractorTest): expected_feature_dimension = 4096 @@ -251,15 +203,14 @@ def test_simple(self): def test_dims(self): super().do_test_dims() + @require_cn_fixtures def test_corner_case1(self): super().do_test_corner_case1() + @require_cn_fixtures def test_corner_case2(self): super().do_test_corner_case2() - def test_regression(self): - super().do_test_regression('08bfc10v7t.png.featurevector') - def test_rgb_mode(self): super().do_test_image_mode('RGB') @@ -273,16 +224,13 @@ def test_la_mode(self): super().do_test_image_mode('LA') -@require_test_extractors -@require_test_fixtures class TestEfficientNetExtractor(BaseExtractorTest): expected_feature_dimension = 1280 @classmethod def setUpClass(cls): - cls.extractor = FeatureExtractor.deserialize( - TEST_EXTRACTORS['efficientnet-b0']) + cls.extractor = EfficientNetExtractor.untrained_instance() def test_simple(self): super().do_test_simple() @@ -290,15 +238,14 @@ def test_simple(self): def test_dims(self): super().do_test_dims() + @require_cn_fixtures def test_corner_case1(self): super().do_test_corner_case1() + @require_cn_fixtures def test_corner_case2(self): super().do_test_corner_case2() - def test_regression(self): - super().do_test_regression('08bfc10v7t.png.effnet.ver1.featurevector') - def test_rgb_mode(self): super().do_test_image_mode('RGB') @@ -312,6 +259,80 @@ def test_la_mode(self): super().do_test_image_mode('LA') +@require_cn_fixtures +@require_cn_test_extractors +class TestRegression(unittest.TestCase): + + def do_test(self, extractor, legacy_features_s3_key, expected_feature_dim): + """ + This test runs the extractor on a known image and compares the + results to features extracted with + https://github.com/beijbom/ecs_spacer/releases/tag/1.0 + """ + rowcols = [(20, 265), + (76, 295), + (59, 274), + (151, 62), + (265, 234)] + + img = load_image(DataLocation( + storage_type='s3', + key='08bfc10v7t.png', + bucket_name=config.CN_FIXTURES_BUCKET, + )) + features_new, _ = extractor( + im=img, + rowcols=rowcols, + ) + + legacy_feat_loc = DataLocation(storage_type='s3', + key=legacy_features_s3_key, + bucket_name=config.CN_FIXTURES_BUCKET) + features_legacy = ImageFeatures.load(legacy_feat_loc) + + self.assertFalse(features_legacy.valid_rowcol) + self.assertEqual(features_legacy.npoints, len(rowcols)) + self.assertEqual( + features_legacy.feature_dim, + expected_feature_dim) + + self.assertTrue(features_new.valid_rowcol) + self.assertEqual(features_new.npoints, len(rowcols)) + self.assertEqual( + features_new.feature_dim, + expected_feature_dim) + + for pf_new, pf_legacy in zip(features_new.point_features, + features_legacy.point_features): + self.assertTrue(np.allclose(pf_legacy.data, pf_new.data, + atol=1e-5)) + self.assertTrue(pf_legacy.row is None) + self.assertTrue(pf_new.row is not None) + + @require_caffe + def test_vgg16(self): + self.do_test( + FeatureExtractor.deserialize(TEST_EXTRACTORS['vgg16']), + '08bfc10v7t.png.featurevector', + 4096, + ) + + def test_efficientnet(self): + self.do_test( + FeatureExtractor.deserialize(TEST_EXTRACTORS['efficientnet-b0']), + '08bfc10v7t.png.effnet.ver1.featurevector', + 1280, + ) + + +class SampleExtractor(FeatureExtractor): + DATA_LOCATION_KEYS = ['weights_1', 'weights_2'] + + @property + def feature_dim(self): + return 1280 + + class TestExtractorLoad(unittest.TestCase): @classmethod @@ -319,101 +340,131 
@@ def setUpClass(cls): cls.file_storage = storage_factory('filesystem') cls.memory_storage = storage_factory('memory') - @require_test_extractors + @contextmanager + def s3_extractor(self): + with temp_s3_filepaths(config.TEST_BUCKET, 2) as s3_filepaths: + data_locations = dict( + weights_1=DataLocation( + 's3', s3_filepaths[0], bucket_name=config.TEST_BUCKET), + weights_2=DataLocation( + 's3', s3_filepaths[1], bucket_name=config.TEST_BUCKET), + ) + + extractor_s3_storage = storage_factory('s3', config.TEST_BUCKET) + extractor_s3_storage.store( + data_locations['weights_1'].key, BytesIO(b'sample content')) + extractor_s3_storage.store( + data_locations['weights_2'].key, BytesIO(b'sample content 2')) + + data_hashes = dict( + # Each hash is 64 hex digits + weights_1=hashlib.sha256(b'sample content').hexdigest(), + weights_2=hashlib.sha256(b'sample content 2').hexdigest(), + ) + + extractor = SampleExtractor( + data_locations=data_locations, data_hashes=data_hashes) + + yield extractor + + @require_s3 def test_remote_filesystem_load(self): """ Extractor caching only happens for extractors downloaded remotely (from S3 or URL). - We test with the VGG16 definition file because that's the - smallest of the test-extractor files, and thus quickest - to download. """ - extractor = FeatureExtractor.deserialize(TEST_EXTRACTORS['vgg16']) - key = 'definition' - - extractor.decache_remote_loaded_file(key) - filepath_for_cache = str(extractor.data_filepath_for_cache(key)) - self.assertFalse( - self.file_storage.exists(filepath_for_cache), - msg="decache call should've worked") - - # Test cache miss. - filepath_loaded, remote_loaded = \ - extractor.load_data_into_filesystem(key) - self.assertTrue(remote_loaded) - self.assertTrue( - self.file_storage.exists(filepath_loaded), - msg="Should be loaded into cache after a cache miss") - self.assertEqual(filepath_for_cache, filepath_loaded) - - # Test cache hit. - _, remote_loaded = extractor.load_data_into_filesystem(key) - self.assertFalse(remote_loaded) - - @require_test_extractors + with self.s3_extractor() as extractor: + key = 'weights_1' + + # Ensure the file's uncached. + extractor.decache_remote_loaded_file(key) + filepath_for_cache = str(extractor.data_filepath_for_cache(key)) + self.assertFalse( + self.file_storage.exists(filepath_for_cache), + msg="Should not be in cache") + + # Test cache miss. + filepath_loaded, remote_loaded = \ + extractor.load_data_into_filesystem(key) + self.assertTrue(remote_loaded) + self.assertTrue( + self.file_storage.exists(filepath_loaded), + msg="Should be loaded into cache after a cache miss") + self.assertEqual(filepath_for_cache, filepath_loaded) + + # Test cache hit. + _, remote_loaded = extractor.load_data_into_filesystem(key) + self.assertFalse(remote_loaded) + + @require_s3 def test_remote_datastream_load(self): - extractor = FeatureExtractor.deserialize(TEST_EXTRACTORS['vgg16']) - key = 'definition' - - extractor.decache_remote_loaded_file(key) - filepath_for_cache = str(extractor.data_filepath_for_cache(key)) - - # Test cache miss. - datastream, remote_loaded = \ - extractor.load_datastream(key) - self.assertTrue(remote_loaded) - self.assertTrue( - self.file_storage.exists(filepath_for_cache), - msg="Should be loaded into cache after a cache miss") - self.assertEqual( - datastream.tell(), 0, - msg="datastream should be at the start of the file") - - # Test cache hit. 
- _, remote_loaded = extractor.load_data_into_filesystem(key) - self.assertFalse(remote_loaded) - - @require_test_extractors + with self.s3_extractor() as extractor: + key = 'weights_1' + + # Ensure the file's uncached. + extractor.decache_remote_loaded_file(key) + filepath_for_cache = str(extractor.data_filepath_for_cache(key)) + + # Test cache miss. + datastream, remote_loaded = \ + extractor.load_datastream(key) + self.assertTrue(remote_loaded) + self.assertTrue( + self.file_storage.exists(filepath_for_cache), + msg="Should be loaded into cache after a cache miss") + self.assertEqual( + datastream.tell(), 0, + msg="datastream should be at the start of the file") + + # Test cache hit. + _, remote_loaded = extractor.load_data_into_filesystem(key) + self.assertFalse(remote_loaded) + + @require_s3 def test_remote_hash_mismatch(self): - serialized_extractor = TEST_EXTRACTORS['vgg16'].copy() - serialized_extractor['data_hashes']['definition'] = '1'*64 - extractor = FeatureExtractor.deserialize(serialized_extractor) - key = 'definition' - - extractor.decache_remote_loaded_file(key) + with self.s3_extractor() as extractor: + key = 'weights_1' - with self.assertRaises(HashMismatchError): - extractor.load_datastream(key) + # Bogus hash. + extractor.data_hashes[key] = '1'*64 - filepath_for_cache = str(extractor.data_filepath_for_cache(key)) - self.assertFalse( - self.file_storage.exists(filepath_for_cache), - msg="Should not keep in cache after a hash mismatch") + # Ensure the file's uncached. + extractor.decache_remote_loaded_file(key) - @require_test_extractors - def test_remote_no_hash(self): - serialized_extractor = TEST_EXTRACTORS['vgg16'].copy() - del serialized_extractor['data_hashes']['definition'] - extractor = FeatureExtractor.deserialize(serialized_extractor) - key = 'definition' + with self.assertRaises(HashMismatchError): + extractor.load_datastream(key) - extractor.decache_remote_loaded_file(key) - filepath_for_cache = str(extractor.data_filepath_for_cache(key)) + filepath_for_cache = str(extractor.data_filepath_for_cache(key)) + self.assertFalse( + self.file_storage.exists(filepath_for_cache), + msg="Should not keep in cache after a hash mismatch") - # Test cache miss. - datastream, remote_loaded = \ - extractor.load_datastream(key) - self.assertTrue(remote_loaded) - self.assertTrue( - self.file_storage.exists(filepath_for_cache), - msg="Should be loaded into cache after a cache miss") - self.assertEqual( - datastream.tell(), 0, - msg="datastream should be at the start of the file") - - # Test cache hit. - _, remote_loaded = extractor.load_data_into_filesystem(key) - self.assertFalse(remote_loaded) + @require_s3 + def test_remote_no_hash(self): + with self.s3_extractor() as extractor: + key = 'weights_1' + + # Delete the hash. + del extractor.data_hashes[key] + + # Ensure the file's uncached. + extractor.decache_remote_loaded_file(key) + filepath_for_cache = str(extractor.data_filepath_for_cache(key)) + + # Test cache miss. + datastream, remote_loaded = \ + extractor.load_datastream(key) + self.assertTrue(remote_loaded) + self.assertTrue( + self.file_storage.exists(filepath_for_cache), + msg="Should be loaded into cache after a cache miss") + self.assertEqual( + datastream.tell(), 0, + msg="datastream should be at the start of the file") + + # Test cache hit. 
+ _, remote_loaded = extractor.load_data_into_filesystem(key) + self.assertFalse(remote_loaded) def test_local(self): key = 'weights' @@ -425,10 +476,7 @@ def test_local(self): weights=DataLocation('memory', key), ), data_hashes=dict( - # This is the result of - # hashlib.sha256(b'test bytes').hexdigest() - weights='4be66ea6f5222861df37e88d4635bffb' - '99e183435f79fba13055b835b5dc420b', + weights=hashlib.sha256(b'test bytes').hexdigest(), ), ) diff --git a/spacer/tests/test_models.py b/spacer/tests/extractors/test_efficientnet.py similarity index 79% rename from spacer/tests/test_models.py rename to spacer/tests/extractors/test_efficientnet.py index 1d1f0962..8be3d338 100644 --- a/spacer/tests/test_models.py +++ b/spacer/tests/extractors/test_efficientnet.py @@ -3,8 +3,8 @@ import torch -from spacer import models -from spacer.models.effcientnet_utils import \ +from spacer.extractors import EfficientNetExtractor +from spacer.extractors.efficientnet_utils import \ round_filters, \ round_repeats, \ drop_connect, \ @@ -14,29 +14,21 @@ get_model_params -class TestGetModels(unittest.TestCase): - - def test_invalid_model(self): - with self.assertRaises(NotImplementedError): - _ = models.get_model(model_type='dummy', - model_name='dummy', - num_classes=1000) +class SampleExtractor(EfficientNetExtractor): + MODEL_NAME = 'efficientnet-b0' + NUM_CLASSES = 1000 class TestEfficientNet(unittest.TestCase): def test_efficientnet(self): - model_param = {'model_type': 'efficientnet', - 'model_name': 'efficientnet-b0', - 'num_classes': 1000} - net = models.get_model(model_type=model_param['model_type'], - model_name=model_param['model_name'], - num_classes=model_param['num_classes']) + net = SampleExtractor.untrained_model() with torch.no_grad(): output = net(torch.rand(1, 3, 224, 224)) - self.assertEqual(net.get_image_size(model_param['model_name']), 224) - self.assertEqual(output.shape[1], model_param['num_classes']) + self.assertEqual( + net.get_image_size(SampleExtractor.MODEL_NAME), 224) + self.assertEqual(output.shape[1], SampleExtractor.NUM_CLASSES) with self.assertRaises(ValueError): net._check_model_name_is_valid(model_name='dummy') diff --git a/spacer/tests/test_torch_utils.py b/spacer/tests/extractors/test_torch_extractors.py similarity index 55% rename from spacer/tests/test_torch_utils.py rename to spacer/tests/extractors/test_torch_extractors.py index b2aedf46..084b8a3f 100644 --- a/spacer/tests/test_torch_utils.py +++ b/spacer/tests/extractors/test_torch_extractors.py @@ -5,10 +5,8 @@ from PIL import Image from torchvision import transforms -from spacer.extract_features import FeatureExtractor -from spacer.torch_utils import extract_feature, transformation -from .common import TEST_EXTRACTORS -from .decorators import require_test_extractors +from spacer.extractors import EfficientNetExtractor +from spacer.extractors.torch_extractors import transformation class TestTransformation(unittest.TestCase): @@ -38,28 +36,19 @@ def test_transformer(self): self.assertTrue(np.allclose(output.numpy(), expected_output)) -@require_test_extractors -class TestExtractFeatures(unittest.TestCase): +class TestPatchesToFeatures(unittest.TestCase): @classmethod def setUpClass(cls): - cls.extractor = FeatureExtractor.deserialize( - TEST_EXTRACTORS['efficientnet-b0']) - - def test_rgb(self): - - weights_datastream, _ = self.extractor.load_datastream('weights') - torch_params = {'model_type': 'efficientnet', - 'model_name': 'efficientnet-b0', - 'weights_datastream': weights_datastream, - 'num_class': 1275, - 
'crop_size': 224, - 'batch_size': 10} - patch_list = [np.array(Image.new('RGB', (224, 224))), - np.array(Image.new('RGB', (224, 224))), - np.array(Image.new('RGB', (224, 224)))] - feats = extract_feature(patch_list=patch_list, - pyparams=torch_params) + cls.extractor = EfficientNetExtractor.untrained_instance() + + def test(self): + crop_size = self.extractor.CROP_SIZE + patch_list = [np.array(Image.new('RGB', (crop_size, crop_size))), + np.array(Image.new('RGB', (crop_size, crop_size))), + np.array(Image.new('RGB', (crop_size, crop_size)))] + feats, remote_loaded = self.extractor.patches_to_features( + patch_list=patch_list) self.assertEqual(len(feats), len(patch_list)) self.assertEqual(len(feats[0]), 1280) diff --git a/spacer/tests/test_extract_features_utils.py b/spacer/tests/extractors/test_utils.py similarity index 94% rename from spacer/tests/test_extract_features_utils.py rename to spacer/tests/extractors/test_utils.py index d79e123e..6383d022 100644 --- a/spacer/tests/test_extract_features_utils.py +++ b/spacer/tests/extractors/test_utils.py @@ -2,7 +2,7 @@ from PIL import Image -from spacer.extract_features_utils import crop_patches +from spacer.extractors.utils import crop_patches class TestCropPatch(unittest.TestCase): diff --git a/spacer/tests/extractors/test_vgg16.py b/spacer/tests/extractors/test_vgg16.py new file mode 100644 index 00000000..9f34738b --- /dev/null +++ b/spacer/tests/extractors/test_vgg16.py @@ -0,0 +1,66 @@ +import time +import unittest + +import numpy as np +from PIL import Image + +from spacer import config +from spacer.extractors import FeatureExtractor +from spacer.extractors.vgg16 import load_net, Transformer +from ..common import TEST_EXTRACTORS +from ..decorators import require_caffe, require_cn_test_extractors + + +class TestTransformer(unittest.TestCase): + + def test_process(self): + trans = Transformer() + im_pil = Image.new('RGB', (50, 50)) + im_arr = np.asarray(im_pil) + im_arr2 = trans.deprocess(trans.preprocess(im_arr)) + self.assertTrue(np.array_equal(im_arr, im_arr2)) + + +@require_caffe +@require_cn_test_extractors +class TestVGG16CaffeExtractor(unittest.TestCase): + + @classmethod + def setUpClass(cls): + config.filter_warnings() + + cls.extractor = FeatureExtractor.deserialize(TEST_EXTRACTORS['vgg16']) + cls.definition_filepath, _ = \ + cls.extractor.load_data_into_filesystem('definition') + cls.weights_filepath, _ = \ + cls.extractor.load_data_into_filesystem('weights') + + def test_patches_to_features(self): + crop_size = self.extractor.CROP_SIZE + patch_list = [np.array(Image.new('RGB', (crop_size, crop_size))), + np.array(Image.new('RGB', (crop_size, crop_size))), + np.array(Image.new('RGB', (crop_size, crop_size)))] + feats, remote_loaded = self.extractor.patches_to_features( + patch_list=patch_list) + self.assertEqual(len(feats), len(patch_list)) + self.assertEqual(len(feats[0]), 4096) + + def test_load_net_caching(self): + """ + Call load_net() twice to check if the LRU caching + on that method works. + """ + # Clear cache to make sure it's not set from previous test. 
+ load_net.cache_clear() + t0 = time.time() + _ = load_net(self.definition_filepath, self.weights_filepath) + t1 = time.time() - t0 + + t0 = time.time() + _ = load_net(self.definition_filepath, self.weights_filepath) + t2 = time.time() - t0 + self.assertLess(t2, t1) + + +if __name__ == '__main__': + unittest.main() diff --git a/spacer/tests/test_caffe_utils.py b/spacer/tests/test_caffe_utils.py deleted file mode 100644 index c9559d4c..00000000 --- a/spacer/tests/test_caffe_utils.py +++ /dev/null @@ -1,75 +0,0 @@ -import time -import unittest - -import numpy as np -from PIL import Image - -from spacer import config -from spacer.extract_features import FeatureExtractor -from .common import TEST_EXTRACTORS -from .decorators import require_caffe, require_test_extractors - - -@require_caffe -class TestTransformer(unittest.TestCase): - - def test_process(self): - from spacer.caffe_utils import Transformer - trans = Transformer() - im_pil = Image.new('RGB', (50, 50)) - im_arr = np.asarray(im_pil) - im_arr2 = trans.deprocess(trans.preprocess(im_arr)) - self.assertTrue(np.array_equal(im_arr, im_arr2)) - - -@require_caffe -@require_test_extractors -class TestClassifyFromPatchList(unittest.TestCase): - - @classmethod - def setUpClass(cls): - config.filter_warnings() - - extractor = FeatureExtractor.deserialize(TEST_EXTRACTORS['vgg16']) - cls.definition_filepath, _ = \ - extractor.load_data_into_filesystem('definition') - cls.weights_filepath, _ = \ - extractor.load_data_into_filesystem('weights') - - def test_rgb(self): - from spacer.caffe_utils import classify_from_patchlist - - caffe_params = {'im_mean': [128, 128, 128], - 'scaling_method': 'scale', - 'crop_size': 224, - 'batch_size': 10} - - feats = classify_from_patchlist( - [np.array(Image.new('RGB', (224, 224)))], - caffe_params, - self.definition_filepath, - self.weights_filepath, - scorelayer='fc7') - self.assertEqual(len(feats), 1) - self.assertEqual(len(feats[0]), 4096) - - def test_net_caching(self): - """ Call classify_from_patchlist twice to check if the LRU caching on - load_net method works - """ - from spacer.caffe_utils import load_net - - # Clear cache to make sure it's not set from previous test. 
- load_net.cache_clear() - t0 = time.time() - _ = load_net(self.definition_filepath, self.weights_filepath) - t1 = time.time() - t0 - - t0 = time.time() - _ = load_net(self.definition_filepath, self.weights_filepath) - t2 = time.time() - t0 - self.assertLess(t2, t1) - - -if __name__ == '__main__': - unittest.main() diff --git a/spacer/tests/test_data_classes.py b/spacer/tests/test_data_classes.py index cdeede4a..13a6c167 100644 --- a/spacer/tests/test_data_classes.py +++ b/spacer/tests/test_data_classes.py @@ -10,7 +10,7 @@ ImageLabels, \ ValResults from spacer.messages import DataLocation -from .decorators import require_test_fixtures +from .decorators import require_cn_fixtures, require_s3 from .utils import temp_filesystem_data_location @@ -38,11 +38,11 @@ def test_legacy(self): self.assertEqual(feats.npoints, len(feats.point_features)) self.assertEqual(feats.feature_dim, len(feats.point_features[0].data)) - @require_test_fixtures + @require_cn_fixtures def test_legacy_from_s3(self): legacy_feat_loc = DataLocation(storage_type='s3', key='08bfc10v7t.png.featurevector', - bucket_name=config.TEST_BUCKET) + bucket_name=config.CN_FIXTURES_BUCKET) feats = ImageFeatures.load(legacy_feat_loc) self.assertEqual(feats.valid_rowcol, False) @@ -65,7 +65,7 @@ def test_legacy_getitme(self): class TestImageFeaturesNumpyStore(unittest.TestCase): - @require_test_fixtures + @require_s3 def test_s3(self): s3_loc = DataLocation( storage_type='s3', @@ -177,11 +177,11 @@ def test_asserts(self): self.assertRaises(AssertionError, ValResults, gt=gt, est=est, scores=scores, classes=classes) - @require_test_fixtures + @require_cn_fixtures def test_legacy(self): legacy_loc = DataLocation(storage_type='s3', key='beta.valresult', - bucket_name=config.TEST_BUCKET) + bucket_name=config.CN_FIXTURES_BUCKET) res = ValResults.load(legacy_loc) self.assertEqual(res, ValResults.deserialize(json.loads( diff --git a/spacer/tests/test_legacy.py b/spacer/tests/test_legacy.py index d55994b8..897480c2 100644 --- a/spacer/tests/test_legacy.py +++ b/spacer/tests/test_legacy.py @@ -9,18 +9,17 @@ from spacer import config from spacer.data_classes import ImageFeatures -from spacer.extract_features import FeatureExtractor +from spacer.extractors import FeatureExtractor from spacer.messages import \ DataLocation, \ ExtractFeaturesMsg, \ ClassifyFeaturesMsg, \ ClassifyReturnMsg -from spacer.storage import storage_factory from spacer.tasks import classify_features, extract_features from spacer.tests.utils import cn_beta_fixture_location from .common import TEST_EXTRACTORS from .decorators import \ - require_caffe, require_test_extractors, require_test_fixtures + require_caffe, require_cn_fixtures, require_cn_test_extractors cn_beta_fixtures = { @@ -43,7 +42,7 @@ def pyspacer031_vgg16_fixture_location(key): return DataLocation( storage_type='s3', - bucket_name=config.TEST_BUCKET, + bucket_name=config.CN_FIXTURES_BUCKET, key='legacy_compat/pyspacer_0.3.1/vgg16/' + key ) @@ -51,7 +50,7 @@ def pyspacer031_vgg16_fixture_location(key): def pyspacer031_efficientnet_fixture_location(key): return DataLocation( storage_type='s3', - bucket_name=config.TEST_BUCKET, + bucket_name=config.CN_FIXTURES_BUCKET, key='legacy_compat/pyspacer_0.3.1/efficientnet/' + key ) @@ -84,8 +83,8 @@ def extract_and_classify(im_key, clf_key, rowcol): @require_caffe -@require_test_extractors -@require_test_fixtures +@require_cn_test_extractors +@require_cn_fixtures class TestExtractFeatures(unittest.TestCase): """ Test pyspacer's Caffe extractor and compare to features 
extracted using @@ -141,7 +140,7 @@ def test_png(self): atol=1e-5)) -@require_test_fixtures +@require_cn_fixtures class TestClassifyFeatures(unittest.TestCase): """ Get scores from the current classify_features task using previous @@ -223,8 +222,8 @@ def test_pyspacer_0_3_1_efficientnet(self): @require_caffe -@require_test_extractors -@require_test_fixtures +@require_cn_test_extractors +@require_cn_fixtures class TestExtractClassify(unittest.TestCase): """ Tests new feature extractor and a classification against legacy. Test passes if the same class is assigned in both cases for each @@ -232,7 +231,6 @@ class TestExtractClassify(unittest.TestCase): def setUp(self): config.filter_warnings() - self.storage = storage_factory('s3', config.TEST_BUCKET) def test_tricky_example(self): """ From regression testing, this particular row, col location diff --git a/spacer/tests/test_mailman.py b/spacer/tests/test_mailman.py index c6bbb11b..d622fda9 100644 --- a/spacer/tests/test_mailman.py +++ b/spacer/tests/test_mailman.py @@ -1,7 +1,7 @@ import unittest from spacer import config -from spacer.extract_features import DummyExtractor +from spacer.extractors import DummyExtractor from spacer.messages import ( ClassifyImageMsg, DataLocation, diff --git a/spacer/tests/test_storage.py b/spacer/tests/test_storage.py index b2d0c17c..d66c4d87 100644 --- a/spacer/tests/test_storage.py +++ b/spacer/tests/test_storage.py @@ -26,7 +26,7 @@ clear_memory_storage from spacer.tests.utils import cn_beta_fixture_location from spacer.train_utils import make_random_data, train -from .decorators import require_test_fixtures +from .decorators import require_cn_fixtures, require_s3 from .utils import temp_filesystem_data_location @@ -113,11 +113,11 @@ def setUpClass(cls): def s3_url(filepath): return ( 'https://' - f'{config.TEST_BUCKET}.s3-{config.AWS_REGION}.amazonaws.com/' + f'{config.CN_FIXTURES_BUCKET}.s3-{config.AWS_REGION}.amazonaws.com/' f'{filepath}' ) - @require_test_fixtures + @require_cn_fixtures def test_load_image(self): loc = DataLocation( storage_type='url', @@ -126,7 +126,7 @@ def test_load_image(self): img = load_image(loc) self.assertTrue(isinstance(img, Image.Image)) - @require_test_fixtures + @require_cn_fixtures def test_load_classifier(self): loc = DataLocation( storage_type='url', @@ -135,7 +135,7 @@ def test_load_classifier(self): clf = load_classifier(loc) self.assertTrue(isinstance(clf, CalibratedClassifierCV)) - @require_test_fixtures + @require_cn_fixtures def test_load_string(self): loc = DataLocation( storage_type='url', @@ -144,7 +144,7 @@ def test_load_string(self): feats = ImageFeatures.load(loc) self.assertTrue(isinstance(feats, ImageFeatures)) - @require_test_fixtures + @require_cn_fixtures def test_exists_true(self): self.assertTrue(self.storage.exists(self.s3_url('08bfc10v7t.png'))) @@ -214,7 +214,7 @@ def return_fake_response(*args, **kwargs): self.assertIn("full response", str(cm.exception)) -@require_test_fixtures +@require_s3 class TestS3Storage(BaseStorageTest): def setUp(self): @@ -250,6 +250,7 @@ def test_load_store_image(self): self.assertTrue(np.allclose(np.array(img), np.array(img2), atol=1e-5)) self.assertTrue(isinstance(img2, Image.Image)) + @require_cn_fixtures def test_load_legacy_features(self): feats = ImageFeatures.load( cn_beta_fixture_location('example.jpg.feats')) @@ -259,6 +260,7 @@ def test_load_legacy_features(self): def test_delete(self): self.do_test_delete() + @require_cn_fixtures def test_load_legacy_model(self): clf = 
load_classifier(cn_beta_fixture_location('example.model')) self.assertTrue(isinstance(clf, CalibratedClassifierCV)) diff --git a/spacer/tests/test_tasks.py b/spacer/tests/test_tasks.py index 7c8289a7..3603ec99 100644 --- a/spacer/tests/test_tasks.py +++ b/spacer/tests/test_tasks.py @@ -9,7 +9,7 @@ ImageFeatures, ImageLabels, LabelId, PointFeatures, ValResults) from spacer.exceptions import ( DataLimitError, RowColumnInvalidError, RowColumnMismatchError) -from spacer.extract_features import DummyExtractor +from spacer.extractors import DummyExtractor from spacer.messages import ( ClassifyFeaturesMsg, ClassifyImageMsg, @@ -38,7 +38,7 @@ from spacer.task_utils import preprocess_labels from spacer.tests.utils import cn_beta_fixture_location, temp_s3_filepaths from spacer.train_utils import make_random_data, train -from .decorators import require_test_fixtures +from .decorators import require_cn_fixtures, require_s3 TEST_URL = \ 'https://upload.wikimedia.org/wikipedia/commons/7/7b/Red_sea_coral_reef.jpg' @@ -346,7 +346,7 @@ def do_feature_caching_test(feature_cache_dir): return load_remote.mock_obj - @require_test_fixtures + @require_s3 def test_feature_caching_enabled(self): load_remote_mock = self.do_feature_caching_test( TrainClassifierMsg.FeatureCache.AUTO) @@ -355,7 +355,7 @@ def test_feature_caching_enabled(self): "Should go like: download ref, download train, cache-load" " train, download val") - @require_test_fixtures + @require_s3 def test_feature_caching_disabled(self): load_remote_mock = self.do_feature_caching_test( TrainClassifierMsg.FeatureCache.DISABLED) @@ -432,7 +432,7 @@ class TestClassifyFeatures(ClassifyReturnMsgTest): def setUp(self): config.filter_warnings() - @require_test_fixtures + @require_cn_fixtures def test_legacy(self): msg = ClassifyFeaturesMsg( job_token='my_job', @@ -443,7 +443,7 @@ def test_legacy(self): return_msg = classify_features(msg) self._validate_return_msg(return_msg, False) - @require_test_fixtures + @require_cn_fixtures def test_new(self): feats = ImageFeatures.make_random([1, 2, 3, 2], feature_dim=4096) @@ -494,7 +494,7 @@ class TestClassifyImage(ClassifyReturnMsgTest): def setUp(self): config.filter_warnings() - @require_test_fixtures + @require_cn_fixtures def test_deploy_simple(self): msg = ClassifyImageMsg( job_token='my_job', @@ -535,7 +535,7 @@ class TestClassifyImageCache(unittest.TestCase): def setUp(self): config.filter_warnings() - @require_test_fixtures + @require_cn_fixtures def test_classify_image_with_caching(self): """ Call classify_image three times. The first 2 time with same message. 
@@ -571,7 +571,7 @@ def test_classify_image_with_caching(self):
 
 class TestClassifyBadRowcols(unittest.TestCase):
 
-    @require_test_fixtures
+    @require_cn_fixtures
     def test_image_classify(self):
         msg = ClassifyImageMsg(
             job_token='my_job',
diff --git a/spacer/tests/utils.py b/spacer/tests/utils.py
index 275c2510..55083868 100644
--- a/spacer/tests/utils.py
+++ b/spacer/tests/utils.py
@@ -4,6 +4,9 @@
 import tempfile
 import uuid
 
+import numpy as np
+from PIL import Image
+
 from spacer import config
 from spacer.messages import DataLocation
 from spacer.storage import FileSystemStorage, S3Storage
@@ -12,7 +15,7 @@ def cn_beta_fixture_location(key):
     return DataLocation(
         storage_type='s3',
-        bucket_name=config.TEST_BUCKET,
+        bucket_name=config.CN_FIXTURES_BUCKET,
         key='legacy_compat/coralnet_beta/' + key
     )
@@ -58,3 +61,12 @@
     for filename in filenames:
         if storage.exists(filename):
             storage.delete(filename)
+
+
+def random_image(width, height) -> Image.Image:
+    """
+    Source: https://stackoverflow.com/a/10901092
+    """
+    # numpy's first axis is rows, so the array shape is (height, width, 3)
+    arr = np.random.rand(height, width, 3) * 255
+    return Image.fromarray(arr.astype('uint8')).convert('RGB')
diff --git a/spacer/torch_utils.py b/spacer/torch_utils.py
deleted file mode 100644
index bb645146..00000000
--- a/spacer/torch_utils.py
+++ /dev/null
@@ -1,86 +0,0 @@
-"""
-This file contains a set of pytorch utility functions
-"""
-
-from __future__ import annotations
-from collections import OrderedDict
-from io import BytesIO
-from typing import Any
-
-import numpy as np
-import torch
-from torchvision import transforms
-
-from spacer import config
-from spacer import models
-
-
-def transformation():
-    """
-    Transform an image or numpy array and normalize to [0, 1]
-    :return: transformer which takes in a image and return a normalized tensor
-    """
-
-    transformer = transforms.Compose([
-        transforms.ToTensor(),
-    ])
-    return transformer
-
-
-def load_weights(model: Any,
-                 weights_datastream: BytesIO) -> Any:
-    """
-    Load model weights, original weight saved with DataParallel
-    Create new OrderedDict that does not contain `module`.
- :param model: Currently support EfficientNet - :param weights_datastream: model weights, already loaded from storage - :return: well trained model - """ - # Use GPU if available - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - # Load weights - state_dicts = torch.load(weights_datastream, - map_location=device) - - with config.log_entry_and_exit('model initialization'): - new_state_dicts = OrderedDict() - for k, v in state_dicts['net'].items(): - name = k[7:] - new_state_dicts[name] = v - model.load_state_dict(new_state_dicts) - - for param in model.parameters(): - param.requires_grad = False - return model - - -def extract_feature(patch_list: list, - pyparams: dict) -> list: - """ - Crop patches and extract features - :param patch_list: a list of cropped images - :param pyparams: parameter dict - :return: a list of features - """ - # Model setup and load pretrained weight - net = models.get_model(model_type=pyparams['model_type'], - model_name=pyparams['model_name'], - num_classes=pyparams['num_class']) - net = load_weights(net, pyparams['weights_datastream']) - net.eval() - - transformer = transformation() - - # Feed forward and extract features - bs = pyparams['batch_size'] - num_batch = int(np.ceil(len(patch_list) / bs)) - feats_list = [] - with config.log_entry_and_exit('forward pass through net'): - for b in range(num_batch): - batch = patch_list[b*bs: b*bs + min(len(patch_list[b*bs:]), bs)] - batch = torch.stack([transformer(i) for i in batch]) - with torch.no_grad(): - features = net.extract_features(batch) - feats_list.extend(features.tolist()) - - return feats_list
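For readers tracking where the deleted `torch_utils.extract_feature()` logic went, here is a minimal standalone sketch of the same batched forward pass. This is an illustration, not spacer API: the name `batched_features` is hypothetical, and `net` is assumed to be any PyTorch module exposing an `extract_features()` method (as the models returned by `spacer.models.get_model` do). Note that the slice `patch_list[b*bs:(b+1)*bs]` is equivalent to the deleted code's more convoluted `patch_list[b*bs: b*bs + min(len(patch_list[b*bs:]), bs)]`, since Python slicing already clamps at the end of the list.

```python
# Hypothetical sketch of the batched feature extraction formerly in
# spacer/torch_utils.py. `net` is assumed to expose extract_features(),
# as spacer's EfficientNet models do; this is not part of the spacer API.
import math

import torch
from torchvision import transforms


def batched_features(patch_list: list, net: torch.nn.Module,
                     batch_size: int) -> list:
    """Extract one feature vector per patch, batch_size patches at a time."""
    to_tensor = transforms.ToTensor()  # HWC uint8 -> CHW float32 in [0, 1]
    net.eval()  # inference mode for dropout/batchnorm, as the deleted code did
    feats_list = []
    for b in range(math.ceil(len(patch_list) / batch_size)):
        # Equivalent to the deleted code's slice; Python clamps the end index.
        batch = patch_list[b * batch_size:(b + 1) * batch_size]
        batch = torch.stack([to_tensor(patch) for patch in batch])
        with torch.no_grad():  # forward pass only; no gradients needed
            feats_list.extend(net.extract_features(batch).tolist())
    return feats_list
```

The weight-loading half of the deleted module (`load_weights`, which stripped the `module.` prefix that `DataParallel` adds to state-dict keys) is orthogonal to this loop and is omitted from the sketch.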