From 1c1914df24c211aea45b42146940c9a7b770cbee Mon Sep 17 00:00:00 2001
From: "Kamil A. Kaczmarek"
Date: Thu, 19 Jul 2018 11:30:18 +0200
Subject: [PATCH] Dev solution 5 (#66)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* added parametrized loss weights
* Update pipelines.py random code was pasted in pipelines
* Update postprocessing.py bug-fix in crop/pad
* Update postprocessing.py
* two_unets
* Update neptune.yaml dropped local data paths
* pull request fixes
* two specialist unets pipeline added
* Update neptune.yaml
* two unets pipeline added
* Update pipeline_config.py added globals for specialists
* corrections in the neptune.yaml
* fixes for unet_specialists
* Improve scoring (#54)
* propose new (hopefully faster) method of counting score
* Update metrics.py
* corrections
* Update callbacks.py hot-fixed averager update bug
* Update utils.py submission generation fix
* Bug fix in pipelines, assertion that checks outputs length added. (#62)
* Bug fix in pipelines, assertion that checks outputs length added.
* assertion message corrected
* corrected order of elements in assertion
* weighted segmentation loss added (#60)
* weighted segmentation loss added
* weighted segmentation loss added
* formatting
* Update neptune.yaml
* Update pipeline_config.py
* Update validation.py
* names refactor
* naming refactor
* Update models.py
* refactor
* removed specialists, dropped contour_touching
* dropped specialists and contour-touching
* Update models.py
* Dev patching (#61)
* init
* added new postpro
* local
* patching works
* added test time augmentation
* cropping bugs fixed
* fixed callbacks volatile error, updated config, dropped debug from main
* dropped loader pickling
* added pad if smaller
* added more augmentation to the patching seq
* added mosaic padding to loaders, updated augmentations
* added dev mode, updated config, added specialists with patching
* fixed mosaic loader bug
* Update main.py dropped debug saving
* updated postprocessing, added fixes to patching
* updated postprocessing, added dev mode, fixed loaders, changed mask preprocessing to get full masks and internal contours
* fixed mosaic for larger patches, adjusted min blob size in postpro
* pipelines with specialists and multi with patching are working, dropped 0 channel load from loaders, minor fixes in loss def
* added small random crop/pads, fixed pipelines for no patching mode, added simple validation mode
* added artifact images to train
* added global seeding
* fixed checkerboard effect
* added normalization
* added blur to augmentations, added wireframe of scaling pipeline, reverted to vanilla postprocessing
* added trainable rescaling loop
* fixed contour regeneration bug
* refactored contour generation, upgraded contour generation in rescaling, cleaned pipelines
* added dev and simple cv models, added caching to inference pipeline
* added stain deconvolution
* fixed image loading for grey images
* fixed normalization of patches
* moved standalone notebooks to dir, dropped specialists, refactored pipelines
* fixed pipelines, updated configs
* added Kaggle notebooks, small refactor in pipelines, preprocessing clean-up
* Update augmentation.py
* corrections in configs
* imports optimized, removed plot_list function from utils.py
* corrections
* bug fix
* added color_seq_RGB
* Update neptune.yaml
* drop_big_artifacts (#67)
* Dev external data (#68)
* init
* added new postpro
* local
* patching works
* added test time augmentation
* cropping bugs fixed
* fixed callbacks volatile error, updated config, dropped debug from main
* dropped loader pickling
* added pad if smaller
* added more augmentation to the patching seq
* added mosaic padding to loaders, updated augmentations
* added dev mode, updated config, added specialists with patching
* fixed mosaic loader bug
* Update main.py dropped debug saving
* updated postprocessing, added fixes to patching
* updated postprocessing, added dev mode, fixed loaders, changed mask preprocessing to get full masks and internal contours
* fixed mosaic for larger patches, adjusted min blob size in postpro
* pipelines with specialists and multi with patching are working, dropped 0 channel load from loaders, minor fixes in loss def
* added small random crop/pads, fixed pipelines for no patching mode, added simple validation mode
* added artifact images to train
* added global seeding
* fixed checkerboard effect
* added normalization
* added blur to augmentations, added wireframe of scaling pipeline, reverted to vanilla postprocessing
* added trainable rescaling loop
* fixed contour regeneration bug
* refactored contour generation, upgraded contour generation in rescaling, cleaned pipelines
* added dev and simple cv models, added caching to inference pipeline
* added stain deconvolution
* fixed image loading for grey images
* fixed normalization of patches
* moved standalone notebooks to dir, dropped specialists, refactored pipelines
* fixed pipelines, updated configs
* added Kaggle notebooks, small refactor in pipelines, preprocessing clean-up
* added generation of metadata and corresponding target masks for external datasets, updated configs
* updated augmentation
* fixed train valid split for vgg clustering version
* fixed train valid split on clusters with external
* optimized imports, dropped plot_list() from utils.py
* added color_seq_RGB
* corrected best_configs
* bug fix
* added dummy load save to base transformer and dropped redundant stuff… (#69)
* added dummy load save to base transformer and dropped redundant stuff, added chunking
* Update postprocessing.py
* Update preprocessing.py
* Dev stage2 (#74)
* added run end to end with configs, added competition_stage parameter
* added postpro dev to pipeline
* Update neptune_rescaled_patched.yaml
* Update neptune_rescaled_patched.yaml
* Update neptune_rescaled_patched.yaml
* Update neptune_size_estimator.yaml
* Update run_end_to_end.sh
---
 augmentation.py                            | 130 ++-
 best_configs/neptune_rescaled_patched.yaml | 117 +++
 .../neptune_size_estimator_training.yaml   | 117 +++
 callbacks.py                               |   9 +-
 .../neptune_rescaled_patched.yaml          | 120 +++
 .../neptune_size_estimator.yaml            | 120 +++
 devbook.ipynb                              | 253 -----
 loaders.py                                 | 230 ++++-
 main.py                                    | 219 ++++-
 metrics.py                                 |  23 +-
 models.py                                  |  27 +-
 neptune.yaml                               |  97 +-
 notebooks/dataset_of_artifacts.ipynb       | 126 +++
 .../morphological_postprocessing.ipynb     |   4 +-
 notebooks/stain_deconvolution.ipynb        | 246 +++++
 notebooks/trainable_rescale.ipynb          | 175 ++++
 pipeline_config.py                         |  82 +-
 pipelines.py                               | 871 ++++++++++++------
 postprocessing.py                          | 269 ++++--
 preparation.py                             | 105 ++-
 preprocessing.py                           | 196 ++++
 run_end_to_end.sh                          |  30 +
 steps/base.py                              |   9 +-
 steps/preprocessing.py                     |  55 --
 steps/pytorch/callbacks.py                 |   2 +
 steps/pytorch/loaders.py                   |   2 +-
 steps/pytorch/models.py                    |  26 +-
 steps/pytorch/utils.py                     |   5 +-
 steps/pytorch/validation.py                |  13 +-
 utils.py                                   |  77 +-
 30 files changed, 2830 insertions(+), 925 deletions(-)
 create mode 100644 best_configs/neptune_rescaled_patched.yaml
 create mode 100644 best_configs/neptune_size_estimator_training.yaml
 create mode
100644 configs_end_to_end/neptune_rescaled_patched.yaml create mode 100644 configs_end_to_end/neptune_size_estimator.yaml delete mode 100644 devbook.ipynb create mode 100644 notebooks/dataset_of_artifacts.ipynb rename kaggle_morhpological_postprocessing.ipynb => notebooks/morphological_postprocessing.ipynb (99%) create mode 100644 notebooks/stain_deconvolution.ipynb create mode 100644 notebooks/trainable_rescale.ipynb create mode 100644 preprocessing.py create mode 100644 run_end_to_end.sh diff --git a/augmentation.py b/augmentation.py index 16f73c3..97e178d 100644 --- a/augmentation.py +++ b/augmentation.py @@ -1,3 +1,4 @@ +import numpy as np from imgaug import augmenters as iaa affine_seq = iaa.Sequential([ @@ -14,23 +15,116 @@ ], random_order=True) color_seq = iaa.Sequential([ - # Color - iaa.OneOf([ - iaa.Sequential([ - iaa.ChangeColorspace(from_colorspace="RGB", to_colorspace="HSV"), - iaa.WithChannels(0, iaa.Add((0, 100))), - iaa.ChangeColorspace(from_colorspace="HSV", to_colorspace="RGB")]), - iaa.Sequential([ - iaa.ChangeColorspace(from_colorspace="RGB", to_colorspace="HSV"), - iaa.WithChannels(1, iaa.Add((0, 100))), - iaa.ChangeColorspace(from_colorspace="HSV", to_colorspace="RGB")]), - iaa.Sequential([ - iaa.ChangeColorspace(from_colorspace="RGB", to_colorspace="HSV"), - iaa.WithChannels(2, iaa.Add((0, 100))), - iaa.ChangeColorspace(from_colorspace="HSV", to_colorspace="RGB")]), - iaa.WithChannels(0, iaa.Add((0, 100))), - iaa.WithChannels(1, iaa.Add((0, 100))), - iaa.WithChannels(2, iaa.Add((0, 100))) - ]) + iaa.Sometimes(0.5, iaa.OneOf([iaa.AverageBlur(k=((5, 11), (5, 11))), + iaa.AdditiveGaussianNoise(scale=0.05 * 255, per_channel=0.5) + ])) ], random_order=True) +color_seq_RGB = iaa.Sequential([ + iaa.SomeOf((1, 2), + [iaa.Sequential([ + iaa.ChangeColorspace(from_colorspace="RGB", to_colorspace="HSV"), + iaa.WithChannels(0, iaa.Add((0, 100))), + iaa.ChangeColorspace(from_colorspace="HSV", to_colorspace="RGB")]), + iaa.Sequential([ + iaa.ChangeColorspace(from_colorspace="RGB", to_colorspace="HSV"), + iaa.WithChannels(1, iaa.Add((0, 100))), + iaa.ChangeColorspace(from_colorspace="HSV", to_colorspace="RGB")]), + iaa.Sequential([ + iaa.ChangeColorspace(from_colorspace="RGB", to_colorspace="HSV"), + iaa.WithChannels(2, iaa.Add((0, 100))), + iaa.ChangeColorspace(from_colorspace="HSV", to_colorspace="RGB")]), + iaa.WithChannels(0, iaa.Add((0, 100))), + iaa.WithChannels(1, iaa.Add((0, 100))), + iaa.WithChannels(2, iaa.Add((0, 100)))] + ), + iaa.Sometimes(0.5, iaa.OneOf([iaa.AverageBlur(k=((5, 11), (5, 11))), + iaa.AdditiveGaussianNoise(scale=0.05 * 255, per_channel=0.5)]) + ) +], random_order=True) + + +def patching_seq(crop_size): + h, w = crop_size + + seq = iaa.Sequential([ + iaa.Affine(rotate=(0, 360)), + CropFixed(px=h), + iaa.Fliplr(0.5), + iaa.Flipud(0.5), + iaa.Sometimes(0.5, iaa.CropAndPad(percent=(-0.1, 0.1), pad_cval=0)), + iaa.Sometimes(0.5, iaa.PiecewiseAffine(scale=(0.02, 0.06))) + ], random_order=False) + return seq + + +class CropFixed(iaa.Augmenter): + def __init__(self, px=None, name=None, deterministic=False, random_state=None): + super(CropFixed, self).__init__(name=name, deterministic=deterministic, random_state=random_state) + self.px = px + + def _augment_images(self, images, random_state, parents, hooks): + + result = [] + seeds = random_state.randint(0, 10 ** 6, (len(images),)) + for i, image in enumerate(images): + seed = seeds[i] + image_cr = self._random_crop_or_pad(seed, image) + result.append(image_cr) + return result + + def _augment_keypoints(self, 
keypoints_on_images, random_state, parents, hooks): + result = [] + return result + + def _random_crop_or_pad(self, seed, image): + height, width = image.shape[:2] + + if height <= self.px and width > self.px: + image_processed = self._random_crop(seed, image, crop_h=False, crop_w=True) + image_processed = self._pad(image_processed) + elif height > self.px and width <= self.px: + image_processed = self._random_crop(seed, image, crop_h=True, crop_w=False) + image_processed = self._pad(image_processed) + elif height <= self.px and width <= self.px: + image_processed = self._pad(image) + else: + image_processed = self._random_crop(seed, image, crop_h=True, crop_w=True) + return image_processed + + def _random_crop(self, seed, image, crop_h=True, crop_w=True): + height, width = image.shape[:2] + + if crop_h: + np.random.seed(seed) + crop_top = np.random.randint(height - self.px) + crop_bottom = crop_top + self.px + else: + crop_top, crop_bottom = (0, height) + + if crop_w: + np.random.seed(seed + 1) + crop_left = np.random.randint(width - self.px) + crop_right = crop_left + self.px + else: + crop_left, crop_right = (0, width) + + if len(image.shape) == 2: + image_cropped = image[crop_top:crop_bottom, crop_left:crop_right] + else: + image_cropped = image[crop_top:crop_bottom, crop_left:crop_right, :] + return image_cropped + + def _pad(self, image): + if len(image.shape) == 2: + height, width = image.shape + image_padded = np.zeros((max(height, self.px), max(width, self.px))).astype(np.uint8) + image_padded[:height, :width] = image + else: + height, width, channels = image.shape + image_padded = np.zeros((max(height, self.px), max(width, self.px), channels)).astype(np.uint8) + image_padded[:height, :width, :] = image + return image_padded + + def get_parameters(self): + return [] diff --git a/best_configs/neptune_rescaled_patched.yaml b/best_configs/neptune_rescaled_patched.yaml new file mode 100644 index 0000000..07dc93a --- /dev/null +++ b/best_configs/neptune_rescaled_patched.yaml @@ -0,0 +1,117 @@ +project-key: DSB + +name: dsb_open_solution +tags: [solution_5] + +metric: + channel: 'Final Validation Score' + goal: maximize + +#Comment out if not in Cloud Environment +pip-requirements-file: requirements.txt + +exclude: + - .git + - .idea + - .ipynb_checkpoints + - output + - imgs + - neptune.log + - offline_job.log + - notebooks + +parameters: +# Cloud Environment + data_dir: /public/dsb_2018_data/ + meta_dir: /public/dsb_2018_data/ + external_data_dirs: /public/dsb_2018_data/external_data/ + masks_overlayed_dir: /public/dsb_2018_data/masks_overlayed/ + contours_overlayed_dir: /public/dsb_2018_data/contours_overlayed/ + centers_overlayed_dir: /public/dsb_2018_data/centers_overlayed/ + experiment_dir: /output/dsb/experiments/ + +# Local Environment +# data_dir: /path/to/data +# meta_dir: /path/to/data +# external_data_dirs: /path/to/external/data +# masks_overlayed_dir: /path/to/masks_overlayed +# contours_overlayed_dir: /path/to/contours_overlayed +# centers_overlayed_dir: /path/to/centers_overlayed +# experiment_dir: /path/to/work/dir + +# General parameters + valid_category_ids: '[0, 1]' + overwrite: 0 + num_workers: 4 + load_in_memory: 1 + pin_memory: 1 + use_patching: 1 + patching_stride: 256 + +# Image parameters (size estimator) + size_estimator__image_h: 512 + size_estimator__image_w: 512 + size_estimator__image_channels: 1 + +# U-Net parameters (size estimator) + size_estimator__nr_unet_outputs: 3 + size_estimator__n_filters: 16 + size_estimator__conv_kernel: 3 + 
size_estimator__pool_kernel: 3 + size_estimator__pool_stride: 2 + size_estimator__repeat_blocks: 4 + +# U-Net loss weights (size estimator) + size_estimator__mask: 0.75 + size_estimator__contour: 1.0 + size_estimator__center: 0.25 + size_estimator__bce_mask: 1.0 + size_estimator__dice_mask: 1.0 + size_estimator__bce_contour: 1.0 + size_estimator__dice_contour: 1.0 + size_estimator__bce_center: 1.0 + size_estimator__dice_center: 1.0 + +# Image parameters (multi-output) + image_h: 512 + image_w: 512 + image_channels: 1 + +# U-Net parameters (multi-output) + nr_unet_outputs: 3 + n_filters: 16 + conv_kernel: 3 + pool_kernel: 3 + pool_stride: 2 + repeat_blocks: 4 + +# U-Net loss weights (multi-output) + mask: 0.75 + contour: 1.0 + center: 0.25 + bce_mask: 1.0 + dice_mask: 1.0 + bce_contour: 1.0 + dice_contour: 1.0 + bce_center: 1.0 + dice_center: 1.0 + +# Training schedule + epochs_nr: 1000 + batch_size_train: 4 + batch_size_inference: 4 + lr: 0.0002 + momentum: 0.9 + gamma: 1.0 + patience: 50 + +# Regularization + use_batch_norm: 1 + l2_reg_conv: 0.00005 + l2_reg_dense: 0.0 + dropout_conv: 0.1 + dropout_dense: 0.0 + +# Postprocessing + threshold: 0.5 + min_nuclei_size: 20 diff --git a/best_configs/neptune_size_estimator_training.yaml b/best_configs/neptune_size_estimator_training.yaml new file mode 100644 index 0000000..4e1f5d8 --- /dev/null +++ b/best_configs/neptune_size_estimator_training.yaml @@ -0,0 +1,117 @@ +project-key: DSB + +name: dsb_open_solution +tags: [solution_5] + +metric: + channel: 'Final Validation Score' + goal: maximize + +#Comment out if not in Cloud Environment +pip-requirements-file: requirements.txt + +exclude: + - .git + - .idea + - .ipynb_checkpoints + - output + - imgs + - neptune.log + - offline_job.log + - notebooks + +parameters: +# Cloud Environment + data_dir: /public/dsb_2018_data/ + meta_dir: /public/dsb_2018_data/ + external_data_dirs: /public/dsb_2018_data/external_data/ + masks_overlayed_dir: /public/dsb_2018_data/masks_overlayed/ + contours_overlayed_dir: /public/dsb_2018_data/contours_overlayed/ + centers_overlayed_dir: /public/dsb_2018_data/centers_overlayed/ + experiment_dir: /output/dsb/experiments/ + +# Local Environment +# data_dir: /path/to/data +# meta_dir: /path/to/data +# external_data_dirs: /path/to/external/data +# masks_overlayed_dir: /path/to/masks_overlayed +# contours_overlayed_dir: /path/to/contours_overlayed +# centers_overlayed_dir: /path/to/centers_overlayed +# experiment_dir: /path/to/work/dir + +# General parameters + valid_category_ids: '[0, 1]' + overwrite: 1 + num_workers: 4 + load_in_memory: 1 + pin_memory: 1 + use_patching: 1 + patching_stride: 256 + +# Image parameters (size estimator) + size_estimator__image_h: 512 + size_estimator__image_w: 512 + size_estimator__image_channels: 1 + +# U-Net parameters (size estimator) + size_estimator__nr_unet_outputs: 3 + size_estimator__n_filters: 16 + size_estimator__conv_kernel: 3 + size_estimator__pool_kernel: 3 + size_estimator__pool_stride: 2 + size_estimator__repeat_blocks: 4 + +# U-Net loss weights (size estimator) + size_estimator__mask: 0.75 + size_estimator__contour: 1.0 + size_estimator__center: 0.25 + size_estimator__bce_mask: 1.0 + size_estimator__dice_mask: 1.0 + size_estimator__bce_contour: 1.0 + size_estimator__dice_contour: 1.0 + size_estimator__bce_center: 1.0 + size_estimator__dice_center: 1.0 + +# Image parameters (multi-output) + image_h: 512 + image_w: 512 + image_channels: 1 + +# U-Net parameters (multi-output) + nr_unet_outputs: 3 + n_filters: 16 + conv_kernel: 3 + 
pool_kernel: 3 + pool_stride: 2 + repeat_blocks: 4 + +# U-Net loss weights (multi-output) + mask: 0.75 + contour: 1.0 + center: 0.25 + bce_mask: 1.0 + dice_mask: 1.0 + bce_contour: 1.0 + dice_contour: 1.0 + bce_center: 1.0 + dice_center: 1.0 + +# Training schedule + epochs_nr: 1000 + batch_size_train: 4 + batch_size_inference: 4 + lr: 0.0002 + momentum: 0.9 + gamma: 1.0 + patience: 50 + +# Regularization + use_batch_norm: 1 + l2_reg_conv: 0.00005 + l2_reg_dense: 0.0 + dropout_conv: 0.1 + dropout_dense: 0.0 + +# Postprocessing + threshold: 0.5 + min_nuclei_size: 20 diff --git a/callbacks.py b/callbacks.py index 3887264..ea54ea9 100644 --- a/callbacks.py +++ b/callbacks.py @@ -1,9 +1,8 @@ -from PIL import Image import numpy as np import torch -from torch.autograd import Variable +from PIL import Image from deepsense import neptune - +from torch.autograd import Variable from steps.pytorch.callbacks import NeptuneMonitor from utils import sigmoid @@ -56,9 +55,9 @@ def get_prediction_masks(self): targets_tensors = data[1:] if torch.cuda.is_available(): - X = Variable(X).cuda() + X = Variable(X, volatile=True).cuda() else: - X = Variable(X) + X = Variable(X, volatile=True) outputs_batch = self.model(X) if len(outputs_batch) == len(self.output_names): diff --git a/configs_end_to_end/neptune_rescaled_patched.yaml b/configs_end_to_end/neptune_rescaled_patched.yaml new file mode 100644 index 0000000..ed976d0 --- /dev/null +++ b/configs_end_to_end/neptune_rescaled_patched.yaml @@ -0,0 +1,120 @@ +project-key: DSB + +name: dsb_open_solution +tags: [solution_5, stage2] + +metric: + channel: 'Final Validation Score' + goal: maximize + +#Comment out if not in Cloud Environment +pip-requirements-file: requirements.txt + +exclude: + - .git + - .idea + - .ipynb_checkpoints + - output + - imgs + - neptune.log + - offline_job.log + - notebooks + +parameters: +# Cloud Environment + data_dir: /public/dsb_2018_data/ + meta_dir: /public/dsb_2018_data/ + external_data_dirs: /public/dsb_2018_data/external_data/ + masks_overlayed_dir: /public/dsb_2018_data/masks_overlayed/ + contours_overlayed_dir: /public/dsb_2018_data/contours_overlayed/ + centers_overlayed_dir: /public/dsb_2018_data/centers_overlayed/ + experiment_dir: /output/dsb/experiments/ + +# Local Environment +# data_dir: /path/to/data +# meta_dir: /path/to/data +# external_data_dirs: /path/to/external/data +# masks_overlayed_dir: /path/to/masks_overlayed +# contours_overlayed_dir: /path/to/contours_overlayed +# centers_overlayed_dir: /path/to/centers_overlayed +# experiment_dir: /path/to/work/dir + +# General parameters + competition_stage: 2 + valid_category_ids: '[0, 1]' + overwrite: 0 + num_workers: 4 + load_in_memory: 1 + pin_memory: 1 + use_patching: 1 + patching_stride: 256 + +# Image parameters (size estimator) + size_estimator__image_h: 512 + size_estimator__image_w: 512 + size_estimator__image_channels: 1 + +# U-Net parameters (size estimator) + size_estimator__nr_unet_outputs: 3 + size_estimator__n_filters: 16 + size_estimator__conv_kernel: 3 + size_estimator__pool_kernel: 3 + size_estimator__pool_stride: 2 + size_estimator__repeat_blocks: 4 + +# U-Net loss weights (size estimator) + size_estimator__mask: 0.75 + size_estimator__contour: 1.0 + size_estimator__contour_touching: 0.0 + size_estimator__center: 0.25 + size_estimator__bce_mask: 1.0 + size_estimator__dice_mask: 1.0 + size_estimator__bce_contour: 1.0 + size_estimator__dice_contour: 1.0 + size_estimator__bce_center: 1.0 + size_estimator__dice_center: 1.0 + +# Image parameters 
(multi-output) + image_h: 512 + image_w: 512 + image_channels: 1 + +# U-Net parameters (multi-output) + nr_unet_outputs: 3 + n_filters: 16 + conv_kernel: 3 + pool_kernel: 3 + pool_stride: 2 + repeat_blocks: 4 + +# U-Net loss weights (multi-output) + mask: 0.75 + contour: 1.0 + contour_touching: 0.0 + center: 0.25 + bce_mask: 1.0 + dice_mask: 1.0 + bce_contour: 1.0 + dice_contour: 1.0 + bce_center: 1.0 + dice_center: 1.0 + +# Training schedule + epochs_nr: 1000 + batch_size_train: 4 + batch_size_inference: 4 + lr: 0.0001 + momentum: 0.9 + gamma: 0.99 + patience: 50 + +# Regularization + use_batch_norm: 1 + l2_reg_conv: 0.0001 + l2_reg_dense: 0.0 + dropout_conv: 0.1 + dropout_dense: 0.0 + +# Postprocessing + threshold: 0.5 + min_nuclei_size: 20 diff --git a/configs_end_to_end/neptune_size_estimator.yaml b/configs_end_to_end/neptune_size_estimator.yaml new file mode 100644 index 0000000..b583426 --- /dev/null +++ b/configs_end_to_end/neptune_size_estimator.yaml @@ -0,0 +1,120 @@ +project-key: DSB + +name: dsb_open_solution +tags: [solution_5, stage2] + +metric: + channel: 'Final Validation Score' + goal: maximize + +#Comment out if not in Cloud Environment +pip-requirements-file: requirements.txt + +exclude: + - .git + - .idea + - .ipynb_checkpoints + - output + - imgs + - neptune.log + - offline_job.log + - notebooks + +parameters: +# Cloud Environment + data_dir: /public/dsb_2018_data/ + meta_dir: /public/dsb_2018_data/ + external_data_dirs: /public/dsb_2018_data/external_data/ + masks_overlayed_dir: /public/dsb_2018_data/masks_overlayed/ + contours_overlayed_dir: /public/dsb_2018_data/contours_overlayed/ + centers_overlayed_dir: /public/dsb_2018_data/centers_overlayed/ + experiment_dir: /output/dsb/experiments/ + +# Local Environment +# data_dir: /path/to/data +# meta_dir: /path/to/data +# external_data_dirs: /path/to/external/data +# masks_overlayed_dir: /path/to/masks_overlayed +# contours_overlayed_dir: /path/to/contours_overlayed +# centers_overlayed_dir: /path/to/centers_overlayed +# experiment_dir: /path/to/work/dir + +# General parameters + competition_stage: 2 + valid_category_ids: '[0, 1]' + overwrite: 0 + num_workers: 4 + load_in_memory: 1 + pin_memory: 1 + use_patching: 0 + patching_stride: 256 + +# Image parameters (size estimator) + size_estimator__image_h: 512 + size_estimator__image_w: 512 + size_estimator__image_channels: 1 + +# U-Net parameters (size estimator) + size_estimator__nr_unet_outputs: 3 + size_estimator__n_filters: 16 + size_estimator__conv_kernel: 3 + size_estimator__pool_kernel: 3 + size_estimator__pool_stride: 2 + size_estimator__repeat_blocks: 4 + +# U-Net loss weights (size estimator) + size_estimator__mask: 0.75 + size_estimator__contour: 1.0 + size_estimator__contour_touching: 0.0 + size_estimator__center: 0.25 + size_estimator__bce_mask: 1.0 + size_estimator__dice_mask: 1.0 + size_estimator__bce_contour: 1.0 + size_estimator__dice_contour: 1.0 + size_estimator__bce_center: 1.0 + size_estimator__dice_center: 1.0 + +# Image parameters (multi-output) + image_h: 512 + image_w: 512 + image_channels: 1 + +# U-Net parameters (multi-output) + nr_unet_outputs: 3 + n_filters: 16 + conv_kernel: 3 + pool_kernel: 3 + pool_stride: 2 + repeat_blocks: 4 + +# U-Net loss weights (multi-output) + mask: 0.75 + contour: 1.0 + contour_touching: 0.0 + center: 0.25 + bce_mask: 1.0 + dice_mask: 1.0 + bce_contour: 1.0 + dice_contour: 1.0 + bce_center: 1.0 + dice_center: 1.0 + +# Training schedule + epochs_nr: 1000 + batch_size_train: 4 + batch_size_inference: 4 + lr: 0.0001 + 
momentum: 0.9 + gamma: 0.99 + patience: 50 + +# Regularization + use_batch_norm: 1 + l2_reg_conv: 0.0001 + l2_reg_dense: 0.0 + dropout_conv: 0.1 + dropout_dense: 0.0 + +# Postprocessing + threshold: 0.5 + min_nuclei_size: 20 diff --git a/devbook.ipynb b/devbook.ipynb deleted file mode 100644 index 4281f9c..0000000 --- a/devbook.ipynb +++ /dev/null @@ -1,253 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%matplotlib inline\n", - "\n", - "import os\n", - "\n", - "import ipywidgets as ipy\n", - "import matplotlib.pyplot as plt\n", - "import numpy as np\n", - "import pandas as pd\n", - "from sklearn.externals import joblib\n", - "from skimage.color import label2rgb\n", - "\n", - "DATA_DIR = '/public/dsb_2018_data'\n", - "PREDICTION_TRAIN = 'predictions/unet/train/outputs'\n", - "PREDICTION_VALID = 'predictions/unet/valid/outputs'\n", - "PREDICTION_TEST = 'predictions/unet/test/outputs'\n", - "\n", - "SUBMISSION_PATH = os.path.join('/output/submission.csv')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "meta = pd.read_csv(os.path.join(DATA_DIR,'stage1_metadata.csv'))\n", - "meta_train = meta[meta['is_train']==1 & (~meta['vgg_features_clusters'].isin([0,1]))]\n", - "meta_valid = meta[meta['is_train']==1 & (meta['vgg_features_clusters'].isin([0,1]))]\n", - "meta_test = meta[meta['is_train']==0]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def plot_with_labels(mask, contour, postprocessed, label, idx):\n", - " plt.figure(figsize=(16,12))\n", - " plt.subplot(141)\n", - " plt.imshow(mask[idx])\n", - " plt.subplot(142)\n", - " plt.imshow(contour[idx])\n", - " plt.subplot(143)\n", - " plt.imshow(label2rgb(postprocessed[idx]))\n", - " plt.subplot(144)\n", - " plt.imshow(label2rgb(label[idx]))\n", - " plt.show()\n", - " \n", - "def plot_predictions(mask, contour, postprocessed, idx):\n", - " plt.figure(figsize=(16,12))\n", - " plt.subplot(131)\n", - " plt.imshow(mask[idx])\n", - " plt.subplot(132)\n", - " plt.imshow(contour[idx])\n", - " plt.subplot(133)\n", - " plt.imshow(label2rgb(postprocessed[idx]))\n", - " plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Predictions and labels on train" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "labels_train = joblib.load(os.path.join(DATA_DIR,'ground_truth','train','labels.pkl'))\n", - "masks_train = joblib.load(os.path.join(DATA_DIR,PREDICTION_TRAIN,'unet_multitask'))['mask_prediction']\n", - "contours_train = joblib.load(os.path.join(DATA_DIR,PREDICTION_TRAIN,'unet_multitask'))['contour_prediction']\n", - "postprocessed_train = joblib.load(os.path.join(DATA_DIR,PREDICTION_TRAIN,'binary_fill'))['filled_images']" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ipy.interact(plot_with_labels, mask = ipy.fixed(masks_train),\n", - " contour = ipy.fixed(contours_train),\n", - " postprocessed = ipy.fixed(postprocessed_train),\n", - " label = ipy.fixed(labels_train),\n", - " idx = ipy.IntSlider(min=0, max=50, value=0, step=1))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Predictions and labels on valid" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "labels_valid = 
joblib.load(os.path.join(DATA_DIR,'ground_truth','valid','labels.pkl'))\n", - "masks_valid = joblib.load(os.path.join(DATA_DIR,PREDICTION_VALID,'unet_multitask'))['mask_prediction']\n", - "contours_valid = joblib.load(os.path.join(DATA_DIR,PREDICTION_VALID,'unet_multitask'))['contour_prediction']\n", - "postprocessed_valid = joblib.load(os.path.join(DATA_DIR,PREDICTION_VALID,'binary_fill'))['filled_images']" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ipy.interact(plot_with_labels, mask = ipy.fixed(masks_valid),\n", - " contour = ipy.fixed(contours_valid),\n", - " postprocessed = ipy.fixed(postprocessed_valid),\n", - " label = ipy.fixed(labels_valid),\n", - " idx = ipy.IntSlider(min=0, max=50, value=0, step=1))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Predictions and submission on test" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "masks_test = joblib.load(os.path.join(DATA_DIR,PREDICTION_TEST,'unet_multitask'))['mask_prediction']\n", - "contours_test = joblib.load(os.path.join(DATA_DIR,PREDICTION_TEST,'unet_multitask'))['contour_prediction']\n", - "postprocessed_test = joblib.load(os.path.join(DATA_DIR,PREDICTION_TEST,'binary_fill'))['filled_images']" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ipy.interact(plot_predictions, mask = ipy.fixed(masks_test),\n", - " contour = ipy.fixed(contours_test),\n", - " postprocessed = ipy.fixed(postprocessed_test),\n", - " idx = ipy.IntSlider(min=0, max=50, value=0, step=1))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def decompose(labeled):\n", - "\n", - " nr_true = labeled.max()\n", - " masks = []\n", - " for i in range(1, nr_true + 1):\n", - " msk = labeled.copy()\n", - " msk[msk != i] = 0.\n", - " msk[msk == i] = 255.\n", - " masks.append(msk)\n", - "\n", - " if not masks:\n", - " return [labeled]\n", - " else:\n", - " return masks\n", - "\n", - "def run_length_encoding(x):\n", - " dots = np.where(x.T.flatten() == 1)[0] \n", - " run_lengths = []\n", - " prev = -2\n", - " for b in dots:\n", - " if (b > prev + 1): run_lengths.extend((b + 1, 0))\n", - " run_lengths[-1] += 1\n", - " prev = b\n", - " return run_lengths\n", - " \n", - "def generate_submission(predictions, meta):\n", - " image_ids, encodings = [], []\n", - " for image_id, prediction in zip(meta['ImageId'].values, predictions):\n", - " for mask in decompose(prediction):\n", - " image_ids.append(image_id)\n", - " encodings.append(' '.join(str(rle) for rle in run_length_encoding(mask > 128.)))\n", - "\n", - " submission = pd.DataFrame({'ImageId': image_ids, 'EncodedPixels': encodings})\n", - " return submission" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "submission = generate_submission(postprocessed_test, meta_test)\n", - "submission.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "submission.to_csv(SUBMISSION_PATH, index=None)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "dl_py3", - "language": "python", - "name": "dl_py3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 
3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.5.2" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/loaders.py b/loaders.py index a93a2eb..222f864 100644 --- a/loaders.py +++ b/loaders.py @@ -1,12 +1,17 @@ +import math +from itertools import product + import numpy as np +import pandas as pd import torch import torchvision.transforms as transforms from PIL import Image from attrdict import AttrDict +from skimage.transform import resize from sklearn.externals import joblib from torch.utils.data import Dataset, DataLoader -from augmentation import affine_seq, color_seq +from augmentation import affine_seq, color_seq, patching_seq from steps.base import BaseTransformer from steps.pytorch.utils import ImgAug from utils import from_pil, to_pil @@ -208,18 +213,16 @@ def __getitem__(self, index): return Xi -class MetadataImageSegmentationLoader(BaseTransformer): +class ImageSegmentationLoaderBasic(BaseTransformer): def __init__(self, loader_params, dataset_params): super().__init__() self.loader_params = AttrDict(loader_params) self.dataset_params = AttrDict(dataset_params) - self.dataset = MetadataImageSegmentationDataset self.image_transform = transforms.Compose([transforms.Resize((self.dataset_params.h, self.dataset_params.w)), transforms.ToTensor(), - transforms.Normalize(mean=[0.5, 0.5, 0.5], - std=[0.2, 0.2, 0.2]), + transforms.Normalize(mean=[0.11], std=[0.09]), ]) self.mask_transform = transforms.Compose([transforms.Resize((self.dataset_params.h, self.dataset_params.w)), @@ -229,6 +232,8 @@ def __init__(self, loader_params, dataset_params): self.image_augment_with_target = ImgAug(affine_seq) self.image_augment = ImgAug(color_seq) + self.dataset = None + def transform(self, X, y, X_valid=None, y_valid=None, train_mode=True): if train_mode and y is not None: flow, steps = self.get_datagen(X, y, True, self.loader_params.training) @@ -263,34 +268,158 @@ def get_datagen(self, X, y, train_mode, loader_params): steps = len(datagen) return datagen, steps - def load(self, filepath): - params = joblib.load(filepath) - self.loader_params = params['loader_params'] - return self - def save(self, filepath): - params = {'loader_params': self.loader_params} - joblib.dump(params, filepath) +class ImageSegmentationLoaderPatchingTrain(ImageSegmentationLoaderBasic): + def __init__(self, loader_params, dataset_params): + super().__init__(loader_params, dataset_params) + + self.image_augment_with_target = ImgAug(patching_seq(crop_size=(self.dataset_params.h, + self.dataset_params.w))) + self.image_augment = ImgAug(color_seq) + self.dataset = None + + +class ImageSegmentationLoaderPatchingInference(ImageSegmentationLoaderBasic): + def __init__(self, loader_params, dataset_params): + super().__init__(loader_params, dataset_params) + + self.image_augment_with_target = ImgAug(patching_seq(crop_size=(self.dataset_params.h, + self.dataset_params.w))) + self.image_augment = ImgAug(color_seq) + + self.dataset = None + + def transform(self, X, y, X_valid=None, y_valid=None, train_mode=True): + X, patch_ids = self.get_patches(X) + + flow, steps = self.get_datagen(X, None, False, self.loader_params.inference) + valid_flow = None + valid_steps = None + return {'datagen': (flow, steps), + 'patch_ids': patch_ids, + 'validation_datagen': (valid_flow, valid_steps)} + + def get_datagen(self, X, y, train_mode, loader_params): + dataset = self.dataset(X, None, + train_mode=False, + 
image_augment=None, + image_augment_with_target=None, + mask_transform=self.mask_transform, + image_transform=self.image_transform) + + datagen = DataLoader(dataset, **loader_params) + steps = len(datagen) + return datagen, steps -class MetadataImageSegmentationMultitaskLoader(MetadataImageSegmentationLoader): + def get_patches(self, X): + patches, patch_ids, tta_angles, patch_y_coords, patch_x_coords, image_h, image_w = [], [], [], [], [], [], [] + for i, image in enumerate((X[0])): + image = from_pil(image) + h, w = image.shape[:2] + for y_coord, x_coord, image_patch in generate_patches(image, self.dataset_params.h, + self.dataset_params.patching_stride): + for tta_rotation_angle, image_patch_tta in test_time_augmentation(image_patch): + image_patch_tta = to_pil(image_patch_tta) + patches.append(image_patch_tta) + patch_ids.append(i) + tta_angles.append(tta_rotation_angle) + patch_y_coords.append(y_coord) + patch_x_coords.append(x_coord) + image_h.append(h) + image_w.append(w) + + patch_ids = pd.DataFrame({'patch_ids': patch_ids, + 'tta_angles': tta_angles, + 'y_coordinates': patch_y_coords, + 'x_coordinates': patch_x_coords, + 'image_h': image_h, + 'image_w': image_w}) + return [patches], patch_ids + + +class MetadataImageSegmentationLoader(ImageSegmentationLoaderBasic): + def __init__(self, loader_params, dataset_params): + super().__init__(loader_params, dataset_params) + self.dataset = MetadataImageSegmentationDataset + + +class MetadataImageSegmentationMultitaskLoader(ImageSegmentationLoaderBasic): def __init__(self, loader_params, dataset_params): super().__init__(loader_params, dataset_params) self.dataset = MetadataImageSegmentationMultitaskDataset -class ImageSegmentationLoader(MetadataImageSegmentationLoader): +class ImageSegmentationLoader(ImageSegmentationLoaderBasic): def __init__(self, loader_params, dataset_params): super().__init__(loader_params, dataset_params) self.dataset = ImageSegmentationDataset -class ImageSegmentationMultitaskLoader(MetadataImageSegmentationLoader): +class ImageSegmentationMultitaskLoader(ImageSegmentationLoaderBasic): + def __init__(self, loader_params, dataset_params): + super().__init__(loader_params, dataset_params) + self.dataset = ImageSegmentationMultitaskDataset + + +class ImageSegmentationMultitaskLoaderPatchingTrain(ImageSegmentationLoaderPatchingTrain): + def __init__(self, loader_params, dataset_params): + super().__init__(loader_params, dataset_params) + self.dataset = ImageSegmentationMultitaskDataset + + +class ImageSegmentationMultitaskLoaderPatchingInference(ImageSegmentationLoaderPatchingInference): def __init__(self, loader_params, dataset_params): super().__init__(loader_params, dataset_params) self.dataset = ImageSegmentationMultitaskDataset +class PatchCombiner(BaseTransformer): + def __init__(self, patching_size, patching_stride): + super().__init__() + self.patching_size = patching_size + self.patching_stride = patching_stride + self.tta_factor = 4 + + @property + def normalization_factor(self): + return self.tta_factor * int(self.patching_size / self.patching_stride) ** 2 + + def transform(self, outputs, patch_ids): + combined_outputs = {} + for name, output in outputs.items(): + for patch_id in patch_ids['patch_ids'].unique(): + patch_meta = patch_ids[patch_ids['patch_ids'] == patch_id] + image_patches = output[patch_meta.index] + combined_outputs.setdefault(name, []).append(self._join_output(patch_meta, image_patches)) + return combined_outputs + + def _join_output(self, patch_meta, image_patches): + image_h = 
patch_meta['image_h'].unique()[0] + image_w = patch_meta['image_w'].unique()[0] + prediction_image = np.zeros((image_h, image_w)) + prediction_image_padded = get_mosaic_padded_image(prediction_image, self.patching_size, self.patching_stride) + + patches_per_image = 0 + for (y_coordinate, x_coordinate, tta_angle), image_patch in zip( + patch_meta[['y_coordinates', 'x_coordinates', 'tta_angles']].values.tolist(), image_patches): + patches_per_image += 1 + image_patch = np.rot90(image_patch, -1 * tta_angle / 90.) + window_y, window_x = y_coordinate * self.patching_stride, x_coordinate * self.patching_stride + prediction_image_padded[window_y:self.patching_size + window_y, + window_x:self.patching_size + window_x] += image_patch + + _, h_top, h_bottom, _ = get_padded_size(max(image_h, self.patching_size), + self.patching_size, + self.patching_stride) + _, w_left, w_right, _ = get_padded_size(max(image_w, self.patching_size), + self.patching_size, + self.patching_stride) + + prediction_image = prediction_image_padded[h_top:-h_bottom, w_left:-w_right] + prediction_image /= self.normalization_factor + return prediction_image + def binarize(x): x_ = x.convert('L') # convert image to monochrome x_ = np.array(x_) @@ -302,3 +431,74 @@ def to_tensor(x): x_ = np.expand_dims(x, axis=0) x_ = torch.from_numpy(x_) return x_ + + +def test_time_augmentation(img): + for i in range(4): + yield i * 90, np.rot90(img, i) + + +def generate_patches(img, patch_size, patch_stride): + img_padded = get_mosaic_padded_image(img, patch_size, patch_stride) + h_pad, w_pad = img_padded.shape[:2] + + h_patch_nr = math.ceil(h_pad / patch_stride) - math.floor(patch_size / patch_stride) + w_patch_nr = math.ceil(w_pad / patch_stride) - math.floor(patch_size / patch_stride) + + for y_coordinate, x_coordinate in product(range(h_patch_nr), range(w_patch_nr)): + if len(img.shape) == 2: + img_patch = img_padded[y_coordinate * patch_stride:y_coordinate * patch_stride + patch_size, + x_coordinate * patch_stride:x_coordinate * patch_stride + patch_size] + else: + img_patch = img_padded[y_coordinate * patch_stride:y_coordinate * patch_stride + patch_size, + x_coordinate * patch_stride:x_coordinate * patch_stride + patch_size, :] + yield y_coordinate, x_coordinate, img_patch + + +def get_mosaic_padded_image(img, patch_size, patch_stride): + if len(img.shape) == 2: + h_, w_ = img.shape + c = 1 + img = np.expand_dims(img, axis=2) + squeeze_output = True + else: + h_, w_, c = img.shape + squeeze_output = False + + h, w = (max(h_, patch_size), max(w_, patch_size)) + if h > h_ or w > w_: + img = resize(img, (h, w), preserve_range=True) + + h_pad, h_pad_top, h_pad_bottom, h_pad_end = get_padded_size(h, patch_size, patch_stride) + w_pad, w_pad_left, w_pad_right, w_pad_end = get_padded_size(w, patch_size, patch_stride) + + img_padded = np.zeros((h_pad, w_pad, c)) + img_padded[h_pad_top:-h_pad_bottom, w_pad_left:-w_pad_right, :] = img + + img_padded[h_pad_top:-h_pad_bottom, :w_pad_left, :] = np.fliplr(img[:, :w_pad_left, :]) + img_padded[:h_pad_top, w_pad_left:-w_pad_right, :] = np.flipud(img[:h_pad_top, :, :]) + + img_padded[h_pad_top:-h_pad_bottom, -w_pad_right:-w_pad_right + w_pad_end, :] = np.fliplr( + img[:, -w_pad_right:-w_pad_right + w_pad_end, :]) + img_padded[-h_pad_bottom:-h_pad_bottom + h_pad_end, w_pad_left:-w_pad_right, :] = np.flipud( + img[-h_pad_bottom:-h_pad_bottom + h_pad_end, :, :]) + + if squeeze_output: + img_padded = np.squeeze(img_padded) + + return img_padded + + +def get_padded_size(img_size, patch_size, patch_stride): + 
min_image_size = img_size + 2 * patch_size + for img_size_padded in range(img_size, 6 * img_size, 1): + if (img_size_padded - patch_size) % patch_stride == 0 and img_size_padded >= min_image_size: + break + + diff = img_size_padded - img_size + pad_down, pad_up = patch_size, diff - patch_size + if pad_up > patch_size and img_size < patch_size: + pad_end = patch_size + else: + pad_end = pad_up + return img_size_padded, pad_down, pad_up, pad_end diff --git a/main.py b/main.py index 1b1f747..35c54cb 100644 --- a/main.py +++ b/main.py @@ -1,7 +1,11 @@ import os import shutil +from multiprocessing import set_start_method + +set_start_method('spawn') import click +import glob import pandas as pd from deepsense import neptune @@ -9,12 +13,15 @@ from pipeline_config import SOLUTION_CONFIG, Y_COLUMNS_SCORING, SIZE_COLUMNS from pipelines import PIPELINES from preparation import train_valid_split, overlay_masks, overlay_contours, overlay_centers, get_vgg_clusters -from utils import get_logger, read_masks, read_params, create_submission, generate_metadata +from utils import init_logger, read_masks, read_params, create_submission, generate_metadata, set_seed, \ + generate_data_frame_chunks -logger = get_logger() +logger = init_logger() ctx = neptune.Context() params = read_params(ctx) +set_seed(1234) + @click.group() def action(): @@ -22,51 +29,96 @@ def action(): @action.command() -def prepare_metadata(): +@click.option('-g', '--calculate_vgg_clusters', help='whether vgg clusters should be created', is_flag=True, + required=False) +@click.option('-tr', '--train_data', help='calculate for train data', is_flag=True, required=False) +@click.option('-te', '--test_data', help='calculate for train data', is_flag=True, required=False) +def prepare_metadata(calculate_vgg_clusters,train_data, test_data): logger.info('creating metadata') meta = generate_metadata(data_dir=params.data_dir, masks_overlayed_dir=params.masks_overlayed_dir, contours_overlayed_dir=params.contours_overlayed_dir, - contours_touching_overlayed_dir = params.contours_touching_overlayed_dir, - centers_overlayed_dir=params.centers_overlayed_dir) - logger.info('calculating clusters') - - meta_train = meta[meta['is_train'] == 1] - meta_test = meta[meta['is_train'] == 0] - vgg_features_clusters = get_vgg_clusters(meta_train) - meta_train['vgg_features_clusters'] = vgg_features_clusters - meta_test['vgg_features_clusters'] = 'NaN' - meta = pd.concat([meta_train, meta_test], axis=0) - meta.to_csv(os.path.join(params.meta_dir, 'stage1_metadata.csv'), index=None) + centers_overlayed_dir=params.centers_overlayed_dir, + competition_stage=params.competition_stage, + process_train_data=train_data, + process_test_data=test_data) + meta['is_external'] = 0 + + if train_data: + for external_data_dir in glob.glob('{}/*'.format(params.external_data_dirs)): + logger.info('adding external metadata for {}'.format(external_data_dir)) + meta_external = generate_metadata(data_dir=external_data_dir, + masks_overlayed_dir=params.masks_overlayed_dir, + contours_overlayed_dir=params.contours_overlayed_dir, + centers_overlayed_dir=params.centers_overlayed_dir, + competition_stage=params.competition_stage, + process_train_data=train_data, + process_test_data=False) + meta_external['is_external'] = 1 + meta = pd.concat([meta, meta_external], axis=0) + + if calculate_vgg_clusters: + logger.info('calculating clusters') + meta_train = meta[meta['is_train'] == 1] + meta_test = meta[meta['is_train'] == 0] + + vgg_features_clusters = get_vgg_clusters(meta_train) + 
meta_train['vgg_features_clusters'] = vgg_features_clusters + meta_test['vgg_features_clusters'] = 'NaN' + meta = pd.concat([meta_train, meta_test], axis=0) + + metadata_filepath = os.path.join(params.meta_dir, 'stage{}_metadata.csv').format(params.competition_stage) + logger.info('saving metadata to {}'.format(metadata_filepath)) + meta.to_csv(metadata_filepath, index=None) @action.command() def prepare_masks(): - logger.info('overlaying masks') - overlay_masks(images_dir=params.data_dir, subdir_name='stage1_train', target_dir=params.masks_overlayed_dir) - logger.info('overlaying contours') - overlay_contours(images_dir=params.data_dir, subdir_name='stage1_train', target_dir=params.contours_overlayed_dir) - overlay_contours(images_dir=params.data_dir, subdir_name='stage1_train', - target_dir=params.contours_touching_overlayed_dir, touching_only=True) - logger.info('overlaying centers') - overlay_centers(images_dir=params.data_dir, subdir_name='stage1_train', target_dir=params.centers_overlayed_dir) + official_data_dir = params.data_dir + external_data_dirs = glob.glob('{}/*'.format(params.external_data_dirs)) + all_data_dirs = external_data_dirs + [official_data_dir] + for data_dir in all_data_dirs: + logger.info('processing directory {}'.format(data_dir)) + logger.info('overlaying masks') + overlay_masks(images_dir=data_dir, + subdir_name='stage{}_train'.format(params.competition_stage), + target_dir=params.masks_overlayed_dir) + logger.info('overlaying contours') + overlay_contours(images_dir=data_dir, + subdir_name='stage{}_train'.format(params.competition_stage), + target_dir=params.contours_overlayed_dir) + logger.info('overlaying centers') + overlay_centers(images_dir=data_dir, + subdir_name='stage{}_train'.format(params.competition_stage), + target_dir=params.centers_overlayed_dir) @action.command() @click.option('-p', '--pipeline_name', help='pipeline to be trained', required=True) -@click.option('-v', '--validation_size', help='percentage of training used for validation', default=0.2, required=False) -def train_pipeline(pipeline_name, validation_size): - _train_pipeline(pipeline_name, validation_size) +@click.option('-v', '--validation_size', help='percentage of training used for validation', default=0.1, required=False) +@click.option('-d', '--dev_mode', help='if true only a small sample of data will be used', is_flag=True, required=False) +@click.option('-s', '--simple_cv', help='use simple train test split', is_flag=True, required=False) +def train_pipeline(pipeline_name, validation_size, dev_mode, simple_cv): + _train_pipeline(pipeline_name, validation_size, dev_mode, simple_cv) -def _train_pipeline(pipeline_name, validation_size): +def _train_pipeline(pipeline_name, validation_size, dev_mode, simple_cv): if bool(params.overwrite) and os.path.isdir(params.experiment_dir): shutil.rmtree(params.experiment_dir) - meta = pd.read_csv(os.path.join(params.meta_dir, 'stage1_metadata.csv')) + meta = pd.read_csv(os.path.join(params.meta_dir, 'stage{}_metadata.csv'.format(params.competition_stage))) meta_train = meta[meta['is_train'] == 1] valid_ids = eval(params.valid_category_ids) - meta_train_split, meta_valid_split = train_valid_split(meta_train, validation_size, valid_category_ids=valid_ids) + + if simple_cv: + meta_train_split, meta_valid_split = train_valid_split(meta_train, validation_size, simple_split=True) + else: + meta_train_split, meta_valid_split = train_valid_split(meta_train, validation_size, + valid_category_ids=valid_ids) + + if dev_mode: + meta_train_split = 
meta_train_split.sample(5, random_state=1234) + meta_valid_split = meta_valid_split.sample(3, random_state=1234) data = {'input': {'meta': meta_train_split, 'meta_valid': meta_valid_split, @@ -83,16 +135,26 @@ def _train_pipeline(pipeline_name, validation_size): @action.command() @click.option('-p', '--pipeline_name', help='pipeline to be trained', required=True) -@click.option('-v', '--validation_size', help='percentage of training used for validation', default=0.2, required=False) -def evaluate_pipeline(pipeline_name, validation_size): - _evaluate_pipeline(pipeline_name, validation_size) +@click.option('-v', '--validation_size', help='percentage of training used for validation', default=0.1, required=False) +@click.option('-d', '--dev_mode', help='if true only a small sample of data will be used', is_flag=True, required=False) +@click.option('-s', '--simple_cv', help='use simple train test split', is_flag=True, required=False) +def evaluate_pipeline(pipeline_name, validation_size, dev_mode, simple_cv): + _evaluate_pipeline(pipeline_name, validation_size, dev_mode, simple_cv) -def _evaluate_pipeline(pipeline_name, validation_size): - meta = pd.read_csv(os.path.join(params.meta_dir, 'stage1_metadata.csv')) +def _evaluate_pipeline(pipeline_name, validation_size, dev_mode, simple_cv): + meta = pd.read_csv(os.path.join(params.meta_dir, 'stage{}_metadata.csv'.format(params.competition_stage))) meta_train = meta[meta['is_train'] == 1] valid_ids = eval(params.valid_category_ids) - meta_train_split, meta_valid_split = train_valid_split(meta_train, validation_size, valid_category_ids=valid_ids) + + if simple_cv: + meta_train_split, meta_valid_split = train_valid_split(meta_train, validation_size, simple_split=True) + else: + meta_train_split, meta_valid_split = train_valid_split(meta_train, validation_size, + valid_category_ids=valid_ids) + + if dev_mode: + meta_valid_split = meta_valid_split.sample(2, random_state=1234) data = {'input': {'meta': meta_valid_split, 'meta_valid': None, @@ -121,14 +183,23 @@ def _evaluate_pipeline(pipeline_name, validation_size): @action.command() @click.option('-p', '--pipeline_name', help='pipeline to be trained', required=True) -def predict_pipeline(pipeline_name): - _predict_pipeline(pipeline_name) - - -def _predict_pipeline(pipeline_name): - meta = pd.read_csv(os.path.join(params.meta_dir, 'stage1_metadata.csv')) +@click.option('-d', '--dev_mode', help='if true only a small sample of data will be used', is_flag=True, required=False) +@click.option('-c', '--chunk_size', help='size of the chunks to run prediction on', type=int, default=None, + required=False) +def predict_pipeline(pipeline_name, dev_mode, chunk_size): + if chunk_size is not None: + _predict_in_chunks_pipeline(pipeline_name, dev_mode, chunk_size) + else: + _predict_pipeline(pipeline_name, dev_mode) + + +def _predict_pipeline(pipeline_name, dev_mode): + meta = pd.read_csv(os.path.join(params.meta_dir, 'stage{}_metadata.csv'.format(params.competition_stage))) meta_test = meta[meta['is_train'] == 0] + if dev_mode: + meta_test = meta_test.sample(2, random_state=1234) + data = {'input': {'meta': meta_test, 'meta_valid': None, 'train_mode': False, @@ -142,39 +213,85 @@ def _predict_pipeline(pipeline_name): pipeline.clean_cache() y_pred = output['y_pred'] - create_submission(params.experiment_dir, meta_test, y_pred, logger) + submission = create_submission(meta_test, y_pred, logger) + + submission_filepath = os.path.join(params.experiment_dir, 'submission.csv') + submission.to_csv(submission_filepath, 
index=None, encoding='utf-8') + logger.info('submission saved to {}'.format(submission_filepath)) + logger.info('submission head \n\n{}'.format(submission.head())) + + +def _predict_in_chunks_pipeline(pipeline_name, dev_mode, chunk_size): + meta = pd.read_csv(os.path.join(params.meta_dir, 'stage{}_metadata.csv'.format(params.competition_stage))) + meta_test = meta[meta['is_train'] == 0] + + if dev_mode: + meta_test = meta_test.sample(9, random_state=1234) + + logger.info('processing metadata of shape {}'.format(meta_test.shape)) + + submission_chunks = [] + for meta_chunk in generate_data_frame_chunks(meta_test, chunk_size): + data = {'input': {'meta': meta_chunk, + 'meta_valid': None, + 'train_mode': False, + 'target_sizes': meta_chunk[SIZE_COLUMNS].values + }, + } + + pipeline = PIPELINES[pipeline_name]['inference'](SOLUTION_CONFIG) + pipeline.clean_cache() + output = pipeline.transform(data) + pipeline.clean_cache() + y_pred = output['y_pred'] + + submission_chunk = create_submission(meta_chunk, y_pred, logger) + submission_chunks.append(submission_chunk) + + submission = pd.concat(submission_chunks, axis=0) + + submission_filepath = os.path.join(params.experiment_dir, 'submission.csv') + submission.to_csv(submission_filepath, index=None, encoding='utf-8') + logger.info('submission saved to {}'.format(submission_filepath)) + logger.info('submission head \n\n{}'.format(submission.head())) @action.command() @click.option('-p', '--pipeline_name', help='pipeline to be trained', required=True) @click.option('-v', '--validation_size', help='percentage of training used for validation', default=0.1, required=False) -def train_evaluate_predict_pipeline(pipeline_name, validation_size): +@click.option('-d', '--dev_mode', help='if true only a small sample of data will be used', is_flag=True, required=False) +@click.option('-s', '--simple_cv', help='use simple train test split', is_flag=True, required=False) +def train_evaluate_predict_pipeline(pipeline_name, validation_size, dev_mode, simple_cv): logger.info('training') - _train_pipeline(pipeline_name, validation_size) + _train_pipeline(pipeline_name, validation_size, dev_mode, simple_cv) logger.info('evaluating') - _evaluate_pipeline(pipeline_name, validation_size) + _evaluate_pipeline(pipeline_name, validation_size, dev_mode, simple_cv) logger.info('predicting') - _predict_pipeline(pipeline_name) + _predict_pipeline(pipeline_name, dev_mode) @action.command() @click.option('-p', '--pipeline_name', help='pipeline to be trained', required=True) @click.option('-v', '--validation_size', help='percentage of training used for validation', default=0.1, required=False) -def train_evaluate_pipeline(pipeline_name, validation_size): +@click.option('-d', '--dev_mode', help='if true only a small sample of data will be used', is_flag=True, required=False) +@click.option('-s', '--simple_cv', help='use simple train test split', is_flag=True, required=False) +def train_evaluate_pipeline(pipeline_name, validation_size, dev_mode, simple_cv): logger.info('training') - _train_pipeline(pipeline_name, validation_size) + _train_pipeline(pipeline_name, validation_size, dev_mode, simple_cv) logger.info('evaluating') - _evaluate_pipeline(pipeline_name, validation_size) + _evaluate_pipeline(pipeline_name, validation_size, dev_mode, simple_cv) @action.command() @click.option('-p', '--pipeline_name', help='pipeline to be trained', required=True) @click.option('-v', '--validation_size', help='percentage of training used for validation', default=0.1, required=False) -def 
evaluate_predict_pipeline(pipeline_name, validation_size): +@click.option('-d', '--dev_mode', help='if true only a small sample of data will be used', is_flag=True, required=False) +@click.option('-s', '--simple_cv', help='use simple train test split', is_flag=True, required=False) +def evaluate_predict_pipeline(pipeline_name, validation_size, dev_mode, simple_cv): logger.info('evaluating') - _evaluate_pipeline(pipeline_name, validation_size) + _evaluate_pipeline(pipeline_name, validation_size, dev_mode, simple_cv) logger.info('predicting') - _predict_pipeline(pipeline_name) + _predict_pipeline(pipeline_name, dev_mode) if __name__ == "__main__": diff --git a/metrics.py b/metrics.py index c16f178..600878f 100644 --- a/metrics.py +++ b/metrics.py @@ -1,8 +1,9 @@ import numpy as np -from sklearn.metrics.pairwise import pairwise_distances from tqdm import tqdm -from utils import decompose +from utils import decompose, get_logger + +logger = get_logger() def iou(gt, pred): @@ -23,7 +24,7 @@ def compute_ious(gt, predictions): predictions_ = decompose(predictions) gt_ = np.asarray([el.flatten() for el in gt_]) predictions_ = np.asarray([el.flatten() for el in predictions_]) - ious = pairwise_distances(X=gt_, Y=predictions_, metric=iou) + ious = calculate_iou_matrix(gt_, predictions_) return ious @@ -49,7 +50,6 @@ def intersection_over_union(y_true, y_pred): iou = compute_ious(y_t, y_p) iou_mean = 1.0 * np.sum(iou) / iou.shape[0] ious.append(iou_mean) - return np.mean(ious) @@ -58,3 +58,18 @@ def intersection_over_union_thresholds(y_true, y_pred): for y_t, y_p in tqdm(list(zip(y_true, y_pred))): iouts.append(compute_eval_metric(y_t, y_p)) return np.mean(iouts) + + +def calculate_iou_matrix(ground_truth, proposals): + mat = np.zeros([len(ground_truth),len(proposals)]) + used_proposals = [] + for i, gt in enumerate(ground_truth): + for j,prop in enumerate(proposals): + if j in used_proposals: + continue + iou_ = iou(gt,prop) + mat[i,j]=iou_ + if iou_>0.5: + used_proposals.append(j) + break + return mat diff --git a/models.py b/models.py index 515f6c7..338316b 100644 --- a/models.py +++ b/models.py @@ -1,13 +1,15 @@ +from functools import partial + import numpy as np -import torch.optim as optim +from torch import optim +from callbacks import NeptuneMonitorSegmentation from steps.pytorch.architectures.unet import UNet, UNetMultitask from steps.pytorch.callbacks import CallbackList, TrainingMonitor, ValidationMonitor, ModelCheckpoint, \ ExperimentTiming, ExponentialLRScheduler, EarlyStopping from steps.pytorch.models import Model from steps.pytorch.validation import segmentation_loss from utils import sigmoid -from callbacks import NeptuneMonitorSegmentation class PyTorchUNet(Model): @@ -33,12 +35,23 @@ def __init__(self, architecture_config, training_config, callbacks_config): super().__init__(architecture_config, training_config, callbacks_config) self.model = UNetMultitask(**architecture_config['model_params']) self.weight_regularization = weight_regularization_unet - self.optimizer = optim.Adam(self.weight_regularization(self.model, **architecture_config['regularizer_params']), + self.optimizer = optim.Adam(self.weight_regularization(self.model, + **architecture_config['regularizer_params']), **architecture_config['optimizer_params']) - self.loss_function = [('mask', segmentation_loss, 0.45), - ('contour', segmentation_loss, 0.45), - ('contour_touching', segmentation_loss, 0.0), - ('center', segmentation_loss, 0.1)] + + mask_loss = partial(segmentation_loss, + 
weight_bce=architecture_config['loss_weights']['bce_mask'], + weight_dice=architecture_config['loss_weights']['dice_mask']) + contour_loss = partial(segmentation_loss, + weight_bce=architecture_config['loss_weights']['bce_contour'], + weight_dice=architecture_config['loss_weights']['dice_contour']) + center_loss = partial(segmentation_loss, + weight_bce=architecture_config['loss_weights']['bce_center'], + weight_dice=architecture_config['loss_weights']['dice_center']) + + self.loss_function = [('mask', mask_loss, architecture_config['loss_weights']['mask']), + ('contour', contour_loss, architecture_config['loss_weights']['contour']), + ('center', center_loss, architecture_config['loss_weights']['center'])] self.callbacks = callbacks_unet(self.callbacks_config) def transform(self, datagen, validation_datagen=None): diff --git a/neptune.yaml b/neptune.yaml index 753e3e7..7f52ca7 100644 --- a/neptune.yaml +++ b/neptune.yaml @@ -1,7 +1,7 @@ project-key: DSB name: dsb_open_solution -tags: [solution_4, unet-multitask, touching_edges] +tags: [solution_5, unet-multitask, patching] metric: channel: 'Final Validation Score' @@ -11,66 +11,103 @@ metric: pip-requirements-file: requirements.txt exclude: + - .git + - .idea + - .ipynb_checkpoints - output - imgs - neptune.log - offline_job.log - - .git - - .idea - - .ipynb_checkpoints - - devbook.ipynb - - morphological_postprocessing.ipynb + - notebooks parameters: # Cloud Environment - data_dir: /public/dsb_2018_data/ - meta_dir: /public/dsb_2018_data/ - masks_overlayed_dir: /public/dsb_2018_data/masks_overlayed/ - contours_overlayed_dir: /public/dsb_2018_data/contours_overlayed/ - contours_touching_overlayed_dir: /public/dsb_2018_data/contours_touching_overlayed/ - centers_overlayed_dir: /public/dsb_2018_data/centers_overlayed/ - experiment_dir: /output/dsb/experiments/ + data_dir: /public/dsb_2018_data/ + meta_dir: /public/dsb_2018_data/ + external_data_dirs: /public/dsb_2018_data/external_data/ + masks_overlayed_dir: /public/dsb_2018_data/masks_overlayed/ + contours_overlayed_dir: /public/dsb_2018_data/contours_overlayed/ + centers_overlayed_dir: /public/dsb_2018_data/centers_overlayed/ + experiment_dir: /output/dsb/experiments/ # Local Environment -# data_dir: /path/to/data -# meta_dir: /path/to/data -# masks_overlayed_dir: /path/to/masks_overlayed -# contours_overlayed_dir: /path/to/contours_overlayed -# contours_touching_overlayed_dir: /path/to/contours_touching_overlayed/ -# centers_overlayed_dir: /path/to/centers_overlayed -# experiment_dir: /path/to/work/dir - - valid_category_ids: '[0, 4]' +# data_dir: /path/to/data +# meta_dir: /path/to/data +# external_data_dirs: /path/to/external/data +# masks_overlayed_dir: /path/to/masks_overlayed +# contours_overlayed_dir: /path/to/contours_overlayed +# centers_overlayed_dir: /path/to/centers_overlayed +# experiment_dir: /path/to/work/dir + +# General parameters + valid_category_ids: '[0, 1]' overwrite: 1 num_workers: 4 load_in_memory: 1 pin_memory: 1 + use_patching: 1 + patching_stride: 256 + +# Image parameters (size estimator) + size_estimator__image_h: 512 + size_estimator__image_w: 512 + size_estimator__image_channels: 1 + +# U-Net parameters (size estimator) + size_estimator__nr_unet_outputs: 3 + size_estimator__n_filters: 16 + size_estimator__conv_kernel: 3 + size_estimator__pool_kernel: 3 + size_estimator__pool_stride: 2 + size_estimator__repeat_blocks: 4 -# General Params +# U-Net loss weights (size estimator) + size_estimator__mask: 0.75 + size_estimator__contour: 1.0 + size_estimator__center: 
0.25 + size_estimator__bce_mask: 1.0 + size_estimator__dice_mask: 1.0 + size_estimator__bce_contour: 1.0 + size_estimator__dice_contour: 1.0 + size_estimator__bce_center: 1.0 + size_estimator__dice_center: 1.0 + +# Image parameters (multi-output) image_h: 512 image_w: 512 - image_channels: 3 + image_channels: 1 -# U-Net parameters - nr_unet_outputs: 4 +# U-Net parameters (multi-output) + nr_unet_outputs: 3 n_filters: 16 conv_kernel: 3 pool_kernel: 3 pool_stride: 2 repeat_blocks: 4 +# U-Net loss weights (multi-output) + mask: 0.75 + contour: 1.0 + center: 0.25 + bce_mask: 1.0 + dice_mask: 1.0 + bce_contour: 1.0 + dice_contour: 1.0 + bce_center: 1.0 + dice_center: 1.0 + # Training schedule epochs_nr: 1000 batch_size_train: 4 batch_size_inference: 4 - lr: 0.0001 + lr: 0.0002 momentum: 0.9 - gamma: 0.99 - patience: 100 + gamma: 1.0 + patience: 50 # Regularization use_batch_norm: 1 - l2_reg_conv: 0.0001 + l2_reg_conv: 0.00005 l2_reg_dense: 0.0 dropout_conv: 0.1 dropout_dense: 0.0 diff --git a/notebooks/dataset_of_artifacts.ipynb b/notebooks/dataset_of_artifacts.ipynb new file mode 100644 index 0000000..3ead38c --- /dev/null +++ b/notebooks/dataset_of_artifacts.ipynb @@ -0,0 +1,126 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Dataset of non nuclei images\n", + "\n", + "We prepared a small dataset of images that contain no nuclei and can be added to training in order to help your model deal with some artifacts." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "%load_ext autoreload\n", + "%autoreload 2\n", + "\n", + "import math\n", + "import matplotlib.pyplot as plt\n", + "from sklearn.externals import joblib\n", + "\n", + "def plot_list(images=[], labels=[], n_rows=1):\n", + " n_img = len(images)\n", + " n_lab = len(labels)\n", + " n_cols = math.ceil((n_lab+n_img)/n_rows)\n", + " plt.figure(figsize=(12,10))\n", + " for i, image in enumerate(images):\n", + " plt.subplot(n_rows,n_cols,i+1)\n", + " plt.imshow(image)\n", + " for j, label in enumerate(labels):\n", + " plt.subplot(n_rows,n_cols,n_img+j+1)\n", + " plt.imshow(label, cmap='nipy_spectral')\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "non_nuclei_images = joblib.load('/mnt/ml-team/minerva/dsb_2018_data/kaggle_kernels_data/non_nuclei_images.pkl')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plot_list(non_nuclei_images, n_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Full pipeline\n", + "If you would like to see how we plugged this data in just go to our [open solution](https://github.com/neptune-ml/open-solution-data-science-bowl-2018)\n", + "\n", + "![full open solution pipeline](https://gist.githubusercontent.com/jakubczakon/10e5eb3d5024cc30cdb056d5acd3d92f/raw/e85c1da3acfe96123d0ff16f8145913ee65e938c/full_pipeline.png)\n", + "\n", + "In the `main.py` of the `dev-patching` branch we have a function that generates metadata that looks like this:\n", + "\n", + "```python\n", + "@action.command()\n", + "def prepare_metadata():\n", + " logger.info('creating metadata')\n", + " meta = generate_metadata(data_dir=params.data_dir,\n", + " masks_overlayed_dir=params.masks_overlayed_dir,\n", + " contours_overlayed_dir=params.contours_overlayed_dir,\n", + " 
contours_touching_overlayed_dir=params.contours_touching_overlayed_dir,\n", + " centers_overlayed_dir=params.centers_overlayed_dir)\n", + " logger.info('calculating clusters')\n", + " meta_train = meta[meta['is_train'] == 1]\n", + " meta_test = meta[meta['is_train'] == 0]\n", + " vgg_features_clusters = get_vgg_clusters(meta_train)\n", + " meta_train['vgg_features_clusters'] = vgg_features_clusters\n", + " meta_test['vgg_features_clusters'] = 'NaN'\n", + " meta = pd.concat([meta_train, meta_test], axis=0)\n", + "\n", + " logger.info('adding artifacts metadata')\n", + " meta_artifacts = build_artifacts_metadata(artifacts_dir=params.artifacts_dir)\n", + " meta = pd.concat([meta, meta_artifacts], axis=0)\n", + "\n", + " meta.to_csv(os.path.join(params.meta_dir, 'stage1_metadata.csv'), index=None)\n", + "```\n", + "\n", + "Feel free to use it in your solution.\n", + "Good luck!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "dl_py3", + "language": "python", + "name": "dl_py3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/kaggle_morhpological_postprocessing.ipynb b/notebooks/morphological_postprocessing.ipynb similarity index 99% rename from kaggle_morhpological_postprocessing.ipynb rename to notebooks/morphological_postprocessing.ipynb index 41b1736..c0360ec 100644 --- a/kaggle_morhpological_postprocessing.ipynb +++ b/notebooks/morphological_postprocessing.ipynb @@ -1511,9 +1511,9 @@ ], "metadata": { "kernelspec": { - "display_name": "dl_py3", + "display_name": "Python 3", "language": "python", - "name": "dl_py3" + "name": "python3" }, "language_info": { "codemirror_mode": { diff --git a/notebooks/stain_deconvolution.ipynb b/notebooks/stain_deconvolution.ipynb new file mode 100644 index 0000000..cafc7eb --- /dev/null +++ b/notebooks/stain_deconvolution.ipynb @@ -0,0 +1,246 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Stain deconvolution\n", + "\n", + "There are a few modalities of images in the dataset. \n", + "The majority of images is actually greyscale broadcasted to 3 channels. \n", + "The rest of the images the *actual* rgb images are stained with hematoxylin and eosin (atleast to my knowledge).\n", + "\n", + "One can use that information to do stain deconvolution on rgb images and transform the dataset so that all images have just 1 intensity channel. We will approach this problem in this notebook." 
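
As a rough sketch of the point above (assuming `images` is a list of HxWx3 `uint8` arrays, for example the `sample_images` loaded a few cells below), one can count how many images are a greyscale image broadcast to three identical channels versus genuinely stained RGB. Exact per-pixel equality is used here, which is a stricter variant of the channel-mean check used later in `is_stained`.

```python
import numpy as np

def count_modalities(images):
    # a greyscale image broadcast to RGB has three identical channels
    greyscale_like, stained = 0, 0
    for img in images:
        r, g, b = img[..., 0], img[..., 1], img[..., 2]
        if np.array_equal(r, g) and np.array_equal(g, b):
            greyscale_like += 1
        else:
            stained += 1
    return greyscale_like, stained
```
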
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%matplotlib inline\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2\n",
+    "\n",
+    "import numpy as np\n",
+    "import math\n",
+    "import matplotlib.pyplot as plt\n",
+    "from skimage.color import rgb2grey, rgb2hed\n",
+    "from skimage.exposure import rescale_intensity\n",
+    "from sklearn.externals import joblib\n",
+    "\n",
+    "\n",
+    "def plot_list(images=[], labels=[], n_rows=1):\n",
+    "    n_img = len(images)\n",
+    "    n_lab = len(labels)\n",
+    "    n_cols = math.ceil((n_lab+n_img)/n_rows)\n",
+    "    plt.figure(figsize=(12,10))\n",
+    "    for i, image in enumerate(images):\n",
+    "        plt.subplot(n_rows,n_cols,i+1)\n",
+    "        plt.imshow(image)\n",
+    "    for j, label in enumerate(labels):\n",
+    "        plt.subplot(n_rows,n_cols,n_img+j+1)\n",
+    "        plt.imshow(label, cmap='nipy_spectral')\n",
+    "    plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sample_images = joblib.load('/mnt/ml-team/minerva/dsb_2018_data/kaggle_kernels_data/sample_stained_not_stained_images.pkl')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's take a look at some example images from the dataset:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plot_list(sample_images, n_rows=4)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Stained image filter\n",
+    "\n",
+    "Before we extract the hematoxylin and eosin staining we first need to filter out the images that are actually stained RGB images. \n",
+    "A very simple approach is the following:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def is_stained(img):\n",
+    "    red_mean, green_mean, blue_mean = img.mean(axis=(0, 1))\n",
+    "    if red_mean == green_mean == blue_mean:\n",
+    "        return False\n",
+    "    else:\n",
+    "        return True"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Deconvolution\n",
+    "Now we will extract the hematoxylin and eosin channels from the stained images using the `rgb2hed` function. \n",
+    "\n",
+    "Let's see what it does:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for img in sample_images:\n",
+    "    if is_stained(img):\n",
+    "        img_hed = rgb2hed(img)\n",
+    "        img_hematoxilin = img_hed[:,:,0]\n",
+    "        img_eosin = img_hed[:,:,1]\n",
+    "        img_dab = img_hed[:,:,2]\n",
+    "\n",
+    "        plot_list([img, img_hematoxilin, img_eosin, img_dab])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now we can write a function that takes the hematoxylin and eosin channels and combines them together.\n",
+    "We will also parametrize which channels the user wants to use."
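
One detail worth noting before the implementation: `rgb2hed` returns stain densities on an arbitrary scale that is not confined to [0, 1], so each channel is mapped to [0, 1] with `rescale_intensity` before the channels are combined. A minimal sketch of just that normalization step (an illustration only; the full function follows in the next cell):

```python
from skimage.color import rgb2hed
from skimage.exposure import rescale_intensity

def hed_channels_01(img):
    # deconvolve into (hematoxylin, eosin, DAB) and map each channel to [0, 1]
    img_hed = rgb2hed(img)
    return [rescale_intensity(img_hed[:, :, c], out_range=(0, 1)) for c in range(3)]
```
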
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def stain_deconvolve(img, mode='hematoxylin_eosin_sum'):\n", + " img_hed = rgb2hed(img)\n", + " if mode == 'hematoxylin_eosin_sum':\n", + " h, w = img.shape[:2]\n", + " img_hed = rgb2hed(img)\n", + " img_he_sum = np.zeros((h, w, 2))\n", + " img_he_sum[:, :, 0] = rescale_intensity(img_hed[:, :, 0], out_range=(0, 1))\n", + " img_he_sum[:, :, 1] = rescale_intensity(img_hed[:, :, 1], out_range=(0, 1))\n", + " img_deconv = rescale_intensity(img_he_sum.sum(axis=2), out_range=(0, 1))\n", + " elif mode == 'hematoxylin':\n", + " img_deconv = img_hed[:, :, 0]\n", + " elif mode == 'eosin':\n", + " img_deconv = img_hed[:, :, 1]\n", + " else:\n", + " raise NotImplementedError('only hematoxylin_eosin_sum, hematoxylin, eosin modes are supported')\n", + " return img_deconv" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's see the results and compare how does this intensity differs from taking a simple greyscale." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "for img in sample_images:\n", + " if is_stained(img):\n", + " deconv = stain_deconvolve(img)\n", + " grey = 1-rgb2grey(img)\n", + " plot_list([img, grey, deconv])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The difference is not huge but for some images, for instance the 3rd image we were able to extract a cleaner image with `stain_deconvolve` than with `greyscale`. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Full pipeline\n", + "If you would like to see how we plugged stain deconvolution into our pipeline go to [open solution](https://github.com/neptune-ml/open-solution-data-science-bowl-2018)\n", + "\n", + "![full open solution pipeline](https://gist.githubusercontent.com/jakubczakon/10e5eb3d5024cc30cdb056d5acd3d92f/raw/e85c1da3acfe96123d0ff16f8145913ee65e938c/full_pipeline.png)\n", + "\n", + "The stain deconvolution step is defined in the `preprocessing.py` file:\n", + "\n", + "```python \n", + "\n", + "class StainDeconvolution(BaseTransformer):\n", + " def __init__(self, mode):\n", + " self.mode = mode\n", + "\n", + " def transform(self, X):\n", + " X_deconvoled = []\n", + " for x in X[0]:\n", + " x = from_pil(x)\n", + " if is_stained(x):\n", + " x_deconv = (stain_deconvolve(x, mode=self.mode) * 255).astype(np.uint8)\n", + " else:\n", + " x_deconv = (rgb2grey(x) * 255).astype(np.uint8)\n", + " x_deconv = to_pil(x_deconv)\n", + " X_deconvoled.append(x_deconv)\n", + " return {'X': [X_deconvoled]}\n", + "```\n", + "\n", + "If you want to use our implementation just go for it!" 
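
The wiring of this transformer into the step graph follows the `add_stain_deconvolution` helper from `pipelines.py` in this patch; the sketch below mirrors it and assumes a `config` object and an upstream `reader` step are already defined.

```python
from steps.base import Step, Dummy
from preprocessing import StainDeconvolution

# deconvolve the reader's images ...
stain_deconvolution = Step(name='stain_deconvolution',
                           transformer=StainDeconvolution(**config.stain_deconvolution),
                           input_steps=[reader],
                           adapter={'X': ([(reader.name, 'X')])},
                           cache_dirpath=config.env.cache_dirpath)

# ... and re-join the deconvolved X with the original targets y
reader_with_deconv = Step(name='reader_with_deconv',
                          transformer=Dummy(),
                          input_steps=[reader, stain_deconvolution],
                          adapter={'X': ([(stain_deconvolution.name, 'X')]),
                                   'y': ([(reader.name, 'y')])},
                          cache_dirpath=config.env.cache_dirpath)
```
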
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "dl_py3", + "language": "python", + "name": "dl_py3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/trainable_rescale.ipynb b/notebooks/trainable_rescale.ipynb new file mode 100644 index 0000000..a179aea --- /dev/null +++ b/notebooks/trainable_rescale.ipynb @@ -0,0 +1,175 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Trainable Image Resize\n", + "\n", + "It was mentioned in the forums https://www.kaggle.com/c/data-science-bowl-2018/discussion/52766 that the fact that images are at different scales can be a significant problem. \n", + "It was also pointed that people dealt with it before in the following manner:\n", + "\n", + "* train network on raw images\n", + "\n", + "* run image through the net\n", + "\n", + "* estimate object size\n", + "\n", + "* resize based on the object size estimate\n", + "\n", + "* train new network on resize images\n", + "\n", + "We decided to implement it in the following way:\n", + "\n", + "* train unet for mask and contour prediction\n", + "\n", + "* do some morphological postprocessing mask+contour\n", + "\n", + "* estimage size by:\n", + "\n", + "```python\n", + "class CellSizer(BaseTransformer):\n", + " def __init__(self, **kwargs):\n", + " pass\n", + "\n", + " def transform(self, labeled_images):\n", + " mean_sizes = []\n", + " for image in tqdm(labeled_images):\n", + " mean_size = mean_cell_size(image)\n", + " mean_sizes.append(mean_size)\n", + " return {'sizes': mean_sizes}\n", + " \n", + "def mean_cell_size(labeled_image):\n", + " blob_sizes = itemfreq(labeled_image)\n", + " if blob_sizes.shape[0]==1:\n", + " return 0\n", + " else:\n", + " blob_sizes = blob_sizes[blob_sizes[:, 0].argsort()][1:, 1]\n", + " return np.mean(blob_sizes)\n", + " \n", + "```\n", + "\n", + "* rescaling the image (assuming certain boundaries) with\n", + "\n", + "```python \n", + "\n", + "class ImageReaderRescaler(BaseTransformer):\n", + " def __init__(self, min_size, max_size, target_ratio):\n", + " self.min_size = min_size\n", + " self.max_size = max_size\n", + " self.target_ratio = target_ratio\n", + "\n", + " def _transform(self, sizes, X, y=None, meta=None):\n", + " raw_images = X[0]\n", + " raw_images_adj = []\n", + " for size, raw_image in tqdm(zip(sizes, raw_images)):\n", + " h_adj, w_adj = self._get_adjusted_image_size(size, from_pil(raw_image))\n", + " raw_image_adj = resize(from_pil(raw_image), (h_adj, w_adj), \n", + " preserve_range=True).astype(np.uint8)\n", + " raw_images_adj.append(to_pil(raw_image_adj))\n", + " X_adj = [raw_images_adj]\n", + " ...\n", + " return X_adj, y_adj\n", + "\n", + " def _get_adjusted_image_size(self, mean_cell_size, img):\n", + " h, w = img.shape[:2]\n", + " img_area = h * w\n", + " \n", + " if mean_cell_size ==0:\n", + " adj_ratio = 1.0\n", + " else:\n", + " size_ratio = img_area / mean_cell_size\n", + " adj_ratio = size_ratio / self.target_ratio\n", + "\n", + " h_adj = int(clip(self.min_size, h * adj_ratio, self.max_size))\n", + " w_adj = int(clip(self.min_size, w * adj_ratio, self.max_size))\n", + "\n", + " return h_adj, w_adj\n", + "```\n", + 
"\n", + "* Finally on such rescaled images we train and predict by using patches of fixed size (say 512x512). For example inference can be done with something like this:\n", + "\n", + "```python\n", + "class PatchCombiner(BaseTransformer):\n", + "```\n", + " ...\n", + "```python\n", + " def _join_output(self, patch_meta, image_patches):\n", + " image_h = patch_meta['image_h'].unique()[0]\n", + " image_w = patch_meta['image_w'].unique()[0]\n", + " prediction_image = np.zeros((image_h, image_w))\n", + " prediction_image_padded = get_mosaic_padded_image(prediction_image, \n", + " self.patching_size, \n", + " self.patching_stride)\n", + "\n", + " patches_per_image = 0\n", + " for (y_coordinate, \n", + " x_coordinate, \n", + " tta_angle), image_patch in zip(patch_meta[['y_coordinates', \n", + " 'x_coordinates', \n", + " 'tta_angles']].values.tolist(), \n", + " image_patches):\n", + " patches_per_image += 1\n", + " image_patch = np.rot90(image_patch, -1 * tta_angle / 90.)\n", + " (window_y, \n", + " window_x) = y_coordinate * self.patching_stride, x_coordinate * self.patching_stride\n", + " prediction_image_padded[window_y:self.patching_size + window_y,\n", + " window_x:self.patching_size + window_x] += image_patch\n", + "\n", + " _, h_top, h_bottom, _ = get_padded_size(max(image_h, self.patching_size),\n", + " self.patching_size,\n", + " self.patching_stride)\n", + " _, w_left, w_right, _ = get_padded_size(max(image_w, self.patching_size),\n", + " self.patching_size,\n", + " self.patching_stride)\n", + "\n", + " prediction_image = prediction_image_padded[h_top:-h_bottom, w_left:-w_right]\n", + " prediction_image /= self.normalization_factor\n", + " return prediction_image\n", + "```\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Full pipeline\n", + "If you would like to see how we plugged trainable rescale into our pipeline go to [open solution](https://github.com/neptune-ml/open-solution-data-science-bowl-2018)\n", + "\n", + "![full open solution pipeline](https://gist.githubusercontent.com/jakubczakon/10e5eb3d5024cc30cdb056d5acd3d92f/raw/e85c1da3acfe96123d0ff16f8145913ee65e938c/full_pipeline.png)\n", + "\n", + "The `ImageReaderRescaler` step is defined in the `preprocessing.py` file:\n", + "\n", + "If you want to use our implementation just go for it!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "dl_py3", + "language": "python", + "name": "dl_py3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/pipeline_config.py b/pipeline_config.py index 9ec6233..04bb525 100644 --- a/pipeline_config.py +++ b/pipeline_config.py @@ -1,7 +1,7 @@ import os -from deepsense import neptune from attrdict import AttrDict +from deepsense import neptune from utils import read_params @@ -11,10 +11,7 @@ SIZE_COLUMNS = ['height', 'width'] X_COLUMNS = ['file_path_image'] Y_COLUMNS = ['file_path_mask'] -Y_COLUMNS_MULTITASK = ['file_path_mask', - 'file_path_contours', - 'file_path_contours_touching', - 'file_path_centers'] +Y_COLUMNS_MULTITASK = ['file_path_mask', 'file_path_contours', 'file_path_centers'] Y_COLUMNS_SCORING = ['file_path_masks'] GLOBAL_CONFIG = {'exp_root': params.experiment_dir, @@ -37,14 +34,18 @@ }, 'reader_single': {'x_columns': X_COLUMNS, 'y_columns': Y_COLUMNS, - 'target_shape': GLOBAL_CONFIG['img_H-W'] }, 'reader_multitask': {'x_columns': X_COLUMNS, 'y_columns': Y_COLUMNS_MULTITASK, - 'target_shape': GLOBAL_CONFIG['img_H-W'] }, + 'reader_rescaler': {'min_size': params.image_h, + 'max_size': 2000, + 'target_ratio': 200}, + 'stain_deconvolution': {'mode': 'hematoxylin_eosin_sum'}, 'loader': {'dataset_params': {'h': params.image_h, 'w': params.image_w, + 'use_patching': params.use_patching, + 'patching_stride': params.patching_stride }, 'loader_params': {'training': {'batch_size': params.batch_size_train, 'shuffle': True, @@ -58,6 +59,56 @@ }, }, }, + 'patch_combiner': {'patching_size': params.image_h, + 'patching_stride': params.patching_stride}, + 'unet_size_estimator': { + 'architecture_config': {'model_params': {'n_filters': params.size_estimator__n_filters, + 'conv_kernel': params.size_estimator__conv_kernel, + 'pool_kernel': params.size_estimator__pool_kernel, + 'pool_stride': params.size_estimator__pool_stride, + 'repeat_blocks': params.size_estimator__repeat_blocks, + 'batch_norm': params.use_batch_norm, + 'dropout': params.dropout_conv, + 'in_channels': params.size_estimator__image_channels, + 'nr_outputs': params.size_estimator__nr_unet_outputs + }, + 'optimizer_params': {'lr': params.lr, + }, + 'regularizer_params': {'regularize': True, + 'weight_decay_conv2d': params.l2_reg_conv, + }, + 'weights_init': {'function': 'he', + }, + 'loss_weights': {'bce_mask': params.size_estimator__bce_mask, + 'dice_mask': params.size_estimator__dice_mask, + 'bce_contour': params.size_estimator__bce_contour, + 'dice_contour': params.size_estimator__dice_contour, + 'bce_center': params.size_estimator__bce_center, + 'dice_center': params.size_estimator__dice_center, + 'mask': params.size_estimator__mask, + 'contour': params.size_estimator__contour, + 'center': params.size_estimator__center, + }, + }, + 'training_config': {'epochs': params.epochs_nr, + }, + 'callbacks_config': { + 'model_checkpoint': { + 'filepath': os.path.join(GLOBAL_CONFIG['exp_root'], 'checkpoints', 'unet_size_estimator', 'best.torch'), + 'epoch_every': 1}, + 'lr_scheduler': {'gamma': params.gamma, + 'epoch_every': 1}, + 'training_monitor': {'batch_every': 0, + 'epoch_every': 1}, + 
'experiment_timing': {'batch_every': 0, + 'epoch_every': 1}, + 'validation_monitor': {'epoch_every': 1}, + 'neptune_monitor': {'model_name': 'unet', + 'image_nr': 4, + 'image_resize': 0.2}, + 'early_stopping': {'patience': params.patience}, + }, + }, 'unet': { 'architecture_config': {'model_params': {'n_filters': params.n_filters, 'conv_kernel': params.conv_kernel, @@ -74,16 +125,24 @@ 'regularizer_params': {'regularize': True, 'weight_decay_conv2d': params.l2_reg_conv, }, - 'weights_init': {'function': 'xavier', + 'weights_init': {'function': 'he', + }, + 'loss_weights': {'bce_mask': params.bce_mask, + 'dice_mask': params.dice_mask, + 'bce_contour': params.bce_contour, + 'dice_contour': params.dice_contour, + 'bce_center': params.bce_center, + 'dice_center': params.dice_center, + 'mask': params.mask, + 'contour': params.contour, + 'center': params.center, }, }, 'training_config': {'epochs': params.epochs_nr, - 'shuffle': True, - 'batch_size': params.batch_size_train, }, 'callbacks_config': { 'model_checkpoint': { - 'filepath': os.path.join(GLOBAL_CONFIG['exp_root'], 'checkpoints', 'network', 'best.torch'), + 'filepath': os.path.join(GLOBAL_CONFIG['exp_root'], 'checkpoints', 'unet', 'best.torch'), 'epoch_every': 1}, 'lr_scheduler': {'gamma': params.gamma, 'epoch_every': 1}, @@ -99,7 +158,6 @@ }, }, 'thresholder': {'threshold': params.threshold}, - 'watershed': {}, 'dropper': {'min_size': params.min_nuclei_size}, 'postprocessor': {} }) diff --git a/pipelines.py b/pipelines.py index 8bae601..09e0e22 100644 --- a/pipelines.py +++ b/pipelines.py @@ -1,28 +1,27 @@ from functools import partial -from loaders import MetadataImageSegmentationLoader, MetadataImageSegmentationMultitaskLoader, \ - ImageSegmentationMultitaskLoader, ImageSegmentationLoader +import loaders from models import PyTorchUNet, PyTorchUNetMultitask -from postprocessing import Resizer, Thresholder, NucleiLabeler, Dropper, \ - WatershedCenter, WatershedContour, BinaryFillHoles, Postprocessor -from steps.base import Step, Dummy -from steps.preprocessing import XYSplit, ImageReader +from postprocessing import Resizer, Thresholder, NucleiLabeler, Postprocessor, CellSizer +from preprocessing import ImageReaderRescaler, ImageReader, StainDeconvolution +from steps.base import Step, Dummy, to_dict_inputs +from steps.preprocessing import XYSplit from utils import squeeze_inputs def unet(config, train_mode): if train_mode: - save_output = True + save_output = False load_saved_output = False - preprocessing = preprocessing_train(config) else: - save_output = True + save_output = False load_saved_output = False - preprocessing = preprocessing_inference(config) + + loader = preprocessing(config, model_type='single', is_train=train_mode) unet = Step(name='unet', transformer=PyTorchUNet(**config.unet), - input_steps=[preprocessing], + input_steps=[loader], cache_dirpath=config.env.cache_dirpath, save_output=save_output, load_saved_output=load_saved_output) @@ -41,139 +40,465 @@ def unet(config, train_mode): def unet_multitask(config, train_mode): if train_mode: - save_output = True + save_output = False load_saved_output = False - preprocessing = preprocessing_multitask_train(config) else: - save_output = True + save_output = False load_saved_output = False - preprocessing = preprocessing_multitask_inference(config) - unet_multitask = Step(name='unet_multitask', - transformer=PyTorchUNetMultitask(**config.unet), - input_steps=[preprocessing], + if config.loader.dataset_params.use_patching: + loader = preprocessing(config, 
model_type='multitask', + is_train=train_mode, + loader_mode='patching_inference') + + unet_multitask_patches = Step(name='unet_multitask', + transformer=PyTorchUNetMultitask(**config.unet), + input_steps=[loader], + adapter={'datagen': ([(loader.name, 'datagen')]), + 'validation_datagen': ([(loader.name, 'validation_datagen')]), + }, + cache_dirpath=config.env.cache_dirpath, + cache_output=False, + save_output=False, + load_saved_output=False) + + unet_multitask = Step(name='patch_joiner', + transformer=loaders.PatchCombiner(**config.patch_combiner), + input_steps=[unet_multitask_patches, loader], + adapter={'patch_ids': ([(loader.name, 'patch_ids')]), + 'outputs': ([(unet_multitask_patches.name, 'mask_prediction'), + (unet_multitask_patches.name, 'contour_prediction'), + (unet_multitask_patches.name, 'center_prediction')], + partial(to_dict_inputs, keys=['mask_prediction', + 'contour_prediction', + 'center_prediction'])), + }, + cache_dirpath=config.env.cache_dirpath, + cache_output=True, + save_output=True, + load_saved_output=load_saved_output) + else: + loader = preprocessing(config, model_type='multitask', + is_train=train_mode, + loader_mode=None) + + unet_multitask = Step(name='unet_multitask', + transformer=PyTorchUNetMultitask(**config.unet), + input_steps=[loader], + adapter={'datagen': ([(loader.name, 'datagen')]), + 'validation_datagen': ([(loader.name, 'validation_datagen')]), + }, + cache_dirpath=config.env.cache_dirpath, + cache_output=True, + save_output=True, + load_saved_output=False) + + morphological_postprocessing = postprocessing(unet_multitask, unet_multitask, config, + suffix='', save_output=True) + + output = Step(name='output', + transformer=Dummy(), + input_steps=[morphological_postprocessing], + adapter={'y_pred': ([(morphological_postprocessing.name, 'labeled_images')]), + }, + cache_dirpath=config.env.cache_dirpath) + return output + + +def patched_unet_training(config): + reader_train = Step(name='reader', + transformer=ImageReader(**config.reader_multitask), + input_data=['input'], + adapter={'meta': ([('input', 'meta')]), + 'meta_valid': ([('input', 'meta_valid')]), + 'train_mode': ([('input', 'train_mode')]), + }, + cache_dirpath=config.env.cache_dirpath) + deconved_reader_train = add_stain_deconvolution(reader_train, config, cache_output=True, save_output=False, + suffix='') + + reader_valid = Step(name='reader_valid', + transformer=ImageReader(**config.reader_multitask), + input_data=['input'], + adapter={'meta': ([('input', 'meta_valid')]), + 'train_mode': ([('input', 'train_mode')]), + }, + cache_dirpath=config.env.cache_dirpath) + deconved_reader_valid = add_stain_deconvolution(reader_valid, config, cache_output=True, save_output=False, + suffix='_valid') + + reader = Step(name='reader_joined', + transformer=Dummy(), + input_steps=[reader_train, reader_valid, + deconved_reader_train, deconved_reader_valid], + adapter={'X': ([(deconved_reader_train.name, 'X')]), + 'y': ([(deconved_reader_train.name, 'y')]), + 'X_valid': ([(deconved_reader_valid.name, 'X')]), + 'y_valid': ([(deconved_reader_valid.name, 'y')]), + }, + cache_dirpath=config.env.cache_dirpath, + save_output=True, load_saved_output=False) + + unet_multitask = unet_multitask_block(reader, config, config.unet_size_estimator, + loader_mode=None, suffix='_size_estimator') + + return unet_multitask + + +def scale_adjusted_patched_unet_training(config): + reader_train = Step(name='reader', + transformer=ImageReader(**config.reader_multitask), + input_data=['input'], + adapter={'meta': ([('input', 
'meta')]), + 'meta_valid': ([('input', 'meta_valid')]), + 'train_mode': ([('input', 'train_mode')]), + }, + cache_dirpath=config.env.cache_dirpath, + cache_output=True) + deconved_reader_train = add_stain_deconvolution(reader_train, config, + cache_output=True, + save_output=False, + suffix='') + + reader_valid = Step(name='reader_valid', + transformer=ImageReader(**config.reader_multitask), + input_data=['input'], + adapter={'meta': ([('input', 'meta_valid')]), + 'train_mode': ([('input', 'train_mode')]), + }, + cache_dirpath=config.env.cache_dirpath, + cache_output=True) + deconved_reader_valid = add_stain_deconvolution(reader_valid, config, + cache_output=True, + save_output=False, + suffix='_valid') + + deconved_reader_train_valid = Step(name='reader_train_valid', + transformer=Dummy(), + input_steps=[deconved_reader_train, deconved_reader_valid], + adapter={'X': ([(deconved_reader_train.name, 'X')]), + 'y': ([(deconved_reader_train.name, 'y')]), + 'X_valid': ([(deconved_reader_valid.name, 'X')]), + 'y_valid': ([(deconved_reader_valid.name, 'y')]), + }, + cache_dirpath=config.env.cache_dirpath, + cache_output=True) + + scale_estimator_train = unet_size_estimator(deconved_reader_train_valid, config, config.unet_size_estimator, + cache_output=True, train_mode=True) + scale_estimator_valid = unet_size_estimator(deconved_reader_valid, config, config.unet_size_estimator, + suffix='_valid', cache_output=True, train_mode=False) + + reader_rescaler_train = Step(name='rescaler', + transformer=ImageReaderRescaler(**config.reader_rescaler), + input_data=['input'], + input_steps=[reader_train, scale_estimator_train], + adapter={'sizes': ([(scale_estimator_train.name, 'sizes')]), + 'X': ([(reader_train.name, 'X')]), + 'y': ([(reader_train.name, 'y')]), + 'meta': ([('input', 'meta')]), + }, + cache_dirpath=config.env.cache_dirpath, + cache_output=True) + deconved_reader_rescaler_train = add_stain_deconvolution(reader_rescaler_train, config, + cache_output=True, + save_output=False, + suffix='_rescaled') + + reader_rescaler_valid = Step(name='rescaler_valid', + transformer=ImageReaderRescaler(**config.reader_rescaler), + input_data=['input'], + input_steps=[reader_valid, scale_estimator_valid], + adapter={'sizes': ([(scale_estimator_valid.name, 'sizes')]), + 'X': ([(reader_valid.name, 'X')]), + 'y': ([(reader_valid.name, 'y')]), + 'meta': ([('input', 'meta_valid')]), + }, + cache_dirpath=config.env.cache_dirpath, + cache_output=True) + deconved_reader_rescaler_valid = add_stain_deconvolution(reader_rescaler_valid, config, + cache_output=True, + save_output=False, + suffix='_rescaled_valid') + + reader_rescaler = Step(name='rescaler_join', + transformer=Dummy(), + input_steps=[deconved_reader_rescaler_train, deconved_reader_rescaler_valid], + adapter={'X': ([(deconved_reader_rescaler_train.name, 'X')]), + + 'y': ([(deconved_reader_rescaler_train.name, 'y')]), + 'X_valid': ([(deconved_reader_rescaler_valid.name, 'X')]), + 'y_valid': ([(deconved_reader_rescaler_valid.name, 'y')]), + }, + cache_dirpath=config.env.cache_dirpath, + cache_output=True, + save_output=True, load_saved_output=True) + + unet_rescaled = unet_multitask_block(reader_rescaler, config, config.unet, + loader_mode='patched_training', + suffix='_rescaled', + force_fitting=True) + + return unet_rescaled + + +def scale_adjusted_patched_unet(config): + reader = Step(name='reader', + transformer=ImageReader(**config.reader_multitask), + input_data=['input'], + adapter={'meta': ([('input', 'meta')]), + 'train_mode': ([('input', 
'train_mode')]), + }, + cache_dirpath=config.env.cache_dirpath, + cache_output=True) + deconved_reader = add_stain_deconvolution(reader, config, + cache_output=True, + save_output=False, + suffix='') + scale_estimator = unet_size_estimator(deconved_reader, config, config.unet_size_estimator, + cache_output=True, train_mode=False) + + reader_rescaler = Step(name='rescaler', + transformer=ImageReaderRescaler(**config.reader_rescaler), + input_data=['input'], + input_steps=[reader, scale_estimator], + adapter={'sizes': ([(scale_estimator.name, 'sizes')]), + 'X': ([(reader.name, 'X')]), + 'y': ([(reader.name, 'y')]), + 'meta': ([('input', 'meta')]), + }, + cache_dirpath=config.env.cache_dirpath, + cache_output=True) + deconved_reader = add_stain_deconvolution(reader_rescaler, config, + cache_output=True, + save_output=False, + suffix='_rescaled') + + loader_rescaled = Step(name='loader_rescaled', + transformer=loaders.ImageSegmentationMultitaskLoaderPatchingInference(**config.loader), + input_data=['input'], + input_steps=[deconved_reader], + adapter={'X': ([(deconved_reader.name, 'X')]), + 'y': ([(deconved_reader.name, 'y')]), + 'train_mode': ([('input', 'train_mode')]), + }, + cache_dirpath=config.env.cache_dirpath, + cache_output=True) + + unet_rescaled_patches = Step(name='unet_rescaled', + transformer=PyTorchUNetMultitask(**config.unet), + input_steps=[loader_rescaled], + adapter={'datagen': ([(loader_rescaled.name, 'datagen')]), + 'validation_datagen': ([(loader_rescaled.name, 'validation_datagen')]), + }, + cache_dirpath=config.env.cache_dirpath) + + unet_rescaled = Step(name='patch_joiner', + transformer=loaders.PatchCombiner(**config.patch_combiner), + input_steps=[unet_rescaled_patches, loader_rescaled], + adapter={'patch_ids': ([(loader_rescaled.name, 'patch_ids')]), + 'outputs': ([(unet_rescaled_patches.name, 'mask_prediction'), + (unet_rescaled_patches.name, 'contour_prediction'), + (unet_rescaled_patches.name, 'center_prediction')], + partial(to_dict_inputs, keys=['mask_prediction', + 'contour_prediction', + 'center_prediction'])), + }, + cache_dirpath=config.env.cache_dirpath, + cache_output=True) + + morphological_postprocessing = postprocessing(unet_rescaled, unet_rescaled, config, + suffix='_rescaled', save_output=True) + + output = Step(name='output', + transformer=Dummy(), + input_steps=[morphological_postprocessing], + adapter={'y_pred': ([(morphological_postprocessing.name, 'labeled_images')]), + }, + cache_dirpath=config.env.cache_dirpath) + return output + + +def add_stain_deconvolution(reader, config, cache_output=False, save_output=False, suffix=''): + stain_deconvolution = Step(name='stain_deconvolution{}'.format(suffix), + transformer=StainDeconvolution(**config.stain_deconvolution), + input_steps=[reader], + adapter={'X': ([(reader.name, 'X')]), + }, + cache_dirpath=config.env.cache_dirpath) + + reader = Step(name='reader_with_deconv{}'.format(suffix), + transformer=Dummy(), + input_steps=[reader, stain_deconvolution], + adapter={'X': ([(stain_deconvolution.name, 'X')]), + 'y': ([(reader.name, 'y')]), + }, + cache_dirpath=config.env.cache_dirpath, + cache_output=cache_output, + save_output=save_output) + + return reader + + +def unet_size_estimator(reader, config, config_network, suffix='', cache_output=False, train_mode=True): + unet = unet_multitask_block(reader, config, config_network, loader_mode=None, suffix='_size_estimator', + train_mode=train_mode) + + suffix = '_size_estimator{}'.format(suffix) + + morphological_postprocessing = postprocessing(unet, unet, 
config, suffix=suffix, cache_output=cache_output) + + cell_sizer = Step(name='cell_sizer{}'.format(suffix), + transformer=CellSizer(), + input_steps=[morphological_postprocessing], + adapter={'labeled_images': ([(morphological_postprocessing.name, 'labeled_images')])}, + cache_dirpath=config.env.cache_dirpath, + cache_output=cache_output + ) + return cell_sizer + + +def unet_multitask_block(reader, config, config_network, loader_mode, force_fitting=False, suffix='', + cache_output=False, train_mode=True): + if loader_mode == 'patching_train': + Loader = loaders.ImageSegmentationMultitaskLoaderPatchingTrain + elif loader_mode == 'patching_inference': + Loader = loaders.ImageSegmentationMultitaskLoaderPatchingInference + else: + Loader = loaders.ImageSegmentationMultitaskLoader + + if train_mode: + adapter_mapping = {'X': ([(reader.name, 'X')]), + 'y': ([(reader.name, 'y')]), + 'train_mode': ([('input', 'train_mode')]), + 'X_valid': ([(reader.name, 'X_valid')]), + 'y_valid': ([(reader.name, 'y_valid')]), + } + else: + adapter_mapping = {'X': ([(reader.name, 'X')]), + 'y': ([(reader.name, 'y')]), + 'train_mode': ([('input', 'train_mode')]), + } + + loader = Step(name='loader{}'.format(suffix), + transformer=Loader(**config.loader), + input_data=['input'], + input_steps=[reader], + adapter=adapter_mapping, + cache_dirpath=config.env.cache_dirpath, + cache_output=cache_output) + + unet_multitask = Step(name='unet{}'.format(suffix), + transformer=PyTorchUNetMultitask(**config_network), + input_steps=[loader], + adapter={'datagen': ([(loader.name, 'datagen')]), + 'validation_datagen': ([(loader.name, 'validation_datagen')]), + }, cache_dirpath=config.env.cache_dirpath, - save_output=save_output, load_saved_output=load_saved_output) + force_fitting=force_fitting, + cache_output=cache_output) - mask_resize = Step(name='mask_resize', + return unet_multitask + + +def preprocessing(config, model_type, is_train, loader_mode=None): + if config.execution.load_in_memory: + if model_type == 'single': + loader = _preprocessing_single_in_memory(config, is_train, loader_mode) + elif model_type == 'multitask': + loader = _preprocessing_multitask_in_memory(config, is_train, loader_mode) + else: + raise NotImplementedError + else: + if model_type == 'single': + loader = _preprocessing_single_generator(config, is_train, loader_mode) + elif model_type == 'multitask': + loader = _preprocessing_multitask_generator(config, is_train, loader_mode) + else: + raise NotImplementedError + return loader + + +def postprocessing(model_mask, model_contour, config, suffix='', save_output=False, cache_output=False): + mask_resize = Step(name='mask_resize{}'.format(suffix), transformer=Resizer(), input_data=['input'], - input_steps=[unet_multitask], - adapter={'images': ([(unet_multitask.name, 'mask_prediction')]), + input_steps=[model_mask], + adapter={'images': ([(model_mask.name, 'mask_prediction')]), 'target_sizes': ([('input', 'target_sizes')]), }, cache_dirpath=config.env.cache_dirpath, - save_output=save_output) + save_output=save_output, + cache_output=cache_output) - contour_resize = Step(name='contour_resize', + contour_resize = Step(name='contour_resize{}'.format(suffix), transformer=Resizer(), input_data=['input'], - input_steps=[unet_multitask], - adapter={'images': ([(unet_multitask.name, 'contour_prediction')]), + input_steps=[model_contour], + adapter={'images': ([(model_contour.name, 'contour_prediction')]), 'target_sizes': ([('input', 'target_sizes')]), }, cache_dirpath=config.env.cache_dirpath, - 
save_output=save_output) + save_output=save_output, + cache_output=cache_output) - detached = Step(name='detached', - transformer=Postprocessor(), - input_steps=[mask_resize, contour_resize], - adapter={'images': ([(mask_resize.name, 'resized_images')]), - 'contours': ([(contour_resize.name, 'resized_images')]), - }, - cache_dirpath=config.env.cache_dirpath, - save_output=save_output) + morphological_postprocessing = Step(name='morphological_postprocessing{}'.format(suffix), + transformer=Postprocessor(), + input_steps=[mask_resize, contour_resize], + adapter={'images': ([(mask_resize.name, 'resized_images')]), + 'contours': ([(contour_resize.name, 'resized_images')]), + }, + cache_dirpath=config.env.cache_dirpath, + save_output=save_output, + cache_output=cache_output) - output = Step(name='output', - transformer=Dummy(), - input_steps=[detached], - adapter={'y_pred': ([(detached.name, 'labeled_images')]), - }, - cache_dirpath=config.env.cache_dirpath) - return output + return morphological_postprocessing -def preprocessing_train(config): - if config.execution.load_in_memory: - reader_train = Step(name='reader_train', - transformer=ImageReader(**config.reader_single), - input_data=['input'], - adapter={'meta': ([('input', 'meta')]), - 'train_mode': ([('input', 'train_mode')]), - }, - cache_dirpath=config.env.cache_dirpath) +def nuclei_labeler(postprocessed_mask, config, save_output=True): + labeler = Step(name='labeler', + transformer=NucleiLabeler(), + input_steps=[postprocessed_mask], + adapter={'images': ([(postprocessed_mask.name, 'binarized_images')]), + }, + cache_dirpath=config.env.cache_dirpath, + save_output=save_output) + return labeler - reader_inference = Step(name='reader_inference', - transformer=ImageReader(**config.reader_single), - input_data=['input'], - adapter={'meta': ([('input', 'meta_valid')]), - 'train_mode': ([('input', 'train_mode')]), - }, - cache_dirpath=config.env.cache_dirpath) - loader = Step(name='loader', - transformer=ImageSegmentationLoader(**config.loader), +def _preprocessing_single_in_memory(config, is_train, use_patching): + if use_patching: + raise NotImplementedError + else: + reader = Step(name='reader', + transformer=ImageReader(**config.reader_single), input_data=['input'], - input_steps=[reader_train, reader_inference], - adapter={'X': ([('reader_train', 'X')]), - 'y': ([('reader_train', 'y')]), + adapter={'meta': ([('input', 'meta')]), + 'meta_valid': ([('input', 'meta_valid')]), 'train_mode': ([('input', 'train_mode')]), - 'X_valid': ([('reader_inference', 'X')]), - 'y_valid': ([('reader_inference', 'y')]), }, cache_dirpath=config.env.cache_dirpath) - else: - xy_train = Step(name='xy_train', - transformer=XYSplit(**config.xy_splitter), - input_data=['input'], - adapter={'meta': ([('input', 'meta')]), - 'train_mode': ([('input', 'train_mode')]) - }, - cache_dirpath=config.env.cache_dirpath) - - xy_inference = Step(name='xy_inference', - transformer=XYSplit(**config.xy_splitter), - input_data=['input'], - adapter={'meta': ([('input', 'meta_valid')]), - 'train_mode': ([('input', 'train_mode')]) - }, - cache_dirpath=config.env.cache_dirpath) loader = Step(name='loader', - transformer=MetadataImageSegmentationLoader(**config.loader), + transformer=loaders.ImageSegmentationLoader(**config.loader), input_data=['input'], - input_steps=[xy_train, xy_inference], - adapter={'X': ([('xy_train', 'X')], squeeze_inputs), - 'y': ([('xy_train', 'y')], squeeze_inputs), + input_steps=[reader], + adapter={'X': ([('reader', 'X')]), + 'y': ([('reader', 'y')]), 
'train_mode': ([('input', 'train_mode')]), - 'X_valid': ([('xy_inference', 'X')], squeeze_inputs), - 'y_valid': ([('xy_inference', 'y')], squeeze_inputs), + 'X_valid': ([('reader', 'X_valid')]), + 'y_valid': ([('reader', 'y_valid')]), }, cache_dirpath=config.env.cache_dirpath) return loader -def preprocessing_inference(config): - if config.execution.load_in_memory: - reader_inference = Step(name='reader_inference', - transformer=ImageReader(**config.reader_single), - input_data=['input'], - adapter={'meta': ([('input', 'meta')]), - 'train_mode': ([('input', 'train_mode')]), - }, - cache_dirpath=config.env.cache_dirpath) - - loader = Step(name='loader', - transformer=ImageSegmentationLoader(**config.loader), - input_data=['input'], - input_steps=[reader_inference], - adapter={'X': ([('reader_inference', 'X')]), - 'y': ([('reader_inference', 'y')]), - 'train_mode': ([('input', 'train_mode')]), - }, - cache_dirpath=config.env.cache_dirpath) +def _preprocessing_single_generator(config, is_train, use_patching): + if use_patching: + raise NotImplementedError else: - xy_inference = Step(name='xy_inference', + if is_train: + xy_train = Step(name='xy_train', transformer=XYSplit(**config.xy_splitter), input_data=['input'], adapter={'meta': ([('input', 'meta')]), @@ -181,118 +506,128 @@ def preprocessing_inference(config): }, cache_dirpath=config.env.cache_dirpath) - loader = Step(name='loader', - transformer=MetadataImageSegmentationLoader(**config.loader), - input_data=['input'], - input_steps=[xy_inference, xy_inference], - adapter={'X': ([('xy_inference', 'X')], squeeze_inputs), - 'y': ([('xy_inference', 'y')], squeeze_inputs), - 'train_mode': ([('input', 'train_mode')]), - }, - cache_dirpath=config.env.cache_dirpath) - return loader - - -def preprocessing_multitask_train(config): - if config.execution.load_in_memory: - reader_train = Step(name='reader_train', - transformer=ImageReader(**config.reader_multitask), - input_data=['input'], - adapter={'meta': ([('input', 'meta')]), - 'train_mode': ([('input', 'train_mode')]), - }, - cache_dirpath=config.env.cache_dirpath, - save_output=True, load_saved_output=True) - - reader_inference = Step(name='reader_inference', - transformer=ImageReader(**config.reader_multitask), + xy_inference = Step(name='xy_inference', + transformer=XYSplit(**config.xy_splitter), input_data=['input'], adapter={'meta': ([('input', 'meta_valid')]), - 'train_mode': ([('input', 'train_mode')]), + 'train_mode': ([('input', 'train_mode')]) }, - cache_dirpath=config.env.cache_dirpath, - save_output=True, load_saved_output=True) + cache_dirpath=config.env.cache_dirpath) - loader = Step(name='loader', - transformer=ImageSegmentationMultitaskLoader(**config.loader), - input_data=['input'], - input_steps=[reader_train, reader_inference], - adapter={'X': ([('reader_train', 'X')]), - 'y': ([('reader_train', 'y')]), - 'train_mode': ([('input', 'train_mode')]), - 'X_valid': ([('reader_inference', 'X')]), - 'y_valid': ([('reader_inference', 'y')]), - }, - cache_dirpath=config.env.cache_dirpath) + loader = Step(name='loader', + transformer=loaders.MetadataImageSegmentationLoader(**config.loader), + input_data=['input'], + input_steps=[xy_train, xy_inference], + adapter={'X': ([('xy_train', 'X')], squeeze_inputs), + 'y': ([('xy_train', 'y')], squeeze_inputs), + 'train_mode': ([('input', 'train_mode')]), + 'X_valid': ([('xy_inference', 'X')], squeeze_inputs), + 'y_valid': ([('xy_inference', 'y')], squeeze_inputs), + }, + cache_dirpath=config.env.cache_dirpath) + else: + xy_inference = 
Step(name='xy_inference', + transformer=XYSplit(**config.xy_splitter), + input_data=['input'], + adapter={'meta': ([('input', 'meta')]), + 'train_mode': ([('input', 'train_mode')]) + }, + cache_dirpath=config.env.cache_dirpath) + + loader = Step(name='loader', + transformer=loaders.MetadataImageSegmentationLoader(**config.loader), + input_data=['input'], + input_steps=[xy_inference, xy_inference], + adapter={'X': ([('xy_inference', 'X')], squeeze_inputs), + 'y': ([('xy_inference', 'y')], squeeze_inputs), + 'train_mode': ([('input', 'train_mode')]), + }, + cache_dirpath=config.env.cache_dirpath) + return loader + + +def _preprocessing_multitask_in_memory(config, is_train, loader_mode): + if loader_mode == 'patching_train': + Loader = loaders.ImageSegmentationMultitaskLoaderPatchingTrain + elif loader_mode == 'patching_inference': + Loader = loaders.ImageSegmentationMultitaskLoaderPatchingInference else: - xy_train = Step(name='xy_train', - transformer=XYSplit(**config.xy_splitter_multitask), - input_data=['input'], - adapter={'meta': ([('input', 'meta')]), - 'train_mode': ([('input', 'train_mode')]) - }, - cache_dirpath=config.env.cache_dirpath) + Loader = loaders.ImageSegmentationMultitaskLoader + + reader = Step(name='reader', + transformer=ImageReader(**config.reader_multitask), + input_data=['input'], + adapter={'meta': ([('input', 'meta')]), + 'meta_valid': ([('input', 'meta_valid')]), + 'train_mode': ([('input', 'train_mode')]), + }, + cache_dirpath=config.env.cache_dirpath, + save_output=False, load_saved_output=False) + + loader = Step(name='loader', + transformer=Loader(**config.loader), + input_data=['input'], + input_steps=[reader], + adapter={'X': ([('reader', 'X')]), + 'y': ([('reader', 'y')]), + 'train_mode': ([('input', 'train_mode')]), + 'X_valid': ([('reader', 'X_valid')]), + 'y_valid': ([('reader', 'y_valid')]), + }, + cache_dirpath=config.env.cache_dirpath) + return loader + - xy_inference = Step(name='xy_inference', +def _preprocessing_multitask_generator(config, is_train, use_patching): + if use_patching: + raise NotImplementedError + else: + if is_train: + xy_train = Step(name='xy_train', transformer=XYSplit(**config.xy_splitter_multitask), input_data=['input'], - adapter={'meta': ([('input', 'meta_valid')]), + adapter={'meta': ([('input', 'meta')]), 'train_mode': ([('input', 'train_mode')]) }, cache_dirpath=config.env.cache_dirpath) - loader = Step(name='loader', - transformer=MetadataImageSegmentationMultitaskLoader(**config.loader), - input_data=['input'], - input_steps=[xy_train, xy_inference], - adapter={'X': ([('xy_train', 'X')], squeeze_inputs), - 'y': ([('xy_train', 'y')]), - 'train_mode': ([('input', 'train_mode')]), - 'X_valid': ([('xy_inference', 'X')], squeeze_inputs), - 'y_valid': ([('xy_inference', 'y')]), - }, - cache_dirpath=config.env.cache_dirpath) - - return loader - + xy_inference = Step(name='xy_inference', + transformer=XYSplit(**config.splitter_config), + input_data=['input'], + adapter={'meta': ([('input', 'meta_valid')]), + 'train_mode': ([('input', 'train_mode')]) + }, + cache_dirpath=config.env.cache_dirpath) -def preprocessing_multitask_inference(config): - if config.execution.load_in_memory: - reader_inference = Step(name='reader_inference', - transformer=ImageReader(**config.reader_multitask), + loader = Step(name='loader', + transformer=loaders.MetadataImageSegmentationMultitaskLoader(**config.loader), + input_data=['input'], + input_steps=[xy_train, xy_inference], + adapter={'X': ([('xy_train', 'X')], squeeze_inputs), + 'y': 
([('xy_train', 'y')]), + 'train_mode': ([('input', 'train_mode')]), + 'X_valid': ([('xy_inference', 'X')], squeeze_inputs), + 'y_valid': ([('xy_inference', 'y')]), + }, + cache_dirpath=config.env.cache_dirpath) + else: + xy_inference = Step(name='xy_inference', + transformer=XYSplit(**config.xy_splitter_multitask), input_data=['input'], adapter={'meta': ([('input', 'meta')]), - 'train_mode': ([('input', 'train_mode')]), + 'train_mode': ([('input', 'train_mode')]) }, cache_dirpath=config.env.cache_dirpath) - loader = Step(name='loader', - transformer=ImageSegmentationMultitaskLoader(**config.loader), - input_data=['input'], - input_steps=[reader_inference], - adapter={'X': ([('reader_inference', 'X')]), - 'y': ([('reader_inference', 'y')]), - 'train_mode': ([('input', 'train_mode')]), - }, - cache_dirpath=config.env.cache_dirpath) - else: - xy_inference = Step(name='xy_inference', - transformer=XYSplit(**config.xy_splitter), - input_data=['input'], - adapter={'meta': ([('input', 'meta')]), - 'train_mode': ([('input', 'train_mode')]) - }, - cache_dirpath=config.env.cache_dirpath) - - loader = Step(name='loader', - transformer=MetadataImageSegmentationMultitaskLoader(**config.loader), - input_data=['input'], - input_steps=[xy_inference, xy_inference], - adapter={'X': ([('xy_inference', 'X')], squeeze_inputs), - 'y': ([('xy_inference', 'y')], squeeze_inputs), - 'train_mode': ([('input', 'train_mode')]), - }, - cache_dirpath=config.env.cache_dirpath) + loader = Step(name='loader', + transformer=loaders.MetadataImageSegmentationMultitaskLoader(**config.loader), + input_data=['input'], + input_steps=[xy_inference, xy_inference], + adapter={'X': ([('xy_inference', 'X')], squeeze_inputs), + 'y': ([('xy_inference', 'y')], squeeze_inputs), + 'train_mode': ([('input', 'train_mode')]), + }, + cache_dirpath=config.env.cache_dirpath) return loader @@ -306,7 +641,6 @@ def mask_postprocessing(model, config, save_output=True): }, cache_dirpath=config.env.cache_dirpath, save_output=save_output) - mask_thresholding = Step(name='mask_thresholding', transformer=Thresholder(**config.thresholder), input_steps=[mask_resize], @@ -314,111 +648,42 @@ def mask_postprocessing(model, config, save_output=True): }, cache_dirpath=config.env.cache_dirpath, save_output=save_output) - return mask_thresholding -def contour_postprocessing(model, config, save_output=True): - contour_resize = Step(name='contour_resize', - transformer=Resizer(), - input_data=['input'], - input_steps=[model], - adapter={'images': ([(model.name, 'contour_prediction')]), - 'target_sizes': ([('input', 'target_sizes')]), - }, - cache_dirpath=config.env.cache_dirpath, - save_output=save_output) - - contour_thresholding = Step(name='contour_thresholding', - transformer=Thresholder(**config.thresholder), - input_steps=[contour_resize], - adapter={'images': ([('contour_resize', 'resized_images')]), - }, - cache_dirpath=config.env.cache_dirpath, - save_output=save_output) - return contour_thresholding - - -def center_postprocessing(model, config, save_output=True): - center_resize = Step(name='center_resize', - transformer=Resizer(), - input_data=['input'], - input_steps=[model], - adapter={'images': ([(model.name, 'center_prediction')]), - 'target_sizes': ([('input', 'target_sizes')]), - }, - cache_dirpath=config.env.cache_dirpath, - save_output=save_output) - - center_thresholding = Step(name='center_thresholding', - transformer=Thresholder(**config.thresholder), - input_steps=[center_resize], - adapter={'images': ([('center_resize', 'resized_images')]), 
- }, - cache_dirpath=config.env.cache_dirpath, - save_output=save_output) - return center_thresholding - - -def watershed_centers(mask, center, config, save_output=True): - watershed_center = Step(name='watershed_centers', - transformer=WatershedCenter(), - input_steps=[mask, center], - adapter={'images': ([(mask.name, 'binarized_images')]), - 'contours': ([(center.name, 'binarized_images')]), - }, - cache_dirpath=config.env.cache_dirpath, - save_output=save_output) - - drop_smaller = Step(name='drop_smaller', - transformer=Dropper(**config.dropper), - input_steps=[watershed_center], - adapter={'labels': ([('watershed_center', 'detached_images')]), +def postpro_dev(config): + reader_train = Step(name='reader', + transformer=ImageReader(**config.reader_multitask), + input_data=['input'], + adapter={'meta': ([('input', 'meta')]), + 'train_mode': ([('input', 'train_mode')]), }, - cache_dirpath=config.env.cache_dirpath, - save_output=save_output) - - binary_fill = Step(name='binary_fill', - transformer=BinaryFillHoles(), - input_steps=[drop_smaller], - adapter={'images': ([('drop_smaller', 'labels')]), - }, - cache_dirpath=config.env.cache_dirpath, - save_output=save_output) - - return binary_fill - - -def watershed_contours(mask, contour, config, save_output=True): - watershed_contour = Step(name='watershed_contour', - transformer=WatershedContour(), - input_steps=[mask, contour], - adapter={'images': ([(mask.name, 'binarized_images')]), - 'contours': ([(contour.name, 'binarized_images')]), - }, - cache_dirpath=config.env.cache_dirpath, - save_output=save_output) + cache_dirpath=config.env.cache_dirpath) + deconved_reader_train = add_stain_deconvolution(reader_train, config, cache_output=True, save_output=False, + suffix='') - drop_smaller = Step(name='drop_smaller', - transformer=Dropper(**config.dropper), - input_steps=[watershed_contour], - adapter={'labels': ([('watershed_contour', 'detached_images')]), - }, + reader = Step(name='reader_joined', + transformer=Dummy(), + input_steps=[deconved_reader_train], + adapter={'X': ([(deconved_reader_train.name, 'X')]), + 'y': ([(deconved_reader_train.name, 'y')]), + }, + cache_dirpath=config.env.cache_dirpath, + save_output=True, load_saved_output=False) - cache_dirpath=config.env.cache_dirpath, - save_output=save_output) - return drop_smaller + unet_multitask = unet_multitask_block(reader, config, config.unet_size_estimator, + loader_mode=None, suffix='_size_estimator', train_mode=False) + morphological_postprocessing = postprocessing(unet_multitask, unet_multitask, config, + suffix='', save_output=True) -def nuclei_labeler(postprocessed_mask, config, save_output=True): - labeler = Step(name='labeler', - transformer=NucleiLabeler(), - input_steps=[postprocessed_mask], - adapter={'images': ([(postprocessed_mask.name, 'binarized_images')]), - }, - cache_dirpath=config.env.cache_dirpath, - save_output=save_output) - return labeler + output = Step(name='output', + transformer=Dummy(), + input_steps=[morphological_postprocessing], + adapter={'y_pred': ([(morphological_postprocessing.name, 'labeled_images')]), + }, + cache_dirpath=config.env.cache_dirpath) + return output PIPELINES = {'unet': {'train': partial(unet, train_mode=True), @@ -426,5 +691,11 @@ def nuclei_labeler(postprocessed_mask, config, save_output=True): }, 'unet_multitask': {'train': partial(unet_multitask, train_mode=True), 'inference': partial(unet_multitask, train_mode=False), - } + }, + + 'patched_unet_training': {'train': patched_unet_training}, + 'postpro_dev':{'inference': 
postpro_dev}, + 'scale_adjusted_patched_unet_training': {'train': scale_adjusted_patched_unet_training}, + 'scale_adjusted_patched_unet': {'train': scale_adjusted_patched_unet, + 'inference': scale_adjusted_patched_unet} } diff --git a/postprocessing.py b/postprocessing.py index 02d2b31..cefc23d 100644 --- a/postprocessing.py +++ b/postprocessing.py @@ -2,8 +2,8 @@ import skimage.morphology as morph from scipy import ndimage as ndi from scipy.stats import itemfreq -from skimage.transform import resize from skimage.filters import threshold_otsu +from skimage.transform import resize from sklearn.externals import joblib from tqdm import tqdm @@ -19,12 +19,6 @@ def transform(self, images, target_sizes): resized_images.append(resized_image) return {'resized_images': resized_images} - def load(self, filepath): - return self - - def save(self, filepath): - joblib.dump({}, filepath) - class Thresholder(BaseTransformer): def __init__(self, threshold): @@ -37,12 +31,6 @@ def transform(self, images): binarized_images.append(binarized_image) return {'binarized_images': binarized_images} - def load(self, filepath): - return self - - def save(self, filepath): - joblib.dump({}, filepath) - class WatershedCenter(BaseTransformer): def transform(self, images, centers): @@ -52,12 +40,6 @@ def transform(self, images, centers): detached_images.append(detached_image) return {'detached_images': detached_images} - def load(self, filepath): - return self - - def save(self, filepath): - joblib.dump({}, filepath) - class WatershedContour(BaseTransformer): def transform(self, images, contours): @@ -67,12 +49,6 @@ def transform(self, images, contours): detached_images.append(detached_image) return {'detached_images': detached_images} - def load(self, filepath): - return self - - def save(self, filepath): - joblib.dump({}, filepath) - class BinaryFillHoles(BaseTransformer): def transform(self, images): @@ -82,12 +58,6 @@ def transform(self, images): filled_images.append(filled_image) return {'filled_images': filled_images} - def load(self, filepath): - return self - - def save(self, filepath): - joblib.dump({}, filepath) - class Dropper(BaseTransformer): def __init__(self, min_size): @@ -101,12 +71,6 @@ def transform(self, labels): return {'labels': labeled_images} - def load(self, filepath): - return self - - def save(self, filepath): - joblib.dump({}, filepath) - class NucleiLabeler(BaseTransformer): def transform(self, images): @@ -117,12 +81,6 @@ def transform(self, images): return {'labeled_images': labeled_images} - def load(self, filepath): - return self - - def save(self, filepath): - joblib.dump({}, filepath) - class Postprocessor(BaseTransformer): def __init__(self, **kwargs): @@ -135,11 +93,17 @@ def transform(self, images, contours): labeled_images.append(labeled_image) return {'labeled_images': labeled_images} - def load(self, filepath): - return self - def save(self, filepath): - joblib.dump({}, filepath) +class CellSizer(BaseTransformer): + def __init__(self, **kwargs): + pass + + def transform(self, labeled_images): + mean_sizes = [] + for image in tqdm(labeled_images): + mean_size = mean_cell_size(image) + mean_sizes.append(mean_size) + return {'sizes': mean_sizes} def watershed_center(image, center): @@ -167,7 +131,7 @@ def watershed_contour(image, contour): def postprocess(image, contour): - cleaned_mask = clean_mask(image, contour) + cleaned_mask = get_clean_mask(image, contour) good_markers = get_markers(cleaned_mask, contour) good_distance = get_distance(cleaned_mask) @@ -175,14 +139,15 @@ def 
postprocess(image, contour): labels = add_dropped_water_blobs(labels, cleaned_mask) - m_thresh = threshold_otsu(image) - initial_mask_binary = (image > m_thresh).astype(np.uint8) - labels = drop_artifacts_per_label(labels, initial_mask_binary) + min_joinable_size = min_blob_size(image > 0.5, percentile=50, fraction_of_percentile=0.2) + labels = connect_small(labels, min_cell_size=min_joinable_size) + + min_acceptable_size = min_blob_size(image > 0.5, percentile=50, fraction_of_percentile=0.1) + labels = drop_small(labels, min_size=min_acceptable_size) - labels = drop_small(labels, min_size=20) - labels = fill_holes_per_blob(labels) + labels = drop_big_artifacts(labels, scale=0.01) - return labels + return relabel(labels) def drop_artifacts_per_label(labels, initial_mask): @@ -195,59 +160,64 @@ def drop_artifacts_per_label(labels, initial_mask): return labels_cleaned -def clean_mask(m, c): - # threshold - m_thresh = threshold_otsu(m) - c_thresh = threshold_otsu(c) - m_b = m > m_thresh - c_b = c > c_thresh - - # combine contours and masks and fill the cells +def get_clean_mask(m, c): + m_b = m > 0.5 + c_b = c > 0.5 m_ = np.where(m_b | c_b, 1, 0) - m_ = ndi.binary_fill_holes(m_) - - # close what wasn't closed before - area, radius = mean_blob_size(m_b) - struct_size = int(1.25 * radius) - struct_el = morph.disk(struct_size) - m_padded = pad_mask(m_, pad=struct_size) - m_padded = morph.binary_closing(m_padded, selem=struct_el) - m_ = crop_mask(m_padded, crop=struct_size) - # open to cut the real cells from the artifacts - area, radius = mean_blob_size(m_b) - struct_size = int(0.75 * radius) - struct_el = morph.disk(struct_size) - m_ = np.where(c_b & (~m_b), 0, m_) - m_padded = pad_mask(m_, pad=struct_size) - m_padded = morph.binary_opening(m_padded, selem=struct_el) - m_ = crop_mask(m_padded, crop=struct_size) + clean_mask = np.zeros_like(m) + labels, label_nr = ndi.label(m_) + for label in range(1, label_nr + 1): + mask_component = np.where(labels == label, m_b, 0) + contour_component = np.where(labels == label, c_b, 0) - # join the connected cells with what we had at the beginning - m_ = np.where(m_b | m_, 1, 0) - m_ = ndi.binary_fill_holes(m_) + component_radius = np.sqrt(mask_component.sum()) + struct_size = int(max(0.05 * component_radius, 5)) + struct_el = morph.disk(struct_size) + m_padded = pad_mask(mask_component, pad=struct_size) + m_padded = morph.binary_closing(m_padded, selem=struct_el) + m_padded = morph.binary_opening(m_padded, selem=struct_el) + mask_component_ = crop_mask(m_padded, crop=struct_size) + mask_component_ = ndi.binary_fill_holes(mask_component_) - # drop all the cells that weren't present at least in 25% of area in the initial mask - m_ = drop_artifacts(m_, m_b, min_coverage=0.25) + mask_component_ = np.where(mask_component_ | mask_component | contour_component, 1, 0) + clean_mask += mask_component_ - return m_ + clean_mask = np.where(clean_mask, 1, 0) + return clean_mask def get_markers(m_b, c): - # threshold - c_thresh = threshold_otsu(c) - c_b = c > c_thresh + c_b = c > 0.75 + marker_component = np.where(m_b & ~c_b, 1, 0) + min_size = min_blob_size(m_b, percentile=50, fraction_of_percentile=0.2) + labels, label_nr = ndi.label(marker_component) + markers = np.zeros_like(marker_component) + for label in range(1, label_nr + 1): + mask_component = np.where(labels == label, 1, 0) + + if mask_component.sum() < min_size: + continue + + mask_component = ndi.binary_fill_holes(mask_component) + + if mask_component.sum() < min_size * 3: + markers += 
np.where(mask_component, 1, 0) + continue + + component_radius = np.sqrt(mask_component.sum()) + struct_size = int(component_radius * 0.15) + struct_el = morph.disk(struct_size) + m_padded = pad_mask(mask_component, pad=struct_size) + m_padded = morph.binary_erosion(m_padded, selem=struct_el) + mask_component = crop_mask(m_padded, crop=struct_size) - mk_ = np.where(c_b, 0, m_b) + mask_component_labels, _ = ndi.label(mask_component) + mask_component = drop_small(mask_component_labels, min_size) + markers += np.where(mask_component, 1, 0) - area, radius = mean_blob_size(m_b) - struct_size = int(0.25 * radius) - struct_el = morph.disk(struct_size) - m_padded = pad_mask(mk_, pad=struct_size) - m_padded = morph.erosion(m_padded, selem=struct_el) - mk_ = crop_mask(m_padded, crop=struct_size) - mk_, _ = ndi.label(mk_) - return mk_ + markers, _ = ndi.label(markers) + return markers def get_distance(m_b): @@ -295,8 +265,10 @@ def mean_blob_size(mask): mean_area = 1 mean_radius = 1 else: - mean_area = int(itemfreq(labels)[1:, 1].mean()) - mean_radius = int(np.round(np.sqrt(mean_area / np.pi))) + blob_sizes = itemfreq(labels) + blob_sizes = blob_sizes[blob_sizes[:, 0].argsort()][1:, :] + mean_area = int(blob_sizes.mean()) + mean_radius = int(np.round(np.sqrt(mean_area) / np.pi)) return mean_area, mean_radius @@ -308,9 +280,9 @@ def pad_mask(mask, pad): w_pad = w + 2 * pad mask_padded = np.zeros((h_pad, w_pad)) mask_padded[pad:pad + h, pad:pad + w] = mask - mask_padded[pad - 1, :] = 1 + mask_padded[pad, :] = 1 mask_padded[pad + h + 1, :] = 1 - mask_padded[:, pad - 1] = 1 + mask_padded[:, pad] = 1 mask_padded[:, pad + w + 1] = 1 return mask_padded @@ -332,3 +304,98 @@ def drop_small(img, min_size): def label(mask): labeled, nr_true = ndi.label(mask) return labeled + + +def min_blob_size(mask, percentile=25, fraction_of_percentile=0.1): + labels, labels_nr = ndi.label(mask) + if labels_nr < 2: + return 0 + else: + blob_sizes = itemfreq(labels) + blob_sizes = blob_sizes[blob_sizes[:, 0].argsort()][1:, 1] + return fraction_of_percentile * np.percentile(blob_sizes, percentile) + + +def mean_cell_size(labeled_image): + blob_sizes = itemfreq(labeled_image) + if blob_sizes.shape[0] == 1: + return 0 + else: + blob_sizes = blob_sizes[blob_sizes[:, 0].argsort()][1:, 1] + return np.mean(blob_sizes) + + +def find_touching_labels(labels, label_id): + mask = np.where(labels == label_id, 0, 1) + dist = ndi.distance_transform_edt(mask) + neighbour_labels = np.unique(np.where(dist == 1.0, labels, 0)).tolist() + neighbour_labels.remove(0) + + neighbour_labels_with_sizes = [(neighbor_label, np.where(labels == neighbor_label, 1, 0).sum()) + for neighbor_label in neighbour_labels] + neighbour_labels_with_sizes = sorted(neighbour_labels_with_sizes, + key=lambda x: x[1], + reverse=False) + neighbour_labels_sorted = [neighbor_label for neighbor_label, _ in neighbour_labels_with_sizes] + neighbour_labels_sorted + return neighbour_labels_sorted + + +def connect_small(labels, min_cell_size=None): + labels_with_sizes = [(label_id, np.where(labels == label_id, 1, 0).sum()) + for label_id in range(1, labels.max() + 1)] + label_ids_sorted_by_size = [lws[0] for lws in sorted(labels_with_sizes, + key=lambda x: x[1], + reverse=False)] + touching_cell_was_connected = False + for label_id in label_ids_sorted_by_size: + cell_size = np.sum(labels == label_id) + touching_labels = find_touching_labels(labels, label_id) + for touching_label in touching_labels: + touching_cell_mask = np.where(labels == touching_label, 1, 0) + 
touching_cell_size = np.sum(touching_cell_mask) + if touching_cell_size < min_cell_size: + labels = np.where(labels == touching_label, label_id, labels) + touching_cell_was_connected = True + labels = relabel(labels) + if touching_cell_was_connected: + labels = connect_small(labels, min_cell_size) + return relabel(labels) + + +def is_slim(im, object_ar, area_ar): + ind = np.where(im == 1) + ydiff = np.max(ind[0]) - np.min(ind[0]) + xdiff = np.max(ind[1]) - np.min(ind[1]) + rec_area = xdiff * ydiff + area = np.sum(im == 1) + if xdiff / ydiff < object_ar and xdiff / ydiff > 1.0 / object_ar and area / rec_area > area_ar: + return False + return True + + +def touching_edges(im, margin): + indices = np.where(im == 1) + edges = [] + edges.append(np.sum(indices[0] <= margin)) + edges.append(np.sum(indices[1] <= margin)) + edges.append(np.sum(indices[0] >= im.shape[0] - 1 - margin)) + edges.append(np.sum(indices[1] >= im.shape[1] - 1 - margin)) + return np.sum(np.array(edges) > 0) + + +def drop_big_artifacts(im, scale): + im_cleaned = np.copy(im) + im_size = im.shape[0] * im.shape[1] + for label in np.unique(im): + if label == 0: + continue + size = np.sum(im == label) + if size < scale * im_size: + continue + if not is_slim(im == label, 2, 0.5): + continue + if touching_edges(im=im == label, margin=2) < 2: + continue + im_cleaned[im_cleaned == label] = 0 + return im_cleaned diff --git a/preparation.py b/preparation.py index 867f881..d9c8c53 100644 --- a/preparation.py +++ b/preparation.py @@ -3,24 +3,39 @@ import cv2 import numpy as np +import pandas as pd import scipy.ndimage as ndi import torch from PIL import Image from imageio import imwrite from skimage.transform import resize from sklearn.cluster import KMeans +from sklearn.model_selection import train_test_split from torchvision import models from tqdm import tqdm -def train_valid_split(meta, validation_size, valid_category_ids=None): +def train_valid_split(meta, validation_size, valid_category_ids=None, simple_split=False): meta_train = meta[meta['is_train'] == 1] - meta_train_split, meta_valid_split = split_on_column(meta_train, - column='vgg_features_clusters', - test_size=validation_size, - random_state=1234, - valid_category_ids=valid_category_ids - ) + + if simple_split: + meta_train_splittable = meta_train[meta_train['is_external'] == 0] + external_data = meta_train[meta_train['is_external'] == 1] + meta_train_split, meta_valid_split = train_test_split(meta_train_splittable, + test_size=validation_size, + random_state=1234) + meta_train_split = pd.concat([meta_train_split, external_data], axis=0).sample(frac=1, random_state=1234) + else: + meta_train_splittable = meta_train[meta_train['vgg_features_clusters'] != -1] + external_data = meta_train[meta_train['vgg_features_clusters'] == -1] + + meta_train_split, meta_valid_split = split_on_column(meta_train_splittable, + column='vgg_features_clusters', + test_size=validation_size, + random_state=1234, + valid_category_ids=valid_category_ids + ) + meta_train_split = pd.concat([meta_train_split, external_data], axis=0).sample(frac=1, random_state=1234) return meta_train_split, meta_valid_split @@ -28,8 +43,7 @@ def split_on_column(meta, column, test_size, random_state=1, valid_category_ids= if valid_category_ids is None: categories = meta[column].unique() np.random.seed(random_state) - valid_category_ids = np.random.choice(categories, - int(test_size * len(categories))) + valid_category_ids = np.random.choice(categories, int(test_size * len(categories))) valid = 
meta[meta[column].isin(valid_category_ids)].sample(frac=1, random_state=random_state) train = meta[~(meta[column].isin(valid_category_ids))].sample(frac=1, random_state=random_state) return train, valid @@ -38,29 +52,16 @@ def split_on_column(meta, column, test_size, random_state=1, valid_category_ids= def overlay_masks(images_dir, subdir_name, target_dir): train_dir = os.path.join(images_dir, subdir_name) for mask_dirname in tqdm(glob.glob('{}/*/masks'.format(train_dir))): - masks = [] - for image_filepath in glob.glob('{}/*'.format(mask_dirname)): - image = np.asarray(Image.open(image_filepath)) - image = image / 255.0 - masks.append(image) - overlayed_masks = np.sum(masks, axis=0) + overlayed_masks = overlay_masks_from_dir(mask_dirname) target_filepath = '/'.join(mask_dirname.replace(images_dir, target_dir).split('/')[:-1]) + '.png' os.makedirs(os.path.dirname(target_filepath), exist_ok=True) imwrite(target_filepath, overlayed_masks) -def overlay_contours(images_dir, subdir_name, target_dir, touching_only=False): +def overlay_contours(images_dir, subdir_name, target_dir): train_dir = os.path.join(images_dir, subdir_name) for mask_dirname in tqdm(glob.glob('{}/*/masks'.format(train_dir))): - masks = [] - for image_filepath in glob.glob('{}/*'.format(mask_dirname)): - image = np.asarray(Image.open(image_filepath)) - image = image / 255.0 - masks.append(get_contour(image)) - if touching_only: - overlayed_masks = np.where(np.sum(masks, axis=0) > 128. + 255., 255., 0.).astype(np.uint8) - else: - overlayed_masks = np.where(np.sum(masks, axis=0) > 128., 255., 0.).astype(np.uint8) + overlayed_masks = overlay_contours_from_dir(mask_dirname) target_filepath = '/'.join(mask_dirname.replace(images_dir, target_dir).split('/')[:-1]) + '.png' os.makedirs(os.path.dirname(target_filepath), exist_ok=True) imwrite(target_filepath, overlayed_masks) @@ -69,29 +70,59 @@ def overlay_contours(images_dir, subdir_name, target_dir, touching_only=False): def overlay_centers(images_dir, subdir_name, target_dir): train_dir = os.path.join(images_dir, subdir_name) for mask_dirname in tqdm(glob.glob('{}/*/masks'.format(train_dir))): - masks = [] - for image_filepath in glob.glob('{}/*'.format(mask_dirname)): - image = np.asarray(Image.open(image_filepath)) - image = image / 255.0 - masks.append(get_center(image)) - overlayed_masks = np.where(np.sum(masks, axis=0) > 128., 255., 0.).astype(np.uint8) + overlayed_masks = overlay_centers_from_dir(mask_dirname) target_filepath = '/'.join(mask_dirname.replace(images_dir, target_dir).split('/')[:-1]) + '.png' os.makedirs(os.path.dirname(target_filepath), exist_ok=True) imwrite(target_filepath, overlayed_masks) +def overlay_contours_from_dir(mask_dirname): + masks = [] + for image_filepath in glob.glob('{}/*'.format(mask_dirname)): + image = np.asarray(Image.open(image_filepath)) + image = ndi.binary_fill_holes(image) + contour = get_contour(image) + inside_contour = np.where(image & contour, 255, 0) + masks.append(inside_contour) + overlayed_masks = np.where(np.sum(masks, axis=0) > 128., 255., 0.).astype(np.uint8) + return overlayed_masks + + +def overlay_masks_from_dir(mask_dirname): + masks = [] + for image_filepath in glob.glob('{}/*'.format(mask_dirname)): + image = np.asarray(Image.open(image_filepath)) + image = ndi.binary_fill_holes(image) * 255. 
+ masks.append(image) + overlayed_masks = np.where(np.sum(masks, axis=0) > 128., 255., 0.).astype(np.uint8) + return overlayed_masks + + +def overlay_centers_from_dir(mask_dirname): + masks = [] + for image_filepath in glob.glob('{}/*'.format(mask_dirname)): + image = np.asarray(Image.open(image_filepath)) + image = ndi.binary_fill_holes(image) + masks.append(get_center(image)) + overlayed_masks = np.where(np.sum(masks, axis=0) > 128., 255., 0.).astype(np.uint8) + return overlayed_masks + + def get_contour(img): img_contour = np.zeros_like(img).astype(np.uint8) _, contours, hierarchy = cv2.findContours(img.astype(np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_NONE) - cv2.drawContours(img_contour, contours, -1, (255, 255, 255), 4) + cv2.drawContours(img_contour, contours, -1, (255, 255, 255), 8) return img_contour def get_center(img): - img_center = np.zeros_like(img).astype(np.uint8) - y, x = ndi.measurements.center_of_mass(img) - cv2.circle(img_center, (int(x), int(y)), 4, (255, 255, 255), -1) - return img_center + if img.max() == 0: + return img + else: + img_center = np.zeros_like(img).astype(np.uint8) + y, x = ndi.measurements.center_of_mass(img) + cv2.circle(img_center, (int(x), int(y)), 4, (255, 255, 255), -1) + return img_center def get_vgg_clusters(meta): @@ -138,4 +169,4 @@ def cluster_features(features, n_clusters=10): kmeans = KMeans(n_clusters=n_clusters, random_state=1111) kmeans.fit(features) labels = kmeans.labels_ - return labels + return labels \ No newline at end of file diff --git a/preprocessing.py b/preprocessing.py new file mode 100644 index 0000000..3d20314 --- /dev/null +++ b/preprocessing.py @@ -0,0 +1,196 @@ +import glob + +import numpy as np +import scipy.ndimage as ndi +from PIL import Image +from skimage.color import rgb2grey, rgb2hed +from skimage.exposure import rescale_intensity +from skimage.transform import resize +from sklearn.externals import joblib +from tqdm import tqdm + +from preparation import get_contour +from steps.base import BaseTransformer +from utils import from_pil, to_pil, clip + + +class ImageReader(BaseTransformer): + def __init__(self, x_columns, y_columns): + self.x_columns = x_columns + self.y_columns = y_columns + + def transform(self, meta, train_mode, meta_valid=None): + X, y = self._transform(meta, train_mode) + if meta_valid is not None: + X_valid, y_valid = self._transform(meta_valid, train_mode) + else: + X_valid, y_valid = None, None + + return {'X': X, + 'y': y, + 'X_valid': X_valid, + 'y_valid': y_valid} + + def _transform(self, meta, train_mode): + X_ = meta[self.x_columns].values + + X = self.load_images(X_, grayscale=False) + if train_mode: + y_ = meta[self.y_columns].values + y = self.load_images(y_, grayscale=True) + else: + y = None + + return X, y + + def load_images(self, image_filepaths, grayscale): + X = [] + for i in range(image_filepaths.shape[1]): + column = image_filepaths[:, i] + X.append([]) + for img_filepath in tqdm(column): + img = self.load_image(img_filepath, grayscale=grayscale) + X[i].append(img) + return X + + def load_image(self, img_filepath, grayscale): + image = Image.open(img_filepath, 'r') + if not grayscale: + image = image.convert('RGB') + else: + image = image.convert('L') + return image + + def load(self, filepath): + params = joblib.load(filepath) + self.columns_to_get = params['x_columns'] + self.target_columns = params['y_columns'] + return self + + def save(self, filepath): + params = {'x_columns': self.x_columns, + 'y_columns': self.y_columns + } + joblib.dump(params, filepath) + + +class 
ImageReaderRescaler(BaseTransformer): + def __init__(self, min_size, max_size, target_ratio): + self.min_size = min_size + self.max_size = max_size + self.target_ratio = target_ratio + + def transform(self, sizes, X, y=None, meta=None): + X, y = self._transform(sizes, X, y, meta) + + return {'X': X, + 'y': y + } + + def load(self, filepath): + return self + + def save(self, filepath): + params = {} + joblib.dump(params, filepath) + + def _transform(self, sizes, X, y=None, meta=None): + raw_images = X[0] + raw_images_adj = [] + for size, raw_image in tqdm(zip(sizes, raw_images)): + h_adj, w_adj = self._get_adjusted_image_size(size, from_pil(raw_image)) + raw_image_adj = resize(from_pil(raw_image), (h_adj, w_adj), preserve_range=True).astype(np.uint8) + raw_images_adj.append(to_pil(raw_image_adj)) + X_adj = [raw_images_adj] + + if y is not None and meta is not None: + masks, contours, centers = y + mask_dirnames = meta['file_path_masks'].tolist() + + masks_adj, contours_adj, centers_adj = [], [], [] + for size, mask, contour, center, mask_dirname in tqdm(zip(sizes, masks, contours, centers, mask_dirnames)): + h_adj, w_adj = self._get_adjusted_image_size(size, from_pil(mask)) + + mask_adj = resize(from_pil(mask), (h_adj, w_adj), preserve_range=True).astype(np.uint8) + center_adj = resize(from_pil(center), (h_adj, w_adj), preserve_range=True).astype(np.uint8) + contour_adj = self._get_contour(mask_dirname, (h_adj, w_adj)) + + masks_adj.append(to_pil(mask_adj)) + contours_adj.append(to_pil(contour_adj)) + centers_adj.append(to_pil(center_adj)) + + y_adj = [masks_adj, contours_adj, centers_adj] + else: + y_adj = None + return X_adj, y_adj + + def _get_adjusted_image_size(self, mean_cell_size, img): + h, w = img.shape[:2] + img_area = h * w + + if mean_cell_size ==0: + adj_ratio = 1.0 + else: + size_ratio = img_area / mean_cell_size + adj_ratio = size_ratio / self.target_ratio + + h_adj = int(clip(self.min_size, h * adj_ratio, self.max_size)) + w_adj = int(clip(self.min_size, w * adj_ratio, self.max_size)) + + return h_adj, w_adj + + def _get_contour(self, mask_dirname, shape_adjusted): + h_adj, w_adj = shape_adjusted + overlayed_masks = np.zeros((h_adj, w_adj)).astype(np.uint8) + for image_filepath in tqdm(glob.glob('{}/*'.format(mask_dirname))): + image = np.asarray(Image.open(image_filepath)) + image = ndi.binary_fill_holes(image) + image = resize(image, (h_adj, w_adj), preserve_range=True).astype(np.uint8) + contour = get_contour(image) + inside_contour = np.where(image & contour, 255, 0).astype(np.uint8) + overlayed_masks += inside_contour + overlayed_masks = np.where(overlayed_masks > 0, 255., 0.).astype(np.uint8) + return overlayed_masks + + +class StainDeconvolution(BaseTransformer): + def __init__(self, mode): + self.mode = mode + + def transform(self, X): + X_deconvoled = [] + for x in X[0]: + x = from_pil(x) + if is_stained(x): + x_deconv = (stain_deconvolve(x, mode=self.mode) * 255).astype(np.uint8) + else: + x_deconv = (rgb2grey(x) * 255).astype(np.uint8) + x_deconv = to_pil(x_deconv) + X_deconvoled.append(x_deconv) + return {'X': [X_deconvoled]} + + +def is_stained(img): + red_mean, green_mean, blue_mean = img.mean(axis=(0, 1)) + if red_mean == green_mean == blue_mean: + return False + else: + return True + + +def stain_deconvolve(img, mode='hematoxylin_eosin_sum'): + img_hed = rgb2hed(img) + if mode == 'hematoxylin_eosin_sum': + h, w = img.shape[:2] + img_hed = rgb2hed(img) + img_he_sum = np.zeros((h, w, 2)) + img_he_sum[:, :, 0] = rescale_intensity(img_hed[:, :, 0], out_range=(0, 
1)) + img_he_sum[:, :, 1] = rescale_intensity(img_hed[:, :, 1], out_range=(0, 1)) + img_deconv = rescale_intensity(img_he_sum.sum(axis=2), out_range=(0, 1)) + elif mode == 'hematoxylin': + img_deconv = img_hed[:, :, 0] + elif mode == 'eosin': + img_deconv = img_hed[:, :, 1] + else: + raise NotImplementedError('only hematoxylin_eosin_sum, hematoxylin, eosin modes are supported') + return img_deconv diff --git a/run_end_to_end.sh b/run_end_to_end.sh new file mode 100644 index 0000000..c799add --- /dev/null +++ b/run_end_to_end.sh @@ -0,0 +1,30 @@ +# Prepare metadata +neptune run --config configs_end_to_end/neptune_size_estimator.yaml \ +-- prepare_metadata --train_data --test_data +neptune run --config configs_end_to_end/neptune_size_estimator.yaml \ +-- prepare_masks + +# Train size estimator unet +neptune run --config configs_end_to_end/neptune_size_estimator.yaml \ +-- train_pipeline --pipeline_name patched_unet_training --simple_cv + +#Copy trained transformer from one pipeline to the other +mkdir /mnt/ml-team/dsb_2018/kuba/end_to_end_pipelines/unet_rescaled_patched/transformers +cp /mnt/ml-team/dsb_2018/kuba/end_to_end_pipelines/unet_multitask_size_estimator/transformers/unet_size_estimator \ +/mnt/ml-team/dsb_2018/kuba/end_to_end_pipelines/unet_rescaled_patched/transformers/unet_size_estimator + +# Fit the rescaled unet +neptune run --config configs_end_to_end/neptune_rescaled_patched.yaml \ +-- train_pipeline --pipeline_name scale_adjusted_patched_unet_training --simple_cv + +# Fit the missing transformers (those that are not trainable) +neptune run --config configs_end_to_end/neptune_rescaled_patched.yaml \ +-- train_pipeline --pipeline_name scale_adjusted_patched_unet --simple_cv --dev_mode + +# Evaluate pipeline +neptune run --config configs_end_to_end/neptune_rescaled_patched.yaml \ +-- evaluate_pipeline --pipeline_name scale_adjusted_patched_unet --simple_cv + +# Predict on test set in chunks +neptune run --config configs_end_to_end/neptune_rescaled_patched.yaml \ +-- predict_pipeline --pipeline_name scale_adjusted_patched_unet --chunk_size 50 diff --git a/steps/base.py b/steps/base.py index 5119030..51ac503 100644 --- a/steps/base.py +++ b/steps/base.py @@ -1,5 +1,4 @@ import os -import shutil import pprint import numpy as np @@ -14,7 +13,7 @@ class Step: def __init__(self, name, transformer, input_steps=[], input_data=[], adapter=None, - cache_dirpath=None, cache_output=True, save_output=False, load_saved_output=False, + cache_dirpath=None, cache_output=False, save_output=False, load_saved_output=False, save_graph=False, force_fitting=False): self.name = name self.transformer = transformer @@ -245,7 +244,7 @@ def load(self, filepath): return self def save(self, filepath): - pass + joblib.dump({}, filepath) class MockTransformer(BaseTransformer): @@ -277,6 +276,10 @@ def save(self, filepath): joblib.dump({}, filepath) +def to_dict_inputs(inputs, keys): + return {key: input for key, input in zip(keys, inputs)} + + def to_tuple_inputs(inputs): return tuple(inputs) diff --git a/steps/preprocessing.py b/steps/preprocessing.py index 6b8e738..e4be754 100644 --- a/steps/preprocessing.py +++ b/steps/preprocessing.py @@ -1,7 +1,3 @@ -import os - -from tqdm import tqdm -from PIL import Image from sklearn.externals import joblib from sklearn.feature_extraction import text @@ -36,57 +32,6 @@ def save(self, filepath): joblib.dump(params, filepath) -class ImageReader(BaseTransformer): - def __init__(self, x_columns, y_columns, target_shape): - self.x_columns = x_columns - self.y_columns = 
y_columns - self.target_shape = target_shape - - def transform(self, meta, train_mode): - X_ = meta[self.x_columns].values - - X = self.load_images(X_, grayscale=False) - if train_mode: - y_ = meta[self.y_columns].values - y = self.load_images(y_, grayscale=True) - else: - y = None - - return {'X': X, - 'y': y} - - def load_images(self, image_filepaths, grayscale): - X = [] - for i in range(image_filepaths.shape[1]): - column = image_filepaths[:, i] - X.append([]) - for img_filepath in tqdm(column): - img = self.load_image(img_filepath, grayscale=grayscale) - X[i].append(img) - return X - - def load_image(self, img_filepath, grayscale): - image = Image.open(img_filepath, 'r') - if not grayscale: - image = image.convert('RGB') - else: - image = image.convert('L') - image = image.resize(self.target_shape) - return image - - def load(self, filepath): - params = joblib.load(filepath) - self.columns_to_get = params['x_columns'] - self.target_columns = params['y_columns'] - return self - - def save(self, filepath): - params = {'x_columns': self.x_columns, - 'y_columns': self.y_columns - } - joblib.dump(params, filepath) - - class TfidfVectorizer(BaseTransformer): def __init__(self, **kwargs): self.vectorizer = text.TfidfVectorizer(**kwargs) diff --git a/steps/pytorch/callbacks.py b/steps/pytorch/callbacks.py index d58ebf0..7e8b5a7 100644 --- a/steps/pytorch/callbacks.py +++ b/steps/pytorch/callbacks.py @@ -131,6 +131,7 @@ def on_batch_end(self, metrics, *args, **kwargs): self.epoch_loss_averagers[name].send(loss) else: self.epoch_loss_averagers[name] = Averager() + self.epoch_loss_averagers[name].send(loss) if self.batch_every and ((self.batch_id % self.batch_every) == 0): logger.info('epoch {0} batch {1} {2}: {3:.5f}'.format(self.epoch_id, self.batch_id, name, loss)) @@ -290,6 +291,7 @@ def on_batch_end(self, metrics, *args, **kwargs): self.epoch_loss_averagers[name].send(loss) else: self.epoch_loss_averagers[name] = Averager() + self.epoch_loss_averagers[name].send(loss) self.ctx.channel_send('{} batch {} loss'.format(self.model_name, name), x=self.batch_id, y=loss) diff --git a/steps/pytorch/loaders.py b/steps/pytorch/loaders.py index a48d1f5..efc2a3e 100644 --- a/steps/pytorch/loaders.py +++ b/steps/pytorch/loaders.py @@ -24,7 +24,7 @@ def __init__(self, X, y, image_transform, target_transform, image_augment): self.target_transform = target_transform def load_image(self, img_filepath): - image = np.asarray(Image.open(img_filepath))[:, :, 0] + image = np.asarray(Image.open(img_filepath)) image = image / 255.0 return image diff --git a/steps/pytorch/models.py b/steps/pytorch/models.py index 62df8b3..1e6ef55 100644 --- a/steps/pytorch/models.py +++ b/steps/pytorch/models.py @@ -1,12 +1,12 @@ -from functools import partial +import os import shutil +from functools import partial import numpy as np import torch import torch.nn as nn from torch.autograd import Variable from torch.nn import init -from tqdm import tqdm from steps.base import BaseTransformer from steps.utils import get_logger @@ -39,6 +39,8 @@ def _initialize_model_weights(self): weights_init_func = partial(init_weights_normal, **weights_init_config['params']) elif weights_init_config['function'] == 'xavier': weights_init_func = init_weights_xavier + elif weights_init_config['function'] == 'he': + weights_init_func = init_weights_he else: raise NotImplementedError @@ -89,6 +91,13 @@ def _fit_loop(self, data): outputs_batch = self.model(X) partial_batch_losses = {} + assert len(targets_tensors) == len(outputs_batch) == 
len(self.loss_function),\ + '''Number of targets, model outputs and elements of loss function must equal. + You have n_targets={0}, n_model_outputs={1}, n_loss_function_elements={2}. + The order of elements must also be preserved.'''.format(len(targets_tensors), + len(outputs_batch), + len(self.loss_function)) + if len(self.output_names) == 1: for (name, loss_function, weight), target in zip(self.loss_function, targets_var): batch_loss = loss_function(outputs_batch, target) * weight @@ -126,6 +135,7 @@ def _transform(self, datagen, validation_datagen=None): outputs.setdefault(name, []).append(output_) if batch_id == steps: break + self.model.train() outputs = {'{}_prediction'.format(name): np.vstack(outputs_) for name, outputs_ in outputs.items()} return outputs @@ -148,8 +158,10 @@ def save(self, filepath): checkpoint_callback = self.callbacks_config.get('model_checkpoint') if checkpoint_callback: checkpoint_filepath = checkpoint_callback['filepath'] - shutil.copyfile(checkpoint_filepath, filepath) - + if os.path.exists(checkpoint_filepath): + shutil.copyfile(checkpoint_filepath, filepath) + else: + save_model(self.model, filepath) else: save_model(self.model, filepath) @@ -180,3 +192,9 @@ def init_weights_xavier(model): if isinstance(model, nn.Conv2d): init.xavier_normal(model.weight) init.constant(model.bias, 0) + +def init_weights_he(model): + if isinstance(model, nn.Conv2d): + init.kaiming_normal(model.weight) + init.constant(model.bias, 0) + diff --git a/steps/pytorch/utils.py b/steps/pytorch/utils.py index 695db39..4e85b83 100644 --- a/steps/pytorch/utils.py +++ b/steps/pytorch/utils.py @@ -91,7 +91,10 @@ def send(self, value): @property def value(self): - return 1.0 * self.current_total / self.iterations + if self.iterations == 0: + return 0 + else: + return 1.0 * self.current_total / self.iterations def reset(self): self.current_total = 0.0 diff --git a/steps/pytorch/validation.py b/steps/pytorch/validation.py index 8c0fedb..5876f15 100644 --- a/steps/pytorch/validation.py +++ b/steps/pytorch/validation.py @@ -1,14 +1,13 @@ -from sklearn.metrics import accuracy_score import torch -from torch.autograd import Variable -import torch.nn.functional as F import torch.nn as nn +import torch.nn.functional as F +from sklearn.metrics import accuracy_score +from torch.autograd import Variable class DiceLoss(nn.Module): def __init__(self): super(DiceLoss, self).__init__() - self.sigmoid = nn.Sigmoid() def forward(self, output, target): @@ -16,10 +15,10 @@ def forward(self, output, target): return 1 - 2 * torch.sum(prediction * target) / (torch.sum(prediction) + torch.sum(target) + 1e-7) -def segmentation_loss(output, target): +def segmentation_loss(output, target, weight_bce=1.0, weight_dice=1.0): bce = nn.BCEWithLogitsLoss() dice = DiceLoss() - return bce(output, target) + dice(output, target) + return weight_bce*bce(output, target) + weight_dice*dice(output, target) def cross_entropy(output, target, squeeze=False): @@ -74,7 +73,6 @@ def score_model(model, loss_function, datagen): partial_batch_losses.setdefault('sum', []).append(loss_sum) if batch_id == steps: break - average_losses = {name: sum(losses) / steps for name, losses in partial_batch_losses.items()} return average_losses @@ -83,7 +81,6 @@ def torch_acc_score(output, target): output = output.data.cpu().numpy() y_true = target.numpy() y_pred = output.argmax(axis=1) - return accuracy_score(y_true, y_pred) diff --git a/utils.py b/utils.py index b16d3ac..fa88f27 100644 --- a/utils.py +++ b/utils.py @@ -1,11 +1,14 @@ import glob import 
logging import os +import random import sys from itertools import product +import math import numpy as np import pandas as pd +import torch import yaml from PIL import Image from attrdict import AttrDict @@ -18,7 +21,7 @@ def read_yaml(filepath): return AttrDict(config) -def get_logger(): +def init_logger(): logger = logging.getLogger('dsb-2018') logger.setLevel(logging.INFO) message_format = logging.Formatter(fmt='%(asctime)s %(name)s >>> %(message)s', @@ -36,6 +39,10 @@ def get_logger(): return logger +def get_logger(): + return logging.getLogger('dsb-2018') + + def decompose(labeled): nr_true = labeled.max() masks = [] @@ -51,7 +58,7 @@ def decompose(labeled): return masks -def create_submission(experiments_dir, meta, predictions, logger): +def create_submission(meta, predictions, logger): image_ids, encodings = [], [] output = [] for image_id, prediction in zip(meta['ImageId'].values, predictions): @@ -67,10 +74,7 @@ def create_submission(experiments_dir, meta, predictions, logger): submission = pd.DataFrame(output, columns=['ImageId', 'EncodedPixels']).astype(str) submission = submission[submission['EncodedPixels'] != 'nan'] - submission_filepath = os.path.join(experiments_dir, 'submission.csv') - submission.to_csv(submission_filepath, index=None, encoding='utf-8') - logger.info('submission saved to {}'.format(submission_filepath)) - logger.info('submission head \n\n{}'.format(submission.head())) + return submission def read_masks(masks_filepaths): @@ -117,15 +121,17 @@ def read_params(ctx): def generate_metadata(data_dir, masks_overlayed_dir, contours_overlayed_dir, - contours_touching_overlayed_dir, - centers_overlayed_dir): - def stage1_generate_metadata(train): + centers_overlayed_dir, + competition_stage=1, + process_train_data=True, + process_test_data=True): + def _generate_metadata(train): df_metadata = pd.DataFrame(columns=['ImageId', 'file_path_image', 'file_path_masks', 'file_path_mask', 'is_train', 'width', 'height', 'n_nuclei']) if train: - tr_te = 'stage1_train' + tr_te = 'stage{}_train'.format(competition_stage) else: - tr_te = 'stage1_test' + tr_te = 'stage{}_test'.format(competition_stage) for image_id in sorted(os.listdir(os.path.join(data_dir, tr_te))): p = os.path.join(data_dir, tr_te, image_id, 'images') @@ -140,7 +146,6 @@ def stage1_generate_metadata(train): file_path_masks = os.path.join(data_dir, tr_te, image_id, 'masks') file_path_mask = os.path.join(masks_overlayed_dir, tr_te, image_id + '.png') file_path_contours = os.path.join(contours_overlayed_dir, tr_te, image_id + '.png') - file_path_contours_touching = os.path.join(contours_touching_overlayed_dir, tr_te, image_id + '.png') file_path_centers = os.path.join(centers_overlayed_dir, tr_te, image_id + '.png') n_nuclei = len(os.listdir(file_path_masks)) else: @@ -163,7 +168,6 @@ def stage1_generate_metadata(train): 'file_path_masks': file_path_masks, 'file_path_mask': file_path_mask, 'file_path_contours': file_path_contours, - 'file_path_contours_touching': file_path_contours_touching, 'file_path_centers': file_path_centers, 'is_train': is_train, 'width': width, @@ -171,9 +175,17 @@ def stage1_generate_metadata(train): 'n_nuclei': n_nuclei}, ignore_index=True) return df_metadata - train_metadata = stage1_generate_metadata(train=True) - test_metadata = stage1_generate_metadata(train=False) - metadata = train_metadata.append(test_metadata, ignore_index=True) + if process_train_data and process_test_data: + train_metadata = _generate_metadata(train=True) + test_metadata = _generate_metadata(train=False) + 
metadata = train_metadata.append(test_metadata, ignore_index=True) + elif process_train_data and not process_test_data: + metadata = _generate_metadata(train=True) + elif not process_train_data and process_test_data: + metadata = _generate_metadata(train=False) + else: + raise ValueError('both train_data and test_data cannot be set to False') + return metadata @@ -216,8 +228,37 @@ def relabel_random_colors(img, max_colours=1000): def from_pil(*images): - return [np.array(image) for image in images] + images = [np.array(image) for image in images] + if len(images) == 1: + return images[0] + else: + return images def to_pil(*images): - return [Image.fromarray((image).astype(np.uint8)) for image in images] + images = [Image.fromarray((image).astype(np.uint8)) for image in images] + if len(images) == 1: + return images[0] + else: + return images + + +def clip(lo, x, hi): + return lo if x <= lo else hi if x >= hi else x + + +def set_seed(seed): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed_all(seed) + + +def generate_data_frame_chunks(meta, chunk_size): + n_rows = meta.shape[0] + chunk_nr = math.ceil(n_rows / chunk_size) + meta_chunks = [] + for i in tqdm(range(chunk_nr)): + meta_chunk = meta.iloc[i * chunk_size:(i + 1) * chunk_size] + yield meta_chunk
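The rewritten postprocess() in postprocessing.py is organised around a marker-controlled watershed: build a clean binary mask, derive per-cell markers away from the predicted contours, compute a distance transform and flood. Below is a heavily simplified sketch of that pattern, assuming skimage.segmentation.watershed (older scikit-image releases exposed the same function as skimage.morphology.watershed); the real get_clean_mask / get_markers in the diff do considerably more per-component work, and postprocess_sketch is only an illustrative name.

import numpy as np
from scipy import ndimage as ndi
from skimage.segmentation import watershed


def postprocess_sketch(mask_proba, contour_proba):
    """Marker-controlled watershed: seeds are mask interiors away from predicted contours."""
    mask = mask_proba > 0.5
    seeds = mask & ~(contour_proba > 0.75)
    markers, _ = ndi.label(seeds)
    distance = ndi.distance_transform_edt(mask)
    labels = watershed(-distance, markers, mask=mask)
    return labels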
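The fixed min_size=20 threshold is gone; drop_small and connect_small now use min_blob_size(), which takes a percentile of the connected-component sizes in the thresholded mask and scales it down. The version below keeps the same semantics but swaps the deprecated scipy.stats.itemfreq used in the diff for np.bincount.

import numpy as np
from scipy import ndimage as ndi


def min_blob_size(mask, percentile=25, fraction_of_percentile=0.1):
    """A fraction of the given percentile of blob sizes; 0 when fewer than two blobs exist."""
    labels, labels_nr = ndi.label(mask)
    if labels_nr < 2:
        return 0
    blob_sizes = np.bincount(labels.ravel())[1:]  # drop the background count
    return fraction_of_percentile * np.percentile(blob_sizes, percentile)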
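The new CellSizer transformer feeds the size-estimator pipeline with the mean object size of each labeled image. The core helper reduces to a bincount over the label map (again with np.bincount standing in for itemfreq):

import numpy as np


def mean_cell_size(labeled_image):
    """Mean area (in pixels) of labeled objects, 0 for an image with background only."""
    blob_sizes = np.bincount(labeled_image.ravel())[1:]  # skip label 0 (background)
    if blob_sizes.size == 0:
        return 0
    return np.mean(blob_sizes)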
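Contour targets are now "internal contours": each instance mask is hole-filled, its outline is drawn with a thicker pen (width 8 instead of 4), and only the part of the outline lying inside the mask is kept before taking the per-image union. A condensed sketch of overlay_contours_from_dir(), written against the OpenCV 4.x findContours signature (the diff uses the three-value OpenCV 3.x form); overlay_inside_contours and the thickness parameter are illustrative names only.

import glob

import cv2
import numpy as np
import scipy.ndimage as ndi
from PIL import Image


def get_contour(img, thickness=8):
    """Binary mask -> uint8 image with the object outlines drawn in white."""
    img_contour = np.zeros_like(img, dtype=np.uint8)
    contours, _ = cv2.findContours(img.astype(np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_NONE)
    cv2.drawContours(img_contour, contours, -1, (255, 255, 255), thickness)
    return img_contour


def overlay_inside_contours(mask_dirname):
    """Union of per-nucleus outlines restricted to the hole-filled nucleus interiors."""
    masks = []
    for image_filepath in glob.glob('{}/*'.format(mask_dirname)):
        image = ndi.binary_fill_holes(np.asarray(Image.open(image_filepath)))
        contour = get_contour(image.astype(np.uint8)) > 0
        masks.append(np.where(image & contour, 255, 0))
    return np.where(np.sum(masks, axis=0) > 128., 255., 0.).astype(np.uint8)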
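train_valid_split() in preparation.py gains a simple_split mode and, in both modes, keeps external images out of validation: only the original competition rows are split, and the external rows are concatenated back into the shuffled training half. The simple branch in isolation (simple_train_valid_split is an illustrative wrapper; the is_train / is_external columns follow the metadata the pipeline generates):

import pandas as pd
from sklearn.model_selection import train_test_split


def simple_train_valid_split(meta, validation_size, random_state=1234):
    """Random split of non-external training rows; external rows always stay in train."""
    meta_train = meta[meta['is_train'] == 1]
    splittable = meta_train[meta_train['is_external'] == 0]
    external = meta_train[meta_train['is_external'] == 1]
    train_split, valid_split = train_test_split(splittable,
                                                test_size=validation_size,
                                                random_state=random_state)
    train_split = pd.concat([train_split, external], axis=0).sample(frac=1, random_state=random_state)
    return train_split, valid_split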
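ImageReaderRescaler resizes every image so that the ratio image_area / mean_cell_size (as estimated by the size-estimator U-Net) moves towards target_ratio, with the result clamped to [min_size, max_size]. The size computation on its own, using the clip() helper this patch adds to utils.py (adjusted_image_size is just an illustrative function name for the private method):

def clip(lo, x, hi):
    """Clamp x into the closed interval [lo, hi]."""
    return lo if x <= lo else hi if x >= hi else x


def adjusted_image_size(mean_cell_size, h, w, target_ratio, min_size, max_size):
    """Scale so that image_area / mean_cell_size approaches target_ratio, keeping sizes bounded."""
    if mean_cell_size == 0:
        adj_ratio = 1.0  # nothing detected, leave the image as it is
    else:
        size_ratio = (h * w) / mean_cell_size
        adj_ratio = size_ratio / target_ratio
    h_adj = int(clip(min_size, h * adj_ratio, max_size))
    w_adj = int(clip(min_size, w * adj_ratio, max_size))
    return h_adj, w_adj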
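StainDeconvolution collapses RGB tiles to one channel: H&E-stained tiles go through colour deconvolution (scikit-image's rgb2hed) and the rescaled hematoxylin and eosin channels are summed, while already-grey tiles fall back to plain greyscale. The two helpers condensed into a single function, under the assumption that a [0, 1] float output is what the downstream loader expects (rgb2grey is spelled rgb2gray in recent scikit-image; to_single_channel is an illustrative name):

from skimage.color import rgb2grey, rgb2hed
from skimage.exposure import rescale_intensity


def is_stained(img):
    """Treat an image as stained when its RGB channel means differ."""
    red_mean, green_mean, blue_mean = img.mean(axis=(0, 1))
    return not (red_mean == green_mean == blue_mean)


def to_single_channel(img):
    """Hematoxylin + eosin sum for stained tiles, plain greyscale otherwise, in [0, 1]."""
    if not is_stained(img):
        return rgb2grey(img)
    img_hed = rgb2hed(img)
    h = rescale_intensity(img_hed[:, :, 0], out_range=(0.0, 1.0))
    e = rescale_intensity(img_hed[:, :, 1], out_range=(0.0, 1.0))
    return rescale_intensity(h + e, out_range=(0.0, 1.0))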
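Two small training-loop fixes ride along in steps/pytorch: the callbacks now send the very first batch loss right after creating a fresh Averager (previously that value was silently dropped), and Averager.value no longer divides by zero when read before any batch has been accumulated. A self-contained version of the class, with the constructor reconstructed from the existing current_total / iterations fields:

class Averager:
    """Running mean that is safe to read before any value has been sent."""

    def __init__(self):
        self.current_total = 0.0
        self.iterations = 0.0

    def send(self, value):
        self.current_total += value
        self.iterations += 1

    @property
    def value(self):
        if self.iterations == 0:
            return 0
        return 1.0 * self.current_total / self.iterations

    def reset(self):
        self.current_total = 0.0
        self.iterations = 0.0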
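Weight initialisation in steps/pytorch/models.py gains a 'he' option next to 'normal' and 'xavier'. The diff uses the pre-0.4 names init.kaiming_normal / init.constant; with current PyTorch the trailing-underscore variants are the in-place equivalents:

import torch.nn as nn
from torch.nn import init


def init_weights_he(model):
    """Kaiming (He) initialisation for conv weights, zero bias."""
    if isinstance(model, nn.Conv2d):
        init.kaiming_normal_(model.weight)
        init.constant_(model.bias, 0)


# Applied recursively to every submodule, e.g.: unet.apply(init_weights_he)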
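segmentation_loss() in steps/pytorch/validation.py is now parametrised by per-term weights, so the BCE / Dice balance can be tuned from the config. A self-contained version of the loss pair (torch.sigmoid replaces the older F.sigmoid; the 1e-7 smoothing term follows the DiceLoss in the diff):

import torch
import torch.nn as nn


class DiceLoss(nn.Module):
    """Soft Dice on sigmoid probabilities; 1e-7 guards against empty masks."""

    def forward(self, output, target):
        prediction = torch.sigmoid(output)
        return 1 - 2 * torch.sum(prediction * target) / (
            torch.sum(prediction) + torch.sum(target) + 1e-7)


def segmentation_loss(output, target, weight_bce=1.0, weight_dice=1.0):
    """Weighted sum of BCE-with-logits and soft Dice."""
    bce = nn.BCEWithLogitsLoss()
    dice = DiceLoss()
    return weight_bce * bce(output, target) + weight_dice * dice(output, target)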
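Finally, predict_pipeline --chunk_size 50 in run_end_to_end.sh leans on the new generate_data_frame_chunks() generator in utils.py, so the test set never has to be pushed through the pipeline in one go. The generator plus a hypothetical usage loop (predict_on_chunk stands in for whatever runs the inference pipeline on one metadata slice):

import math

from tqdm import tqdm


def generate_data_frame_chunks(meta, chunk_size):
    """Yield consecutive slices of `meta`, chunk_size rows at a time."""
    n_chunks = math.ceil(meta.shape[0] / chunk_size)
    for i in tqdm(range(n_chunks)):
        yield meta.iloc[i * chunk_size:(i + 1) * chunk_size]


# Hypothetical usage: keep memory bounded by predicting chunk by chunk,
# then combine the per-chunk submissions with pandas.concat.
# chunk_results = [predict_on_chunk(chunk) for chunk in generate_data_frame_chunks(meta_test, 50)]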