Skip to content

Commit

Permalink
Add BreaKHis dataset (#749)
Browse files Browse the repository at this point in the history
  • Loading branch information
nkaenzig authored Feb 4, 2025
1 parent 6f6b45f commit 2d81aaa
Show file tree
Hide file tree
Showing 15 changed files with 548 additions and 1 deletion.
114 changes: 114 additions & 0 deletions configs/vision/pathology/offline/classification/breakhis.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
---
# Trainer section of the offline BreaKHis classification config.
# Values written as ${oc.env:NAME, default} are resolved from the environment
# variable NAME and fall back to the default shown when it is unset.
trainer:
class_path: eva.Trainer
init_args:
n_runs: &N_RUNS ${oc.env:N_RUNS, 5}
# Output directory; anchored so the TensorBoard logger below writes to the same place.
# NOTE(review): the MODEL_NAME fallback used for paths here (dino_vits16) differs from
# the fallback used for the backbone's model_name (universal/vit_small_patch16_224_dino).
default_root_dir: &OUTPUT_ROOT ${oc.env:OUTPUT_ROOT, logs/${oc.env:MODEL_NAME, dino_vits16}/offline/breakhis}
# Step budget; anchored because the LR scheduler's T_max aliases it.
max_steps: &MAX_STEPS ${oc.env:MAX_STEPS, 12500}
checkpoint_type: ${oc.env:CHECKPOINT_TYPE, best}
callbacks:
- class_path: eva.callbacks.ConfigurationLogger
- class_path: lightning.pytorch.callbacks.TQDMProgressBar
init_args:
refresh_rate: ${oc.env:TQDM_REFRESH_RATE, 1}
- class_path: lightning.pytorch.callbacks.LearningRateMonitor
init_args:
logging_interval: epoch
# Keeps the single best checkpoint (file name `best`) plus the last one.
# The monitored metric and its mode are anchored for reuse by EarlyStopping.
- class_path: lightning.pytorch.callbacks.ModelCheckpoint
init_args:
filename: best
save_last: true
save_top_k: 1
monitor: &MONITOR_METRIC ${oc.env:MONITOR_METRIC, val/MulticlassAccuracy}
mode: &MONITOR_METRIC_MODE ${oc.env:MONITOR_METRIC_MODE, max}
# Early stopping watches the same metric/mode as the checkpoint callback (via aliases).
- class_path: lightning.pytorch.callbacks.EarlyStopping
init_args:
min_delta: 0
patience: ${oc.env:PATIENCE, 105}
monitor: *MONITOR_METRIC
mode: *MONITOR_METRIC_MODE
# Embeddings writer for the offline workflow. Its output_dir is anchored and reused as
# the `root` of the train/val embedding datasets in the data section.
- class_path: eva.callbacks.ClassificationEmbeddingsWriter
init_args:
output_dir: &DATASET_EMBEDDINGS_ROOT ${oc.env:EMBEDDINGS_ROOT, ./data/embeddings}/${oc.env:MODEL_NAME, dino_vits16}/breakhis
# NOTE(review): indices presumably follow the order of the `predict` datasets
# in the data section (train first, then val) — confirm.
dataloader_idx_map:
0: train
1: val
backbone:
class_path: eva.vision.models.ModelFromRegistry
init_args:
model_name: ${oc.env:MODEL_NAME, universal/vit_small_patch16_224_dino}
model_extra_kwargs: ${oc.env:MODEL_EXTRA_KWARGS, null}
overwrite: false
logger:
- class_path: lightning.pytorch.loggers.TensorBoardLogger
init_args:
save_dir: *OUTPUT_ROOT
name: ""
# Model section of the offline config: only a linear head is configured here
# (no backbone key); the datasets it trains on are embedding datasets.
model:
class_path: eva.HeadModule
init_args:
head:
class_path: torch.nn.Linear
init_args:
# NOTE(review): presumably must equal the embedding width of the backbone selected
# via MODEL_NAME; override IN_FEATURES when changing the backbone — confirm.
in_features: ${oc.env:IN_FEATURES, 384}
# Number of output classes; anchored so the metrics below use the same value.
out_features: &NUM_CLASSES 8
criterion: torch.nn.CrossEntropyLoss
optimizer:
class_path: torch.optim.AdamW
init_args:
lr: ${oc.env:LR_VALUE, 0.0003}
lr_scheduler:
class_path: torch.optim.lr_scheduler.CosineAnnealingLR
init_args:
# Schedule length is tied to the trainer's max_steps through the MAX_STEPS alias.
T_max: *MAX_STEPS
eta_min: 0.0
metrics:
common:
- class_path: eva.metrics.AverageLoss
- class_path: eva.metrics.MulticlassClassificationMetrics
init_args:
num_classes: *NUM_CLASSES
# Data section of the offline config: train/val read stored embeddings, while the
# `predict` datasets point at the raw BreaKHis images.
data:
class_path: eva.DataModule
init_args:
datasets:
# Embedding datasets rooted at the embeddings writer's output_dir
# (DATASET_EMBEDDINGS_ROOT alias) and indexed by manifest.csv.
train:
class_path: eva.datasets.EmbeddingsClassificationDataset
init_args: &DATASET_ARGS
root: *DATASET_EMBEDDINGS_ROOT
manifest_file: manifest.csv
split: train
val:
class_path: eva.datasets.EmbeddingsClassificationDataset
init_args:
# Merge the train arguments, then override only the split.
<<: *DATASET_ARGS
split: val
# Raw-image datasets, one entry per split (train, then val).
predict:
- class_path: eva.vision.datasets.BreaKHis
init_args: &PREDICT_DATASET_ARGS
root: ${oc.env:DATA_ROOT, ./data/breakhis}
split: train
download: ${oc.env:DOWNLOAD_DATA, false}
# Set `download: true` to download the dataset from https://zenodo.org/records/1214456
# The BreaKHis dataset is distributed under the following license: "CC BY 4.0"
# (see: https://creativecommons.org/licenses/by/4.0/)
transforms:
class_path: eva.vision.data.transforms.common.ResizeAndCrop
init_args:
# NOTE(review): the defaults look like the standard ImageNet channel statistics — confirm
# they suit the backbone when MODEL_NAME is overridden.
mean: ${oc.env:NORMALIZE_MEAN, [0.485, 0.456, 0.406]}
std: ${oc.env:NORMALIZE_STD, [0.229, 0.224, 0.225]}
- class_path: eva.vision.datasets.BreaKHis
init_args:
# Same arguments as the train predict dataset, with the split overridden.
<<: *PREDICT_DATASET_ARGS
split: val
dataloaders:
train:
batch_size: &BATCH_SIZE ${oc.env:BATCH_SIZE, 256}
num_workers: &N_DATA_WORKERS ${oc.env:N_DATA_WORKERS, 4}
shuffle: true
val:
batch_size: *BATCH_SIZE
num_workers: *N_DATA_WORKERS
# The predict loader has its own batch size (default 64) but shares the worker count.
predict:
batch_size: &PREDICT_BATCH_SIZE ${oc.env:PREDICT_BATCH_SIZE, 64}
num_workers: *N_DATA_WORKERS
94 changes: 94 additions & 0 deletions configs/vision/pathology/online/classification/breakhis.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
---
# Trainer section of the online BreaKHis classification config.
# Values written as ${oc.env:NAME, default} are resolved from the environment
# variable NAME and fall back to the default shown when it is unset.
trainer:
class_path: eva.Trainer
init_args:
n_runs: &N_RUNS ${oc.env:N_RUNS, 5}
# Output directory; anchored so the TensorBoard logger below writes to the same place.
# NOTE(review): the MODEL_NAME fallback used for the path here (dino_vits16) differs from
# the fallback used for the backbone's model_name (universal/vit_small_patch16_224_dino).
default_root_dir: &OUTPUT_ROOT ${oc.env:OUTPUT_ROOT, logs/${oc.env:MODEL_NAME, dino_vits16}/online/breakhis}
# Step budget; anchored because the LR scheduler's T_max aliases it.
max_steps: &MAX_STEPS ${oc.env:MAX_STEPS, 12500}
checkpoint_type: ${oc.env:CHECKPOINT_TYPE, best}
callbacks:
- class_path: eva.callbacks.ConfigurationLogger
- class_path: lightning.pytorch.callbacks.TQDMProgressBar
init_args:
refresh_rate: ${oc.env:TQDM_REFRESH_RATE, 1}
- class_path: lightning.pytorch.callbacks.LearningRateMonitor
init_args:
logging_interval: epoch
# Keeps the single best checkpoint (file name `best`) plus the last one.
# The monitored metric and its mode are anchored for reuse by EarlyStopping.
- class_path: lightning.pytorch.callbacks.ModelCheckpoint
init_args:
filename: best
save_last: true
save_top_k: 1
monitor: &MONITOR_METRIC ${oc.env:MONITOR_METRIC, val/MulticlassAccuracy}
mode: &MONITOR_METRIC_MODE ${oc.env:MONITOR_METRIC_MODE, max}
# Early stopping watches the same metric/mode as the checkpoint callback (via aliases).
- class_path: lightning.pytorch.callbacks.EarlyStopping
init_args:
min_delta: 0
patience: ${oc.env:PATIENCE, 105}
monitor: *MONITOR_METRIC
mode: *MONITOR_METRIC_MODE
logger:
- class_path: lightning.pytorch.loggers.TensorBoardLogger
init_args:
save_dir: *OUTPUT_ROOT
name: ""
# Model section of the online config: a registry backbone and a linear head are
# both configured on the HeadModule.
model:
class_path: eva.HeadModule
init_args:
backbone:
class_path: eva.vision.models.ModelFromRegistry
init_args:
model_name: ${oc.env:MODEL_NAME, universal/vit_small_patch16_224_dino}
model_extra_kwargs: ${oc.env:MODEL_EXTRA_KWARGS, null}
head:
class_path: torch.nn.Linear
init_args:
# NOTE(review): presumably must equal the output width of the backbone above;
# override IN_FEATURES together with MODEL_NAME — confirm.
in_features: ${oc.env:IN_FEATURES, 384}
# Number of output classes; anchored so the metrics below use the same value.
out_features: &NUM_CLASSES 8
criterion: torch.nn.CrossEntropyLoss
optimizer:
class_path: torch.optim.AdamW
init_args:
lr: ${oc.env:LR_VALUE, 0.0003}
lr_scheduler:
class_path: torch.optim.lr_scheduler.CosineAnnealingLR
init_args:
# Schedule length is tied to the trainer's max_steps through the MAX_STEPS alias.
T_max: *MAX_STEPS
eta_min: 0.0
metrics:
common:
- class_path: eva.metrics.AverageLoss
- class_path: eva.metrics.MulticlassClassificationMetrics
init_args:
num_classes: *NUM_CLASSES
# Data section of the online config: train and val both read the raw BreaKHis images.
data:
class_path: eva.DataModule
init_args:
datasets:
train:
class_path: eva.vision.datasets.BreaKHis
# Anchored so the val dataset can reuse every argument except the split.
init_args: &DATASET_ARGS
root: ${oc.env:DATA_ROOT, ./data/breakhis}
split: train
download: ${oc.env:DOWNLOAD_DATA, false}
# Set `download: true` to download the dataset from https://zenodo.org/records/1214456
# The BreaKHis dataset is distributed under the following license: "CC BY 4.0"
# (see: https://creativecommons.org/licenses/by/4.0/)
transforms:
class_path: eva.vision.data.transforms.common.ResizeAndCrop
init_args:
# NOTE(review): the defaults look like the standard ImageNet channel statistics — confirm
# they suit the backbone when MODEL_NAME is overridden.
mean: ${oc.env:NORMALIZE_MEAN, [0.485, 0.456, 0.406]}
std: ${oc.env:NORMALIZE_STD, [0.229, 0.224, 0.225]}
val:
class_path: eva.vision.datasets.BreaKHis
init_args:
# Merge the train arguments, then override only the split.
<<: *DATASET_ARGS
split: val
dataloaders:
train:
batch_size: &BATCH_SIZE ${oc.env:BATCH_SIZE, 256}
num_workers: &N_DATA_WORKERS ${oc.env:N_DATA_WORKERS, 4}
shuffle: true
# Validation reuses the training batch size and worker count; no shuffle is set.
val:
batch_size: *BATCH_SIZE
num_workers: *N_DATA_WORKERS
59 changes: 59 additions & 0 deletions docs/datasets/breakhis.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# BreakHis

The Breast Cancer Histopathological Image Classification (BreakHis) is composed of 9,109 microscopic images of breast tumor tissue collected from 82 patients using different magnifying factors (40X, 100X, 200X, and 400X). For this benchmark we only use the 40X samples which results in a subset of 1,995 images. This database has been built in collaboration with the P&D Laboratory, Pathological Anatomy and Cytopathology, Parana, Brazil.

The dataset is divided into two main groups: benign tumors and malignant tumors. The dataset currently contains four histologically distinct types of benign breast tumors: adenosis (A), fibroadenoma (F), phyllodes tumor (PT), and tubular adenoma (TA); and four types of malignant tumors (breast cancer): ductal carcinoma (DC), lobular carcinoma (LC), mucinous carcinoma (MC), and papillary carcinoma (PC).

## Raw data

### Key stats

| | |
|--------------------------------|-----------------------------|
| **Modality** | Vision (WSI patches) |
| **Task** | Multiclass classification (8 classes) |
| **Cancer type** | Breast |
| **Data size** | 4 GB |
| **Image dimension** | 700 x 460 |
| **Magnification (μm/px)** | 40x (0.25) |
| **Files format** | `png` |
| **Number of images** | 1995 |


### Splits

The data source provides train/validation splits.

| Splits | Train | Validation |
|----------|---------------|--------------|
| #Samples | 1393 (70%) | 602 (30%) |

A test split is not provided, because dividing the dataset further would leave too few samples per class for a robust evaluation. __eva__ therefore reports evaluation results for BreakHis on the validation split.


### Organization

The BreakHis data is organized as follows:

```
BreaKHis_v1
├── histology_slides
│   ├── breast
│   │   ├── benign
│   │   │   ├── SOB
│   │   │   │   ├── adenosis
│   │   │   │   ├── fibroadenoma
│   │   │   │   └── ...
```


## Download and preprocessing
The `BreaKHis` dataset class supports downloading the data at runtime by setting the environment variable `DOWNLOAD_DATA=true`.

## Relevant links

* [Official Source](https://web.inf.ufpr.br/vri/databases/breast-cancer-histopathological-database-breakhis/)

## License

[CC BY 4.0](https://creativecommons.org/licenses/by/4.0/)
1 change: 1 addition & 0 deletions docs/datasets/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
|------------------------------------|----------|------------|------------------------|----------------------------|------------------|
| [BACH](bach.md) | 400 | 2048x1536 | 20x (0.5) | Classification (4 classes) | Breast |
| [BRACS](bracs.md) | 4539 | variable | 40x (0.25) | Classification (7 classes) | Breast |
| [BreakHis](breakhis.md) | 1995 | 700x460 | 40x (0.25) | Classification (8 classes) | Breast |
| [CRC](crc.md) | 107,180 | 224x224 | 20x (0.5) | Classification (9 classes) | Colorectal |
| [GleasonArvaniti](gleason_arvaniti.md) | 22,752 | 750x750 | 40x (0.23) | Classification (4 classes) | Prostate |
| [PatchCamelyon](patch_camelyon.md) | 327,680 | 96x96 | 10x (1.0) \* | Classification (2 classes) | Breast |
Expand Down
2 changes: 1 addition & 1 deletion docs/leaderboards.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ We selected this approach to prioritize reliable, robust and fair FM-evaluation
| **Base batch size** | 256 | 32 | 64 |
| **Base learning rate** | 0.0003 | 0.001 | 0.002 |
| **Early stopping** | 5% * [Max epochs] | 10% * [Max epochs] (2) | 10% * [Max epochs] (2) |
| **Optimizer** | SGD | AdamW | AdamW |
| **Optimizer** | AdamW | AdamW | AdamW |
| **Momentum** | 0.9 | n/a | n/a |
| **Weight Decay** | 0.0 | n/a | n/a |
| **betas** | n/a | [0.9, 0.999] | [0.9, 0.999] |
Expand Down
1 change: 1 addition & 0 deletions mkdocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ nav:
- Patch-level:
- BACH: datasets/bach.md
- BRACS: datasets/bracs.md
- BreakHis: datasets/breakhis.md
- CRC: datasets/crc.md
- GleasonArvaniti: datasets/gleason_arvaniti.md
- MHIST: datasets/mhist.md
Expand Down
2 changes: 2 additions & 0 deletions src/eva/vision/data/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
CRC,
MHIST,
PANDA,
BreaKHis,
Camelyon16,
GleasonArvaniti,
PANDASmall,
Expand All @@ -28,6 +29,7 @@
__all__ = [
"BACH",
"BCSS",
"BreaKHis",
"BRACS",
"CRC",
"GleasonArvaniti",
Expand Down
2 changes: 2 additions & 0 deletions src/eva/vision/data/datasets/classification/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from eva.vision.data.datasets.classification.bach import BACH
from eva.vision.data.datasets.classification.bracs import BRACS
from eva.vision.data.datasets.classification.breakhis import BreaKHis
from eva.vision.data.datasets.classification.camelyon16 import Camelyon16
from eva.vision.data.datasets.classification.crc import CRC
from eva.vision.data.datasets.classification.gleason_arvaniti import GleasonArvaniti
Expand All @@ -12,6 +13,7 @@

__all__ = [
"BACH",
"BreaKHis",
"BRACS",
"Camelyon16",
"CRC",
Expand Down
Loading

0 comments on commit 2d81aaa

Please sign in to comment.