microsoft · adamjstewart · Jul 9, 2022 · Mar 9, 2022 · Mar 10, 2022 · Mar 11, 2022
diff --git a/docs/api/datasets.rst b/docs/api/datasets.rst
@@ -224,6 +224,11 @@ LoveDA
 
 .. autoclass:: LoveDA
 
+Million-AID
+^^^^^^^^^^^
+
+.. autoclass:: MillionAID
+
 NASA Marine Debris
 ^^^^^^^^^^^^^^^^^^
 

diff --git a/docs/api/non_geo_datasets.csv b/docs/api/non_geo_datasets.csv
@@ -16,6 +16,7 @@ Dataset,Task,Source,# Samples,# Classes,Size (px),Resolution (m),Bands
 `LandCover.ai`_,S,Aerial,"10,674",5,512x512,0.25--0.5,RGB
 `LEVIR-CD+`_,CD,Google Earth,985,2,"1,024x1,024",0.5,RGB
 `LoveDA`_,S,Google Earth,"5,987",7,"1,024x1,024",0.3,RGB
+`Million-AID`_,C,Google Earth,1M,51--73,,0.5--153,RGB
 `NASA Marine Debris`_,OD,PlanetScope,707,1,256x256,3,RGB
 `OSCD`_,CD,Sentinel-2,24,2,"40--1,180",60,MSI
 `PatternNet`_,C,Google Earth,"30,400",38,256x256,0.06--5,RGB

diff --git a/tests/data/millionaid/data.py b/tests/data/millionaid/data.py
@@ -0,0 +1,52 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+import hashlib
+import os
+import shutil
+
+import numpy as np
+from PIL import Image
+
+SIZE = 32  # width and height, in pixels, of every generated image
+
+np.random.seed(0)  # fixed seed so the random pixel values are deterministic
+# Relative paths of the fake images to generate, keyed by split directory name.
+PATHS = {
+    "train": [
+        os.path.join(
+            "train", "agriculture_land", "grassland", "meadow", "P0115918.jpg"
+        ),
+        os.path.join("train", "water_area", "beach", "P0060208.jpg"),
+    ],
+    "test": [
+        os.path.join("test", "agriculture_land", "grassland", "meadow", "P0115918.jpg"),
+        os.path.join("test", "water_area", "beach", "P0060208.jpg"),
+    ],
+}
+
+
+def create_file(path: str) -> None:
+    """Save a random SIZE x SIZE RGB image to ``path``."""
+    pixels = (np.random.rand(SIZE, SIZE, 3) * 255).astype("uint8")
+    Image.fromarray(pixels).convert("RGB").save(path)
+
+
+if __name__ == "__main__":
+    for split_name, image_paths in PATHS.items():
+        # Discard any previously generated directory for this split.
+        if os.path.isdir(split_name):
+            shutil.rmtree(split_name)
+        for image_path in image_paths:
+            os.makedirs(os.path.dirname(image_path), exist_ok=True)
+            create_file(image_path)
+
+        # Pack the split directory into "<split>.zip" in the working directory.
+        shutil.make_archive(split_name, "zip", ".", split_name)
+
+        # Report the archive's MD5 digest.
+        with open(f"{split_name}.zip", "rb") as archive:
+            digest = hashlib.md5(archive.read()).hexdigest()
+        print(f"{split_name}: {digest}")
diff --git a/tests/data/millionaid/test.zip b/tests/data/millionaid/test.zip
diff --git a/tests/data/millionaid/test/agriculture_land/grassland/meadow/P0115918.jpg b/tests/data/millionaid/test/agriculture_land/grassland/meadow/P0115918.jpg
diff --git a/tests/data/millionaid/test/water_area/beach/P0060208.jpg b/tests/data/millionaid/test/water_area/beach/P0060208.jpg
diff --git a/tests/data/millionaid/train.zip b/tests/data/millionaid/train.zip
diff --git a/tests/data/millionaid/train/agriculture_land/grassland/meadow/P0115918.jpg b/tests/data/millionaid/train/agriculture_land/grassland/meadow/P0115918.jpg
diff --git a/tests/data/millionaid/train/water_area/beach/P0060208.jpg b/tests/data/millionaid/train/water_area/beach/P0060208.jpg
diff --git a/tests/datasets/test_millionaid.py b/tests/datasets/test_millionaid.py
@@ -0,0 +1,64 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+import os
+import shutil
+from pathlib import Path
+
+import matplotlib.pyplot as plt
+import pytest
+import torch
+import torch.nn as nn
+from _pytest.fixtures import SubRequest
+
+from torchgeo.datasets import MillionAID
+
+
+class TestMillionAID:
+    @pytest.fixture(
+        scope="class", params=zip(["train", "test"], ["multi-class", "multi-label"])
+    )
+    def dataset(self, request: SubRequest) -> MillionAID:
+        # Params pair a split with a task: train/multi-class, test/multi-label.
+        split, task = request.param
+        root = os.path.join("tests", "data", "millionaid")
+        return MillionAID(
+            root=root, split=split, task=task, transforms=nn.Identity(), checksum=True
+        )
+
+    def test_getitem(self, dataset: MillionAID) -> None:
+        sample = dataset[0]
+        assert isinstance(sample, dict)
+        image, label = sample["image"], sample["label"]
+        assert isinstance(image, torch.Tensor)
+        assert isinstance(label, torch.Tensor)
+        assert image.shape[0] == 3 and image.ndim == 3
+
+    def test_len(self, dataset: MillionAID) -> None:
+        assert len(dataset) == 2
+
+    def test_not_found(self, tmp_path: Path) -> None:
+        with pytest.raises(RuntimeError, match="Dataset not found in"):
+            MillionAID(str(tmp_path))
+
+    def test_not_extracted(self, tmp_path: Path) -> None:
+        archive = os.path.join("tests", "data", "millionaid", "train.zip")
+        shutil.copy(archive, tmp_path)
+        MillionAID(str(tmp_path))
+
+    def test_corrupted(self, tmp_path: Path) -> None:
+        # Plant a bogus archive; with checksum=True the dataset must reject it.
+        (tmp_path / "train.zip").write_text("bad")
+        with pytest.raises(RuntimeError, match="Dataset found, but corrupted."):
+            MillionAID(str(tmp_path), checksum=True)
+
+    def test_plot(self, dataset: MillionAID) -> None:
+        sample = dataset[0].copy()
+        dataset.plot(sample, suptitle="Test")
+        plt.close()
+
+    def test_plot_prediction(self, dataset: MillionAID) -> None:
+        sample = dataset[0].copy()
+        sample["prediction"] = sample["label"].clone()
+        dataset.plot(sample)
+        plt.close()
diff --git a/torchgeo/datasets/__init__.py b/torchgeo/datasets/__init__.py
@@ -68,6 +68,7 @@
 )
 from .levircd import LEVIRCDPlus
 from .loveda import LoveDA
+from .millionaid import MillionAID
 from .naip import NAIP
 from .nasa_marine_debris import NASAMarineDebris
 from .nwpu import VHR10
@@ -162,6 +163,7 @@
     "LandCoverAI",
     "LEVIRCDPlus",
     "LoveDA",
+    "MillionAID",
     "NASAMarineDebris",
     "OSCD",
     "PatternNet",