
Add BioMassters Dataset #1560

Merged: 18 commits, Sep 29, 2023
5 changes: 5 additions & 0 deletions docs/api/datasets.rst
@@ -174,6 +174,11 @@ BigEarthNet

.. autoclass:: BigEarthNet

BioMassters
^^^^^^^^^^^

.. autoclass:: BioMassters

Cloud Cover Detection
^^^^^^^^^^^^^^^^^^^^^

1 change: 1 addition & 0 deletions docs/api/non_geo_datasets.csv
@@ -2,6 +2,7 @@ Dataset,Task,Source,# Samples,# Classes,Size (px),Resolution (m),Bands
`ADVANCE`_,C,"Google Earth, Freesound","5,075",13,512x512,0.5,RGB
`Benin Cashew Plantations`_,S,Airbus Pléiades,70,6,"1,122x1,186",10,MSI
`BigEarthNet`_,C,Sentinel-1/2,"590,326",19--43,120x120,10,"SAR, MSI"
`BioMassters`_,R,Sentinel-1/2 and Lidar,,,256x256,10,"SAR, MSI"
`Cloud Cover Detection`_,S,Sentinel-2,"22,728",2,512x512,10,MSI
`COWC`_,"C, R","CSUAV AFRL, ISPRS, LINZ, AGRC","388,435",2,256x256,0.15,RGB
`Kenya Crop Type`_,S,Sentinel-2,"4,688",7,"3,035x2,016",10,MSI
21 changes: 21 additions & 0 deletions tests/data/biomassters/The_BioMassters_-_features_metadata.csv.csv
@@ -0,0 +1,21 @@
filename,chip_id,satellite,split,month,size,cksum,s3path_us,s3path_eu,s3path_as,corresponding_agbm
0003d2eb_S1_00.tif,0003d2eb,S1,train,September,0,0,path,path,path,0003d2eb_agbm.tif
0003d2eb_S1_01.tif,0003d2eb,S1,train,October,0,0,path,path,path,0003d2eb_agbm.tif
0003d2eb_S1_02.tif,0003d2eb,S1,train,November,0,0,path,path,path,0003d2eb_agbm.tif
0003d2eb_S2_00.tif,0003d2eb,S2,train,September,0,0,path,path,path,0003d2eb_agbm.tif
0003d2eb_S2_02.tif,0003d2eb,S2,train,November,0,0,path,path,path,0003d2eb_agbm.tif
000aa810_S1_00.tif,000aa810,S1,train,September,0,0,path,path,path,000aa810_agbm.tif
000aa810_S1_01.tif,000aa810,S1,train,October,0,0,path,path,path,000aa810_agbm.tif
000aa810_S1_02.tif,000aa810,S1,train,November,0,0,path,path,path,000aa810_agbm.tif
000aa810_S2_00.tif,000aa810,S2,train,September,0,0,path,path,path,000aa810_agbm.tif
000aa810_S2_02.tif,000aa810,S2,train,November,0,0,path,path,path,000aa810_agbm.tif
0003d2eb_S1_00.tif,0003d2eb,S1,test,September,0,0,path,path,path,0003d2eb_agbm.tif
0003d2eb_S1_01.tif,0003d2eb,S1,test,October,0,0,path,path,path,0003d2eb_agbm.tif
0003d2eb_S1_02.tif,0003d2eb,S1,test,November,0,0,path,path,path,0003d2eb_agbm.tif
0003d2eb_S2_00.tif,0003d2eb,S2,test,September,0,0,path,path,path,0003d2eb_agbm.tif
0003d2eb_S2_02.tif,0003d2eb,S2,test,November,0,0,path,path,path,0003d2eb_agbm.tif
000aa810_S1_00.tif,000aa810,S1,test,September,0,0,path,path,path,000aa810_agbm.tif
000aa810_S1_01.tif,000aa810,S1,test,October,0,0,path,path,path,000aa810_agbm.tif
000aa810_S1_02.tif,000aa810,S1,test,November,0,0,path,path,path,000aa810_agbm.tif
000aa810_S2_00.tif,000aa810,S2,test,September,0,0,path,path,path,000aa810_agbm.tif
000aa810_S2_02.tif,000aa810,S2,test,November,0,0,path,path,path,000aa810_agbm.tif
137 changes: 137 additions & 0 deletions tests/data/biomassters/data.py
@@ -0,0 +1,137 @@
#!/usr/bin/env python3

# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

import csv
import hashlib
import os
import shutil

import numpy as np
import rasterio

metadata_train = "The_BioMassters_-_features_metadata.csv.csv"

csv_columns = [
"filename",
"chip_id",
"satellite",
"split",
"month",
"size",
"cksum",
"s3path_us",
"s3path_eu",
"s3path_as",
"corresponding_agbm",
]

targets = "train_agbm.zip"

splits = ["train", "test"]

sample_ids = ["0003d2eb", "000aa810"]

months = ["September", "October", "November"]

satellite = ["S1", "S2"]

SIZE = 32


def create_tif_file(path: str, num_channels: int, dtype: str) -> None:
"""Create S1 or S2 data with num channels.

Args:
path: path where to save tif
num_channels: number of channels (4 for S1, 11 for S2)
dtype: uint16 for image data and float 32 for target
"""
profile = {}
profile["driver"] = "GTiff"
profile["dtype"] = dtype
profile["count"] = num_channels
profile["crs"] = "epsg:4326"
profile["transform"] = rasterio.transform.from_bounds(0, 0, 1, 1, 1, 1)
profile["height"] = SIZE
profile["width"] = SIZE
profile["compress"] = "lzw"
profile["predictor"] = 2

if "float" in profile["dtype"]:
Z = np.random.randn(SIZE, SIZE).astype(profile["dtype"])
else:
Z = np.random.randint(
np.iinfo(profile["dtype"]).max, size=(SIZE, SIZE), dtype=profile["dtype"]
)

with rasterio.open(path, "w", **profile) as src:
for i in range(1, profile["count"] + 1):
src.write(Z, i)


# filename,chip_id,satellite,split,month,size,cksum,s3path_us,s3path_eu,s3path_as,corresponding_agbm
if __name__ == "__main__":
csv_rows = []
for split in splits:
os.makedirs(f"{split}_features", exist_ok=True)
if split == "train":
os.makedirs("train_agbm", exist_ok=True)
for id in sample_ids:
for sat in satellite:
path = id + "_" + str(sat)
for idx, month in enumerate(months):
# S2 data is not present for every month
if sat == "S2" and idx == 1:
continue
file_path = path + "_" + f"{idx:02d}" + ".tif"

csv_rows.append(
[
file_path,
id,
sat,
split,
month,
"0",
"0",
"path",
"path",
"path",
id + "_agbm.tif",
]
)

# file path to save
file_path = os.path.join(f"{split}_features", file_path)

if sat == "S1":
create_tif_file(file_path, num_channels=4, dtype="uint16")
else:
create_tif_file(file_path, num_channels=11, dtype="uint16")

# create target data one per id
if split == "train":
create_tif_file(
os.path.join(f"{split}_agbm", id + "_agbm.tif"),
num_channels=1,
dtype="float32",
)

# write out metadata

with open(metadata_train, "w") as csv_file:
wr = csv.writer(csv_file)
wr.writerow(csv_columns)
for row in csv_rows:
wr.writerow(row)

# zip up feature and target folders
zip_dirs = ["train_features", "test_features", "train_agbm"]
for dir in zip_dirs:
shutil.make_archive(dir, "zip", dir)
# Compute checksums
with open(dir + ".zip", "rb") as f:
md5 = hashlib.md5(f.read()).hexdigest()
print(f"{dir}: {md5}")
Binary file added tests/data/biomassters/test_features.zip
Binary file added tests/data/biomassters/train_agbm.zip
Binary file added tests/data/biomassters/train_features.zip
51 changes: 51 additions & 0 deletions tests/datasets/test_biomassters.py
@@ -0,0 +1,51 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.


import os
from itertools import product
from pathlib import Path

import matplotlib.pyplot as plt
import pytest
from _pytest.fixtures import SubRequest

from torchgeo.datasets import BioMassters


class TestBioMassters:
@pytest.fixture(
params=product(["train", "test"], [["S1"], ["S2"], ["S1", "S2"]], [True, False])
)
def dataset(self, request: SubRequest) -> BioMassters:
root = os.path.join("tests", "data", "biomassters")
split, sensors, as_time_series = request.param
return BioMassters(
root, split=split, sensors=sensors, as_time_series=as_time_series
)

def test_len_of_ds(self, dataset: BioMassters) -> None:
assert len(dataset) > 0

def test_invalid_split(self, dataset: BioMassters) -> None:
with pytest.raises(AssertionError):
BioMassters(dataset.root, split="foo")

def test_invalid_bands(self, dataset: BioMassters) -> None:
with pytest.raises(AssertionError):
BioMassters(dataset.root, sensors=["S3"])

def test_not_downloaded(self, tmp_path: Path) -> None:
match = "Dataset not found"
with pytest.raises(RuntimeError, match=match):
BioMassters(str(tmp_path))

def test_plot(self, dataset: BioMassters) -> None:
dataset.plot(dataset[0], suptitle="Test")
plt.close()

sample = dataset[0]
if dataset.split == "train":
sample["prediction"] = sample["label"]
dataset.plot(sample)
plt.close()
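
For reviewers, a minimal usage sketch of the API these tests exercise (the root path is a placeholder; the constructor arguments mirror the fixture above, and the sample keys follow the handling in test_plot):

from torchgeo.datasets import BioMassters

# Placeholder path; in the tests this points at tests/data/biomassters.
root = "path/to/biomassters"

# split, sensors and as_time_series are the parameters the fixture varies.
ds = BioMassters(root, split="train", sensors=["S1", "S2"], as_time_series=True)

sample = ds[0]  # indexing returns a sample dict
ds.plot(sample, suptitle="BioMassters sample")

# As in test_plot, a prediction can be shown alongside the target on the
# train split by copying the label into the "prediction" key.
sample["prediction"] = sample["label"]
ds.plot(sample)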
2 changes: 2 additions & 0 deletions torchgeo/datasets/__init__.py
@@ -8,6 +8,7 @@
from .astergdem import AsterGDEM
from .benin_cashews import BeninSmallHolderCashews
from .bigearthnet import BigEarthNet
from .biomassters import BioMassters
from .cbf import CanadianBuildingFootprints
from .cdl import CDL
from .chesapeake import (
@@ -173,6 +174,7 @@
"ADVANCE",
"BeninSmallHolderCashews",
"BigEarthNet",
"BioMassters",
"CloudCoverDetection",
"COWC",
"COWCCounting",