Add OpenBuildings dataset #402

Merged · 12 commits · Feb 27, 2022 (changes shown from 8 commits)
5 changes: 5 additions & 0 deletions docs/api/datasets.rst
@@ -67,6 +67,11 @@ National Agriculture Imagery Program (NAIP)

.. autoclass:: NAIP

Open Buildings
^^^^^^^^^^^^^^

.. autoclass:: OpenBuildings

Sentinel
^^^^^^^^

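For orientation, a minimal usage sketch of the dataset being added, mirroring the calls exercised in the tests further down (the root path is a placeholder and must already contain tiles.geojson and the *_buildings.csv.gz tiles):

from torchgeo.datasets import OpenBuildings

ds = OpenBuildings(root="data/openbuildings")  # placeholder path
sample = ds[ds.bounds]  # dict with a rasterized "mask" tensor and a "crs"
ds.plot(sample, suptitle="Open Buildings")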
Binary file added tests/data/openbuildings/000_buildings.csv.gz
Binary file not shown.
105 changes: 105 additions & 0 deletions tests/data/openbuildings/data.py
@@ -0,0 +1,105 @@
#!/usr/bin/env python3

# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

import csv
import gzip
import hashlib
import json
import os
import random
import shutil

import numpy as np
from shapely.geometry import Polygon

SIZE = 0.05

np.random.seed(0)
random.seed(0)


def create_meta_data_file(zipfilename):
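    # mimic the shape of the dataset's tiles.geojson index: a FeatureCollection
    # whose features carry tile_id, tile_url, and size_mb properties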
meta_data = {
"type": "FeatureCollection",
"features": [
{
"type": "Feature",
"geometry": {
"type": "Polygon",
"coordinates": [
[[0.0, 0.0], [0.0, SIZE], [SIZE, SIZE], [SIZE, 0.0], [0.0, 0.0]]
],
},
"properties": {
"tile_id": "025",
"tile_url": "polygons_s2_level_4_gzip/{}".format(zipfilename),
"size_mb": 0.2,
},
}
],
}
return meta_data


def create_csv_data_row(lat, long):
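    # each building footprint is a square of side SIZE / 10 centered at (long, lat)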
width, height = SIZE / 10, SIZE / 10
minx = long - 0.5 * width
maxx = long + 0.5 * width
miny = lat - 0.5 * height
    maxy = lat + 0.5 * height
coordinates = [(minx, miny), (minx, maxy), (maxx, maxy), (maxx, miny), (minx, miny)]
polygon = Polygon(coordinates)

data_row = {
"latitude": lat,
"longitude": long,
"area_in_meters": 1.0,
"confidence": 1.0,
"geometry": polygon.wkt,
"full_plus_code": "ABC",
}

return data_row


def create_buildings_data():
fourth = SIZE / 4
    # two sample buildings, one per opposite quadrant of the tile
dict_data = [
create_csv_data_row(fourth, fourth),
create_csv_data_row(SIZE - fourth, SIZE - fourth),
]
return dict_data


if __name__ == "__main__":
csvname = "000_buildings.csv"
zipfilename = csvname + ".gz"

# create and save metadata
meta_data = create_meta_data_file(zipfilename)
with open("tiles.geojson", "w") as fp:
json.dump(meta_data, fp)

# create and archive buildings data
buildings_data = create_buildings_data()
keys = buildings_data[0].keys()
with open(csvname, "w") as f:
w = csv.DictWriter(f, keys)
w.writeheader()
w.writerows(buildings_data)

    # compress the csv with gzip
with open(csvname, "rb") as f_in:
with gzip.open(zipfilename, "wb") as f_out:
shutil.copyfileobj(f_in, f_out)

# Compute checksums
with open(zipfilename, "rb") as f:
md5 = hashlib.md5(f.read()).hexdigest()
print(f"{zipfilename}: {md5}")

# remove csv file
os.remove(csvname)
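Running python data.py regenerates tiles.geojson and 000_buildings.csv.gz and prints the gzip archive's md5; that printed value is what the tests below patch into OpenBuildings.md5s.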
1 change: 1 addition & 0 deletions tests/data/openbuildings/tiles.geojson
@@ -0,0 +1 @@
{"type": "FeatureCollection", "features": [{"type": "Feature", "geometry": {"type": "Polygon", "coordinates": [[[0.0, 0.0], [0.0, 0.05], [0.05, 0.05], [0.05, 0.0], [0.0, 0.0]]]}, "properties": {"tile_id": "025", "tile_url": "polygons_s2_level_4_gzip/000_buildings.csv.gz", "size_mb": 0.2}}]}
151 changes: 151 additions & 0 deletions tests/datasets/test_openbuildings.py
@@ -0,0 +1,151 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

import builtins
import json
import os
import shutil
from pathlib import Path
from typing import Any, Generator

import pandas as pd
import pytest
import torch
import torch.nn as nn
from _pytest.fixtures import SubRequest
from _pytest.monkeypatch import MonkeyPatch
from rasterio.crs import CRS

from torchgeo.datasets import (
BoundingBox,
IntersectionDataset,
OpenBuildings,
UnionDataset,
)


class TestOpenBuildings:
@pytest.fixture
def dataset(
        self, monkeypatch: MonkeyPatch, tmp_path: Path
    ) -> OpenBuildings:
        root = str(tmp_path)
shutil.copy(
os.path.join("tests", "data", "openbuildings", "tiles.geojson"), root
)
shutil.copy(
os.path.join("tests", "data", "openbuildings", "000_buildings.csv.gz"), root
)

md5s = {"000_buildings.csv.gz": "20aeeec9d45a0ce4d772a26e0bcbc25f"}

monkeypatch.setattr(OpenBuildings, "md5s", md5s) # type: ignore[attr-defined]
transforms = nn.Identity() # type: ignore[attr-defined]
return OpenBuildings(root=root, transforms=transforms)

@pytest.fixture(params=["pandas"])
def mock_missing_module(
        self, monkeypatch: MonkeyPatch, request: SubRequest
) -> str:
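        # simulate a missing optional dependency by patching builtins.__import__
        # so that importing the target package raises ImportError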
import_orig = builtins.__import__
package = str(request.param)

def mocked_import(name: str, *args: Any, **kwargs: Any) -> Any:
if name == package:
raise ImportError()
return import_orig(name, *args, **kwargs)

monkeypatch.setattr( # type: ignore[attr-defined]
builtins, "__import__", mocked_import
)
return package

def test_mock_missing_module(
self, dataset: OpenBuildings, mock_missing_module: str
) -> None:
package = mock_missing_module

with pytest.raises(
ImportError,
match=f"{package} is not installed and is required to use this dataset",
):
OpenBuildings(root=dataset.root)

def test_no_shapes_to_rasterize(
self, dataset: OpenBuildings, tmp_path: Path
) -> None:
        # truncate the buildings csv so there is nothing to rasterize
path = os.path.join(tmp_path, "000_buildings.csv.gz")
df = pd.read_csv(path)
df = pd.DataFrame(columns=df.columns)
df.to_csv(path, compression="gzip")
x = dataset[dataset.bounds]
assert isinstance(x, dict)
assert isinstance(x["crs"], CRS)
assert isinstance(x["mask"], torch.Tensor)

def test_no_building_data_found(self, tmp_path: Path) -> None:
false_root = os.path.join(tmp_path, "empty")
os.makedirs(false_root)
shutil.copy(
os.path.join("tests", "data", "openbuildings", "tiles.geojson"), false_root
)
with pytest.raises(
RuntimeError, match="have manually downloaded the dataset as suggested "
):
OpenBuildings(root=false_root)

def test_corrupted(self, dataset: OpenBuildings, tmp_path: Path) -> None:
with open(os.path.join(tmp_path, "000_buildings.csv.gz"), "w") as f:
f.write("bad")
with pytest.raises(RuntimeError, match="Dataset found, but corrupted."):
OpenBuildings(dataset.root, checksum=True)

def test_no_meta_data_found(self, tmp_path: Path) -> None:
false_root = os.path.join(tmp_path, "empty")
os.makedirs(false_root)
with pytest.raises(FileNotFoundError, match="Meta data file"):
OpenBuildings(root=false_root)

def test_nothing_in_index(self, dataset: OpenBuildings, tmp_path: Path) -> None:
        # point the metadata's 'tile_url' at a file that does not exist so no match is found
with open(os.path.join(tmp_path, "tiles.geojson"), "r") as f:
content = json.load(f)
content["features"][0]["properties"]["tile_url"] = "mismatch.csv.gz"

with open(os.path.join(tmp_path, "tiles.geojson"), "w") as f:
json.dump(content, f)

with pytest.raises(FileNotFoundError, match="data was found in"):
OpenBuildings(dataset.root)

def test_getitem(self, dataset: OpenBuildings) -> None:
x = dataset[dataset.bounds]
assert isinstance(x, dict)
assert isinstance(x["crs"], CRS)
assert isinstance(x["mask"], torch.Tensor)

def test_and(self, dataset: OpenBuildings) -> None:
ds = dataset & dataset
assert isinstance(ds, IntersectionDataset)

def test_or(self, dataset: OpenBuildings) -> None:
ds = dataset | dataset
assert isinstance(ds, UnionDataset)

def test_invalid_query(self, dataset: OpenBuildings) -> None:
query = BoundingBox(100, 100, 100, 100, 0, 0)
with pytest.raises(
IndexError, match="query: .* not found in index with bounds:"
):
dataset[query]

def test_plot(self, dataset: OpenBuildings) -> None:
x = dataset[dataset.bounds]
dataset.plot(x, suptitle="test")

def test_plot_prediction(self, dataset: OpenBuildings) -> None:
x = dataset[dataset.bounds]
x["prediction"] = x["mask"].clone()
dataset.plot(x, suptitle="Prediction")
2 changes: 2 additions & 0 deletions torchgeo/datasets/__init__.py
@@ -61,6 +61,7 @@
from .naip import NAIP
from .nasa_marine_debris import NASAMarineDebris
from .nwpu import VHR10
from .openbuildings import OpenBuildings
from .oscd import OSCD
from .patternnet import PatternNet
from .potsdam import Potsdam2D
@@ -110,6 +111,7 @@
"Landsat8",
"Landsat9",
"NAIP",
"OpenBuildings",
"Sentinel",
"Sentinel2",
# VisionDataset