Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding the CVPR Chesapeake dataset #100

Merged
merged 16 commits into from
Sep 3, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,12 @@ dependencies:
- numpy
- pip
- pycocotools
- pyproj>=2.2.0
- python>=3.6
- pytorch-gpu>=1.7
- rarfile>=3
- rasterio>=1.0.16
- shapely>=1.3.0
- torchvision>=0.3
- pip:
- black>=21.4b0
Expand Down
4 changes: 4 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ pillow>=2.9
pycocotools
# pydocstyle 6.1+ required for pyproject.toml support
pydocstyle[toml]>=6.1
# pyproj 2.2.0+ required for CRS object, note: pyproj 3.1+ contains threadsafe Transformer and CRS implementations
pyproj>=2.2.0
# pytest 6+ required for pyproject.toml support
pytest>=6
# pytest-cov 2.4+ required for pytest --cov flags
Expand All @@ -50,6 +52,8 @@ scikit-learn>=0.18
segmentation-models-pytorch>=0.2
# setuptools 30.4+ required for options.packages.find section in setup.cfg
setuptools>=30.4
# shapely 1.3.0+ is required for Python 3.x
shapely>=1.3.0
# sphinx 3+ required for autodoc_typehints = description
sphinx>=3
# torch 1.7+ required for typing
Expand Down
2 changes: 2 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,10 @@ install_requires =
matplotlib
numpy
pillow>=2.9
pyproj>=2.2.0
rasterio>=1.0.16
rtree>=0.5
shapely>=1.3.0
torch>=1.7
torchvision>=0.3
python_requires = >= 3.6
Expand Down
2 changes: 2 additions & 0 deletions spack.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ spack:
- "[email protected]:"
- "py-pycocotools"
- "[email protected]:+toml"
- "[email protected]:"
- "py-pytest@6:"
- "[email protected]:"
- "[email protected]:"
Expand All @@ -29,6 +30,7 @@ spack:
- "[email protected]:"
- "[email protected]:"
- "[email protected]:"
- "[email protected]:"
- "py-sphinx@3:"
- "[email protected]:"
- "py-torchmetrics"
Expand Down
2 changes: 2 additions & 0 deletions torchgeo/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
Chesapeake,
Chesapeake7,
Chesapeake13,
ChesapeakeCVPR,
ChesapeakeDC,
ChesapeakeDE,
ChesapeakeMD,
Expand Down Expand Up @@ -57,6 +58,7 @@
"ChesapeakePA",
"ChesapeakeVA",
"ChesapeakeWV",
"ChesapeakeCVPR",
"Landsat",
"Landsat1",
"Landsat2",
Expand Down
223 changes: 214 additions & 9 deletions torchgeo/datasets/chesapeake.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,23 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

"""Chesapeake Bay High-Resolution Land Cover Project dataset."""
"""Chesapeake Bay High-Resolution Land Cover Project datasets."""

import abc
import os
from typing import Any, Callable, Dict, Optional

import sys
from typing import Any, Callable, Dict, List, Optional

import fiona
import pyproj
import rasterio
import rasterio.mask
import shapely.geometry
import shapely.ops
from rasterio.crs import CRS

from .geo import RasterDataset
from .utils import check_integrity, download_and_extract_archive
from .geo import GeoDataset, RasterDataset
from .utils import BoundingBox, check_integrity, download_and_extract_archive


class Chesapeake(RasterDataset, abc.ABC):
Expand All @@ -33,10 +40,6 @@ class Chesapeake(RasterDataset, abc.ABC):
<https://chesapeakeconservancy.org/wp-content/uploads/2020/03/LC_Class_Descriptions.pdf>`_
* `Accuracy Assessment
<https://chesapeakeconservancy.org/wp-content/uploads/2017/01/Chesapeake_Conservancy_Accuracy_Assessment_Methodology.pdf>`_

If you use this dataset in your research, please cite the following paper:

* https://doi.org/10.1109/cvpr.2019.01301
"""

# TODO: this shouldn't be needed, but .tif.ovr file is getting picked up
Expand Down Expand Up @@ -266,3 +269,205 @@ class ChesapeakeWV(Chesapeake):
filename = "WV_STATEWIDE.tif"
zipfile = "_WV_STATEWIDE.zip"
md5 = "350621ea293651fbc557a1c3e3c64cc3"


class ChesapeakeCVPR(GeoDataset):
"""CVPR 2019 Chesapeake Land Cover dataset.

The `CVPR 2019 Chesapeake Land Cover
<https://lila.science/datasets/chesapeakelandcover>`_ dataset contains two layers of
NAIP aerial imagery, Landsat 8 leaf-on and leaf-off imagery, Chesapeake Bay land
cover labels, NLCD land cover labels, and Microsoft building footprint labels.

This dataset was organized to accompany the 2019 CVPR paper, "Large Scale
High-Resolution Land Cover Mapping with Multi-Resolution Data".

If you use this dataset in your research, please cite the following paper:

* https://doi.org/10.1109/cvpr.2019.01301
"""

url = "https://lilablobssc.blob.core.windows.net/lcmcvpr2019/cvpr_chesapeake_landcover.zip" # noqa: E501
filename = "cvpr_chesapeake_landcover.zip"
md5 = "0ea5e7cb861be3fb8a06fedaaaf91af9"

valid_layers = [
"naip-new",
"naip-old",
"landsat-leaf-on",
"landsat-leaf-off",
"nlcd",
"lc",
"buildings",
]
states = ["de", "md", "va", "wv", "pa", "ny"]
splits = (
[f"{state}-train" for state in states]
+ [f"{state}-val" for state in states]
+ [f"{state}-test" for state in states]
)

p_src_crs = pyproj.CRS("epsg:3857")
p_transformers = {
"epsg:26917": pyproj.Transformer.from_crs(
p_src_crs, pyproj.CRS("epsg:26917"), always_xy=True
).transform,
"epsg:26918": pyproj.Transformer.from_crs(
p_src_crs, pyproj.CRS("epsg:26918"), always_xy=True
).transform,
}

def __init__(
self,
root: str = "data",
split: str = "de-train",
layers: List[str] = ["naip-new", "lc"],
transforms: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None,
cache: bool = True,
download: bool = False,
checksum: bool = False,
) -> None:
"""Initialize a new Dataset instance.

Args:
root: root directory where dataset can be found
split: a string in the format "{state}-{train,val,test}" indicating the
subset of data to use, for example "ny-train"
layers: a list containing a subset of "naip-new", "naip-old", "lc", "nlcd",
"landsat-leaf-on", "landsat-leaf-off", "buildings" indicating which
layers to load
transforms: a function/transform that takes an input sample
and returns a transformed version
cache: if True, cache file handle to speed up repeated sampling
download: if True, download dataset and store it in the root directory
checksum: if True, check the MD5 of the downloaded files (may be slow)

Raises:
FileNotFoundError: if no files are found in ``root``
RuntimeError: if ``download=False`` but dataset is missing or checksum fails
"""
assert split in self.splits
assert all([layer in self.valid_layers for layer in layers])
super().__init__(transforms) # creates self.index and self.transform
self.root = root
self.layers = layers
self.cache = cache
self.checksum = checksum

if download:
self._download()

if not self._check_integrity():
raise RuntimeError(
"Dataset not found or corrupted. "
+ "You can use download=True to download it"
)

# Add all tiles into the index in epsg:3857 based on the included geojson
mint: float = 0
maxt: float = sys.maxsize
with fiona.open(os.path.join(root, "spatial_index.geojson"), "r") as f:
for i, row in enumerate(f):
if row["properties"]["split"] == split:
box = shapely.geometry.shape(row["geometry"])
minx, miny, maxx, maxy = box.bounds
coords = (minx, maxx, miny, maxy, mint, maxt)
self.index.insert(
i,
coords,
{
"naip-new": row["properties"]["naip-new"],
"naip-old": row["properties"]["naip-old"],
"landsat-leaf-on": row["properties"]["landsat-leaf-on"],
"landsat-leaf-off": row["properties"]["landsat-leaf-off"],
"lc": row["properties"]["lc"],
"nlcd": row["properties"]["nlcd"],
"buildings": row["properties"]["buildings"],
},
)

def __getitem__(self, query: BoundingBox) -> Dict[str, Any]:
"""Retrieve image/mask and metadata indexed by query.

Args:
query: (minx, maxx, miny, maxy, mint, maxt) coordinates to index

Returns:
sample of image/mask and metadata at that index

Raises:
IndexError: if query is not found in the index
"""
hits = self.index.intersection(query, objects=True)
filepaths = [hit.object for hit in hits]

sample = {
"crs": self.crs,
"bbox": query,
}

if len(filepaths) == 0:
raise IndexError(
f"query: {query} not found in index with bounds: {self.bounds}"
)
elif len(filepaths) == 1:
filenames = filepaths[0]
query_geom_transformed = None # is set by the first layer

minx, maxx, miny, maxy, mint, maxt = query
query_box = shapely.geometry.box(minx, miny, maxx, maxy)

for layer in self.layers:

fn = filenames[layer]

with rasterio.open(os.path.join(self.root, fn)) as f:
dst_crs = f.crs.to_string().lower()

if query_geom_transformed is None:
query_box_transformed = shapely.ops.transform(
self.p_transformers[dst_crs], query_box
).envelope
query_geom_transformed = shapely.geometry.mapping(
query_box_transformed
)

data, _ = rasterio.mask.mask(
f, [query_geom_transformed], crop=True, all_touched=True
)

sample[layer] = data.squeeze()

else:
raise IndexError(f"query: {query} spans multiple tiles which is not valid")

if self.transforms is not None:
sample = self.transforms(sample)

return sample

def _check_integrity(self) -> bool:
"""Check integrity of dataset.

Returns:
True if dataset files are found and/or MD5s match, else False
"""
integrity: bool = check_integrity(
os.path.join(self.root, self.filename),
self.md5 if self.checksum else None,
)

return integrity

def _download(self) -> None:
"""Download the dataset and extract it."""
if self._check_integrity():
print("Files already downloaded and verified")
return

download_and_extract_archive(
self.url,
self.root,
filename=self.filename,
md5=self.md5,
)