Skip to content

Commit

Permalink
Add GBIF dataset (microsoft#507)
Browse files Browse the repository at this point in the history
* Add GBIF dataset

* Typo fix

* Add tests

* Style fixes

* Don't ignore CSV files

* Testing...

* Fix coverage bug

* Add note about required dep
  • Loading branch information
adamjstewart authored May 6, 2022
1 parent ec085fb commit c8431c6
Show file tree
Hide file tree
Showing 7 changed files with 301 additions and 3 deletions.
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
/data/
/logs/
/output/
*.csv
*.pdf

# Spack
Expand Down
9 changes: 7 additions & 2 deletions docs/api/datasets.rst
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,11 @@ EU-DEM

.. autoclass:: EUDEM

GBIF
^^^^

.. autoclass:: GBIF

GlobBiomass
^^^^^^^^^^^

Expand Down Expand Up @@ -96,7 +101,7 @@ Open Buildings
^^^^^^^^^^^^^^

.. autoclass:: OpenBuildings

Sentinel
^^^^^^^^

Expand Down Expand Up @@ -238,7 +243,7 @@ SpaceNet
.. autoclass:: SpaceNet
.. autoclass:: SpaceNet1
.. autoclass:: SpaceNet2
.. autoclass:: SpaceNet3
.. autoclass:: SpaceNet3
.. autoclass:: SpaceNet4
.. autoclass:: SpaceNet5
.. autoclass:: SpaceNet7
Expand Down
7 changes: 7 additions & 0 deletions tests/data/gbif/0123456-012345678901234.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
gbifID datasetKey occurrenceID kingdom phylum class order family genus species infraspecificEpithet taxonRank scientificName verbatimScientificName verbatimScientificNameAuthorship countryCode locality stateProvince occurrenceStatus individualCount publishingOrgKey decimalLatitude decimalLongitude coordinateUncertaintyInMeters coordinatePrecision elevation elevationAccuracy depth depthAccuracy eventDate day month year taxonKey speciesKey basisOfRecord institutionCode collectionCode catalogNumber recordNumber identifiedBy dateIdentified license rightsHolder recordedBy typeStatus establishmentMeans lastInterpreted mediaType issue
Animalia Chordata Mammalia Primates Hominidae Homo Homo sapiens SPECIES Homo sapiens Linnaeus, 1758 Homo sapiens Linnaeus, 1758 Linnaeus, 1758 US Chicago Illinois PRESENT 1 41.881832 5 16 4 2022 1 1 HUMAN_OBSERVATION
Animalia Chordata Mammalia Primates Hominidae Homo Homo sapiens SPECIES Homo sapiens Linnaeus, 1758 Homo sapiens Linnaeus, 1758 Linnaeus, 1758 US Chicago Illinois PRESENT 1 41.881832 -87.623177 5 1 1 HUMAN_OBSERVATION
Animalia Chordata Mammalia Primates Hominidae Homo Homo sapiens SPECIES Homo sapiens Linnaeus, 1758 Homo sapiens Linnaeus, 1758 Linnaeus, 1758 US Chicago Illinois PRESENT 1 41.881832 -87.623177 5 2022 1 1 HUMAN_OBSERVATION
Animalia Chordata Mammalia Primates Hominidae Homo Homo sapiens SPECIES Homo sapiens Linnaeus, 1758 Homo sapiens Linnaeus, 1758 Linnaeus, 1758 US Chicago Illinois PRESENT 1 41.881832 -87.623177 5 12 2022 1 1 HUMAN_OBSERVATION
Animalia Chordata Mammalia Primates Hominidae Homo Homo sapiens SPECIES Homo sapiens Linnaeus, 1758 Homo sapiens Linnaeus, 1758 Linnaeus, 1758 US Chicago Illinois PRESENT 1 41.881832 -87.623177 5 -450 4 2022 1 1 HUMAN_OBSERVATION
Animalia Chordata Mammalia Primates Hominidae Homo Homo sapiens SPECIES Homo sapiens Linnaeus, 1758 Homo sapiens Linnaeus, 1758 Linnaeus, 1758 US Chicago Illinois PRESENT 1 41.881832 -87.623177 5 2022-04-16T10:13:35.123Z 16 4 2022 1 1 HUMAN_OBSERVATION
65 changes: 65 additions & 0 deletions tests/data/gbif/data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
#!/usr/bin/env python3

# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

import pandas as pd

# Name of the generated fake GBIF occurrence export.
filename = "0123456-012345678901234.csv"

# Number of fake occurrence records to generate.
size = 6

# Many columns are empty for every record; share one blank column for those.
blank = [""] * size

data = {
    "gbifID": blank,
    "datasetKey": blank,
    "occurrenceID": blank,
    "kingdom": ["Animalia"] * size,
    "phylum": ["Chordata"] * size,
    "class": ["Mammalia"] * size,
    "order": ["Primates"] * size,
    "family": ["Hominidae"] * size,
    "genus": ["Homo"] * size,
    "species": ["Homo sapiens"] * size,
    "infraspecificEpithet": blank,
    "taxonRank": ["SPECIES"] * size,
    "scientificName": ["Homo sapiens Linnaeus, 1758"] * size,
    "verbatimScientificName": ["Homo sapiens Linnaeus, 1758"] * size,
    "verbatimScientificNameAuthorship": ["Linnaeus, 1758"] * size,
    "countryCode": ["US"] * size,
    "locality": ["Chicago"] * size,
    "stateProvince": ["Illinois"] * size,
    "occurrenceStatus": ["PRESENT"] * size,
    "individualCount": [1] * size,
    "publishingOrgKey": blank,
    "decimalLatitude": [41.881832] * size,
    # First record has no longitude so loaders can exercise the skip path.
    "decimalLongitude": [""] + [-87.623177] * (size - 1),
    "coordinateUncertaintyInMeters": [5] * size,
    "coordinatePrecision": blank,
    "elevation": blank,
    "elevationAccuracy": blank,
    "depth": blank,
    "depthAccuracy": blank,
    # Mix of missing/partial/odd dates to exercise timestamp disambiguation.
    "eventDate": ["", "", "", "", -450, "2022-04-16T10:13:35.123Z"],
    "day": [16, "", "", "", "", 16],
    "month": [4, "", "", 12, 4, 4],
    "year": [2022, "", 2022, 2022, 2022, 2022],
    "taxonKey": [1] * size,
    "speciesKey": [1] * size,
    "basisOfRecord": ["HUMAN_OBSERVATION"] * size,
    "institutionCode": blank,
    "collectionCode": blank,
    "catalogNumber": blank,
    "recordNumber": blank,
    "identifiedBy": blank,
    "dateIdentified": blank,
    "license": blank,
    "rightsHolder": blank,
    "recordedBy": blank,
    "typeStatus": blank,
    "establishmentMeans": blank,
    "lastInterpreted": blank,
    "mediaType": blank,
    "issue": blank,
}

# GBIF exports are tab-delimited despite the .csv extension.
df = pd.DataFrame(data)
df.to_csv(filename, sep="\t", index=False)
67 changes: 67 additions & 0 deletions tests/datasets/test_gbif.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

import builtins
import os
from pathlib import Path
from typing import Any

import pytest
from _pytest.monkeypatch import MonkeyPatch

from torchgeo.datasets import GBIF, BoundingBox, IntersectionDataset, UnionDataset

# Skip every test in this module when pandas (an optional dependency of the
# GBIF dataset) is missing; minversion is presumably the minimum supported
# pandas version — confirm against the project's dependency spec.
pytest.importorskip("pandas", minversion="0.23.2")


class TestGBIF:
    """Tests for the GBIF point-occurrence dataset."""

    @pytest.fixture(scope="class")
    def dataset(self) -> GBIF:
        """Return a dataset backed by the small fixture CSV."""
        return GBIF(os.path.join("tests", "data", "gbif"))

    def test_getitem(self, dataset: GBIF) -> None:
        sample = dataset[dataset.bounds]
        assert isinstance(sample, dict)

    def test_len(self, dataset: GBIF) -> None:
        # The fixture has 6 rows; one lacks a longitude and is skipped on load.
        assert len(dataset) == 5

    def test_and(self, dataset: GBIF) -> None:
        intersection = dataset & dataset
        assert isinstance(intersection, IntersectionDataset)

    def test_or(self, dataset: GBIF) -> None:
        union = dataset | dataset
        assert isinstance(union, UnionDataset)

    def test_no_data(self, tmp_path: Path) -> None:
        # An empty directory must raise, not silently build an empty index.
        with pytest.raises(FileNotFoundError, match="Dataset not found"):
            GBIF(str(tmp_path))

    @pytest.fixture
    def mock_missing_module(self, monkeypatch: MonkeyPatch) -> None:
        """Make any ``import pandas`` fail for the duration of a test."""
        real_import = builtins.__import__

        def fake_import(name: str, *args: Any, **kwargs: Any) -> Any:
            if name == "pandas":
                raise ImportError()
            return real_import(name, *args, **kwargs)

        monkeypatch.setattr(builtins, "__import__", fake_import)

    def test_mock_missing_module(
        self, dataset: GBIF, mock_missing_module: None
    ) -> None:
        with pytest.raises(
            ImportError,
            match="pandas is not installed and is required to use this dataset",
        ):
            GBIF(dataset.root)

    def test_invalid_query(self, dataset: GBIF) -> None:
        # A degenerate box outside the data must raise IndexError.
        bbox = BoundingBox(0, 0, 0, 0, 0, 0)
        with pytest.raises(
            IndexError, match="query: .* not found in index with bounds:"
        ):
            dataset[bbox]
2 changes: 2 additions & 0 deletions torchgeo/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
from .eurosat import EuroSAT
from .fair1m import FAIR1M
from .forestdamage import ForestDamage
from .gbif import GBIF
from .geo import (
GeoDataset,
IntersectionDataset,
Expand Down Expand Up @@ -118,6 +119,7 @@
"CMSGlobalMangroveCanopy",
"Esri2020",
"EUDEM",
"GBIF",
"GlobBiomass",
"Landsat",
"Landsat1",
Expand Down
153 changes: 153 additions & 0 deletions torchgeo/datasets/gbif.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

"""Dataset for the Global Biodiversity Information Facility."""

import glob
import os
import sys
from datetime import datetime, timedelta
from typing import Any, Dict, Tuple

import numpy as np
from rasterio.crs import CRS

from .geo import GeoDataset
from .utils import BoundingBox


def _disambiguate_timestamps(
year: float, month: float, day: float
) -> Tuple[float, float]:
"""Disambiguate partial timestamps.
Based on :func:`torchgeo.datasets.utils.disambiguate_timestamps`.
Args:
year: year, possibly nan
month: month, possibly nan
day: day, possibly nan
Returns:
minimum and maximum possible time range
"""
if np.isnan(year):
# No temporal info
return 0, sys.maxsize
elif np.isnan(month):
# Year resolution
mint = datetime(int(year), 1, 1)
maxt = datetime(int(year) + 1, 1, 1)
elif np.isnan(day):
# Month resolution
mint = datetime(int(year), int(month), 1)
if month == 12:
maxt = datetime(int(year) + 1, 1, 1)
else:
maxt = datetime(int(year), int(month) + 1, 1)
else:
# Day resolution
mint = datetime(int(year), int(month), int(day))
maxt = mint + timedelta(days=1)

maxt -= timedelta(microseconds=1)

return mint.timestamp(), maxt.timestamp()


class GBIF(GeoDataset):
    """Dataset for the Global Biodiversity Information Facility.

    `GBIF <https://www.gbif.org/>`_, the Global Biodiversity Information Facility,
    is an international network and data infrastructure funded by the world's
    governments and aimed at providing anyone, anywhere, open access to data about
    all types of life on Earth.

    This dataset is intended for use with GBIF's
    `occurrence records <https://www.gbif.org/dataset/search>`_. It may or may not work
    for other GBIF `datasets <https://www.gbif.org/dataset/search>`_. Data for a
    particular species or region of interest can be downloaded from the above link.

    If you use a GBIF dataset in your research, please cite it according to:

    * https://www.gbif.org/citation-guidelines

    .. note::
       This dataset requires the following additional library to be installed:

       * `pandas <https://pypi.org/project/pandas/>`_ to load CSV files

    .. versionadded:: 0.3
    """

    # NOTE(review): point data — a resolution of 0 marks it as non-raster; confirm
    # this matches GeoDataset's expectations.
    res = 0
    _crs = CRS.from_epsg(4326)  # Lat/Lon

    def __init__(self, root: str = "data") -> None:
        """Initialize a new Dataset instance.

        Args:
            root: root directory where dataset can be found

        Raises:
            FileNotFoundError: if no files are found in ``root``
            ImportError: if pandas is not installed
        """
        super().__init__()

        self.root = root

        # GBIF exports are named like ``0123456-012345678901234.csv``.
        csvs = glob.glob(os.path.join(root, "**.csv"))
        if not csvs:
            raise FileNotFoundError(f"Dataset not found in `root={self.root}`")

        try:
            import pandas as pd  # noqa: F401
        except ImportError:
            raise ImportError(
                "pandas is not installed and is required to use this dataset"
            )

        # Despite the .csv extension, GBIF occurrence exports are tab-delimited.
        # Only the columns needed to build the spatiotemporal index are read.
        data = pd.read_table(
            csvs[0],
            engine="c",
            usecols=["decimalLatitude", "decimalLongitude", "day", "month", "year"],
        )

        # Convert from pandas DataFrame to rtree Index
        i = 0
        for y, x, day, month, year in data.itertuples(index=False, name=None):
            # Records without coordinates cannot be indexed spatially.
            if np.isnan(y) or np.isnan(x):
                continue

            mint, maxt = _disambiguate_timestamps(year, month, day)

            self.index.insert(i, (x, x, y, y, mint, maxt))
            i += 1

    def __getitem__(self, query: BoundingBox) -> Dict[str, Any]:
        """Retrieve metadata indexed by query.

        Args:
            query: (minx, maxx, miny, maxy, mint, maxt) coordinates to index

        Returns:
            sample of metadata at that index

        Raises:
            IndexError: if query is not found in the index
        """
        matches = self.index.intersection(tuple(query), objects=True)
        bboxes = [match.bbox for match in matches]

        if not bboxes:
            raise IndexError(
                f"query: {query} not found in index with bounds: {self.bounds}"
            )

        return {"crs": self.crs, "bbox": bboxes}

0 comments on commit c8431c6

Please sign in to comment.