Skip to content

Commit

Permalink
Add GBIF dataset (microsoft#507)
Browse files Browse the repository at this point in the history
* Add GBIF dataset

* Typo fix

* Add tests

* Style fixes

* Don't ignore CSV files

* Testing...

* Fix coverage bug

* Add note about required dep
  • Loading branch information
adamjstewart authored May 6, 2022
1 parent ec085fb commit c8431c6
Show file tree
Hide file tree
Showing 7 changed files with 301 additions and 3 deletions.
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
/data/
/logs/
/output/
*.csv
*.pdf

# Spack
Expand Down
9 changes: 7 additions & 2 deletions docs/api/datasets.rst
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,11 @@ EU-DEM

.. autoclass:: EUDEM

GBIF
^^^^

.. autoclass:: GBIF

GlobBiomass
^^^^^^^^^^^

Expand Down Expand Up @@ -96,7 +101,7 @@ Open Buildings
^^^^^^^^^^^^^^

.. autoclass:: OpenBuildings

Sentinel
^^^^^^^^

Expand Down Expand Up @@ -238,7 +243,7 @@ SpaceNet
.. autoclass:: SpaceNet
.. autoclass:: SpaceNet1
.. autoclass:: SpaceNet2
.. autoclass:: SpaceNet3
.. autoclass:: SpaceNet3
.. autoclass:: SpaceNet4
.. autoclass:: SpaceNet5
.. autoclass:: SpaceNet7
Expand Down
7 changes: 7 additions & 0 deletions tests/data/gbif/0123456-012345678901234.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
gbifID datasetKey occurrenceID kingdom phylum class order family genus species infraspecificEpithet taxonRank scientificName verbatimScientificName verbatimScientificNameAuthorship countryCode locality stateProvince occurrenceStatus individualCount publishingOrgKey decimalLatitude decimalLongitude coordinateUncertaintyInMeters coordinatePrecision elevation elevationAccuracy depth depthAccuracy eventDate day month year taxonKey speciesKey basisOfRecord institutionCode collectionCode catalogNumber recordNumber identifiedBy dateIdentified license rightsHolder recordedBy typeStatus establishmentMeans lastInterpreted mediaType issue
Animalia Chordata Mammalia Primates Hominidae Homo Homo sapiens SPECIES Homo sapiens Linnaeus, 1758 Homo sapiens Linnaeus, 1758 Linnaeus, 1758 US Chicago Illinois PRESENT 1 41.881832 5 16 4 2022 1 1 HUMAN_OBSERVATION
Animalia Chordata Mammalia Primates Hominidae Homo Homo sapiens SPECIES Homo sapiens Linnaeus, 1758 Homo sapiens Linnaeus, 1758 Linnaeus, 1758 US Chicago Illinois PRESENT 1 41.881832 -87.623177 5 1 1 HUMAN_OBSERVATION
Animalia Chordata Mammalia Primates Hominidae Homo Homo sapiens SPECIES Homo sapiens Linnaeus, 1758 Homo sapiens Linnaeus, 1758 Linnaeus, 1758 US Chicago Illinois PRESENT 1 41.881832 -87.623177 5 2022 1 1 HUMAN_OBSERVATION
Animalia Chordata Mammalia Primates Hominidae Homo Homo sapiens SPECIES Homo sapiens Linnaeus, 1758 Homo sapiens Linnaeus, 1758 Linnaeus, 1758 US Chicago Illinois PRESENT 1 41.881832 -87.623177 5 12 2022 1 1 HUMAN_OBSERVATION
Animalia Chordata Mammalia Primates Hominidae Homo Homo sapiens SPECIES Homo sapiens Linnaeus, 1758 Homo sapiens Linnaeus, 1758 Linnaeus, 1758 US Chicago Illinois PRESENT 1 41.881832 -87.623177 5 -450 4 2022 1 1 HUMAN_OBSERVATION
Animalia Chordata Mammalia Primates Hominidae Homo Homo sapiens SPECIES Homo sapiens Linnaeus, 1758 Homo sapiens Linnaeus, 1758 Linnaeus, 1758 US Chicago Illinois PRESENT 1 41.881832 -87.623177 5 2022-04-16T10:13:35.123Z 16 4 2022 1 1 HUMAN_OBSERVATION
65 changes: 65 additions & 0 deletions tests/data/gbif/data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
#!/usr/bin/env python3

# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

import pandas as pd

# Name of the generated fake GBIF occurrence export.
filename = "0123456-012345678901234.csv"

# Number of fake occurrence records to generate.
size = 6

# Many columns are empty for every record; share one blank column for those.
blank = [""] * size

data = {
    "gbifID": blank,
    "datasetKey": blank,
    "occurrenceID": blank,
    "kingdom": ["Animalia"] * size,
    "phylum": ["Chordata"] * size,
    "class": ["Mammalia"] * size,
    "order": ["Primates"] * size,
    "family": ["Hominidae"] * size,
    "genus": ["Homo"] * size,
    "species": ["Homo sapiens"] * size,
    "infraspecificEpithet": blank,
    "taxonRank": ["SPECIES"] * size,
    "scientificName": ["Homo sapiens Linnaeus, 1758"] * size,
    "verbatimScientificName": ["Homo sapiens Linnaeus, 1758"] * size,
    "verbatimScientificNameAuthorship": ["Linnaeus, 1758"] * size,
    "countryCode": ["US"] * size,
    "locality": ["Chicago"] * size,
    "stateProvince": ["Illinois"] * size,
    "occurrenceStatus": ["PRESENT"] * size,
    "individualCount": [1] * size,
    "publishingOrgKey": blank,
    "decimalLatitude": [41.881832] * size,
    # First record has no longitude so loaders can exercise the skip path.
    "decimalLongitude": [""] + [-87.623177] * (size - 1),
    "coordinateUncertaintyInMeters": [5] * size,
    "coordinatePrecision": blank,
    "elevation": blank,
    "elevationAccuracy": blank,
    "depth": blank,
    "depthAccuracy": blank,
    # Mix of missing/partial/odd dates to exercise timestamp disambiguation.
    "eventDate": ["", "", "", "", -450, "2022-04-16T10:13:35.123Z"],
    "day": [16, "", "", "", "", 16],
    "month": [4, "", "", 12, 4, 4],
    "year": [2022, "", 2022, 2022, 2022, 2022],
    "taxonKey": [1] * size,
    "speciesKey": [1] * size,
    "basisOfRecord": ["HUMAN_OBSERVATION"] * size,
    "institutionCode": blank,
    "collectionCode": blank,
    "catalogNumber": blank,
    "recordNumber": blank,
    "identifiedBy": blank,
    "dateIdentified": blank,
    "license": blank,
    "rightsHolder": blank,
    "recordedBy": blank,
    "typeStatus": blank,
    "establishmentMeans": blank,
    "lastInterpreted": blank,
    "mediaType": blank,
    "issue": blank,
}

# GBIF exports are tab-delimited despite the .csv extension.
df = pd.DataFrame(data)
df.to_csv(filename, sep="\t", index=False)
67 changes: 67 additions & 0 deletions tests/datasets/test_gbif.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

import builtins
import os
from pathlib import Path
from typing import Any

import pytest
from _pytest.monkeypatch import MonkeyPatch

from torchgeo.datasets import GBIF, BoundingBox, IntersectionDataset, UnionDataset

# Skip every test in this module when pandas (an optional dependency of the
# GBIF dataset) is missing; minversion is presumably the minimum supported
# pandas version — confirm against the project's dependency spec.
pytest.importorskip("pandas", minversion="0.23.2")


class TestGBIF:
    """Tests for the GBIF point-occurrence dataset."""

    @pytest.fixture(scope="class")
    def dataset(self) -> GBIF:
        """Return a dataset backed by the small fixture CSV."""
        return GBIF(os.path.join("tests", "data", "gbif"))

    def test_getitem(self, dataset: GBIF) -> None:
        sample = dataset[dataset.bounds]
        assert isinstance(sample, dict)

    def test_len(self, dataset: GBIF) -> None:
        # The fixture has 6 rows; one lacks a longitude and is skipped on load.
        assert len(dataset) == 5

    def test_and(self, dataset: GBIF) -> None:
        intersection = dataset & dataset
        assert isinstance(intersection, IntersectionDataset)

    def test_or(self, dataset: GBIF) -> None:
        union = dataset | dataset
        assert isinstance(union, UnionDataset)

    def test_no_data(self, tmp_path: Path) -> None:
        # An empty directory must raise, not silently build an empty index.
        with pytest.raises(FileNotFoundError, match="Dataset not found"):
            GBIF(str(tmp_path))

    @pytest.fixture
    def mock_missing_module(self, monkeypatch: MonkeyPatch) -> None:
        """Make any ``import pandas`` fail for the duration of a test."""
        real_import = builtins.__import__

        def fake_import(name: str, *args: Any, **kwargs: Any) -> Any:
            if name == "pandas":
                raise ImportError()
            return real_import(name, *args, **kwargs)

        monkeypatch.setattr(builtins, "__import__", fake_import)

    def test_mock_missing_module(
        self, dataset: GBIF, mock_missing_module: None
    ) -> None:
        with pytest.raises(
            ImportError,
            match="pandas is not installed and is required to use this dataset",
        ):
            GBIF(dataset.root)

    def test_invalid_query(self, dataset: GBIF) -> None:
        # A degenerate box outside the data must raise IndexError.
        bbox = BoundingBox(0, 0, 0, 0, 0, 0)
        with pytest.raises(
            IndexError, match="query: .* not found in index with bounds:"
        ):
            dataset[bbox]
2 changes: 2 additions & 0 deletions torchgeo/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
from .eurosat import EuroSAT
from .fair1m import FAIR1M
from .forestdamage import ForestDamage
from .gbif import GBIF
from .geo import (
GeoDataset,
IntersectionDataset,
Expand Down Expand Up @@ -118,6 +119,7 @@
"CMSGlobalMangroveCanopy",
"Esri2020",
"EUDEM",
"GBIF",
"GlobBiomass",
"Landsat",
"Landsat1",
Expand Down
153 changes: 153 additions & 0 deletions torchgeo/datasets/gbif.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

"""Dataset for the Global Biodiversity Information Facility."""

import glob
import os
import sys
from datetime import datetime, timedelta
from typing import Any, Dict, Tuple

import numpy as np
from rasterio.crs import CRS

from .geo import GeoDataset
from .utils import BoundingBox


def _disambiguate_timestamps(
year: float, month: float, day: float
) -> Tuple[float, float]:
"""Disambiguate partial timestamps.
Based on :func:`torchgeo.datasets.utils.disambiguate_timestamps`.
Args:
year: year, possibly nan
month: month, possibly nan
day: day, possibly nan
Returns:
minimum and maximum possible time range
"""
if np.isnan(year):
# No temporal info
return 0, sys.maxsize
elif np.isnan(month):
# Year resolution
mint = datetime(int(year), 1, 1)
maxt = datetime(int(year) + 1, 1, 1)
elif np.isnan(day):
# Month resolution
mint = datetime(int(year), int(month), 1)
if month == 12:
maxt = datetime(int(year) + 1, 1, 1)
else:
maxt = datetime(int(year), int(month) + 1, 1)
else:
# Day resolution
mint = datetime(int(year), int(month), int(day))
maxt = mint + timedelta(days=1)

maxt -= timedelta(microseconds=1)

return mint.timestamp(), maxt.timestamp()


class GBIF(GeoDataset):
    """Dataset for the Global Biodiversity Information Facility.

    `GBIF <https://www.gbif.org/>`_, the Global Biodiversity Information Facility,
    is an international network and data infrastructure funded by the world's
    governments and aimed at providing anyone, anywhere, open access to data about
    all types of life on Earth.

    This dataset is intended for use with GBIF's
    `occurrence records <https://www.gbif.org/dataset/search>`_. It may or may not work
    for other GBIF `datasets <https://www.gbif.org/dataset/search>`_. Data for a
    particular species or region of interest can be downloaded from the above link.

    If you use a GBIF dataset in your research, please cite it according to:

    * https://www.gbif.org/citation-guidelines

    .. note::
       This dataset requires the following additional library to be installed:

       * `pandas <https://pypi.org/project/pandas/>`_ to load CSV files

    .. versionadded:: 0.3
    """

    # NOTE(review): point data — a resolution of 0 marks it as non-raster; confirm
    # this matches GeoDataset's expectations.
    res = 0
    _crs = CRS.from_epsg(4326)  # Lat/Lon

    def __init__(self, root: str = "data") -> None:
        """Initialize a new Dataset instance.

        Args:
            root: root directory where dataset can be found

        Raises:
            FileNotFoundError: if no files are found in ``root``
            ImportError: if pandas is not installed
        """
        super().__init__()

        self.root = root

        # GBIF exports are named like ``0123456-012345678901234.csv``.
        csvs = glob.glob(os.path.join(root, "**.csv"))
        if not csvs:
            raise FileNotFoundError(f"Dataset not found in `root={self.root}`")

        try:
            import pandas as pd  # noqa: F401
        except ImportError:
            raise ImportError(
                "pandas is not installed and is required to use this dataset"
            )

        # Despite the .csv extension, GBIF occurrence exports are tab-delimited.
        # Only the columns needed to build the spatiotemporal index are read.
        data = pd.read_table(
            csvs[0],
            engine="c",
            usecols=["decimalLatitude", "decimalLongitude", "day", "month", "year"],
        )

        # Convert from pandas DataFrame to rtree Index
        i = 0
        for y, x, day, month, year in data.itertuples(index=False, name=None):
            # Records without coordinates cannot be indexed spatially.
            if np.isnan(y) or np.isnan(x):
                continue

            mint, maxt = _disambiguate_timestamps(year, month, day)

            self.index.insert(i, (x, x, y, y, mint, maxt))
            i += 1

    def __getitem__(self, query: BoundingBox) -> Dict[str, Any]:
        """Retrieve metadata indexed by query.

        Args:
            query: (minx, maxx, miny, maxy, mint, maxt) coordinates to index

        Returns:
            sample of metadata at that index

        Raises:
            IndexError: if query is not found in the index
        """
        matches = self.index.intersection(tuple(query), objects=True)
        bboxes = [match.bbox for match in matches]

        if not bboxes:
            raise IndexError(
                f"query: {query} not found in index with bounds: {self.bounds}"
            )

        return {"crs": self.crs, "bbox": bboxes}

0 comments on commit c8431c6

Please sign in to comment.