forked from microsoft/torchgeo
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Add GBIF dataset * Typo fix * Add tests * Style fixes * Don't ignore CSV files * Testing... * Fix coverage bug * Add note about required dep
- Loading branch information
1 parent
ec085fb
commit c8431c6
Showing
7 changed files
with
301 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,7 +2,6 @@ | |
/data/ | ||
/logs/ | ||
/output/ | ||
*.csv | ||
|
||
# Spack | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
gbifID datasetKey occurrenceID kingdom phylum class order family genus species infraspecificEpithet taxonRank scientificName verbatimScientificName verbatimScientificNameAuthorship countryCode locality stateProvince occurrenceStatus individualCount publishingOrgKey decimalLatitude decimalLongitude coordinateUncertaintyInMeters coordinatePrecision elevation elevationAccuracy depth depthAccuracy eventDate day month year taxonKey speciesKey basisOfRecord institutionCode collectionCode catalogNumber recordNumber identifiedBy dateIdentified license rightsHolder recordedBy typeStatus establishmentMeans lastInterpreted mediaType issue | ||
Animalia Chordata Mammalia Primates Hominidae Homo Homo sapiens SPECIES Homo sapiens Linnaeus, 1758 Homo sapiens Linnaeus, 1758 Linnaeus, 1758 US Chicago Illinois PRESENT 1 41.881832 5 16 4 2022 1 1 HUMAN_OBSERVATION | ||
Animalia Chordata Mammalia Primates Hominidae Homo Homo sapiens SPECIES Homo sapiens Linnaeus, 1758 Homo sapiens Linnaeus, 1758 Linnaeus, 1758 US Chicago Illinois PRESENT 1 41.881832 -87.623177 5 1 1 HUMAN_OBSERVATION | ||
Animalia Chordata Mammalia Primates Hominidae Homo Homo sapiens SPECIES Homo sapiens Linnaeus, 1758 Homo sapiens Linnaeus, 1758 Linnaeus, 1758 US Chicago Illinois PRESENT 1 41.881832 -87.623177 5 2022 1 1 HUMAN_OBSERVATION | ||
Animalia Chordata Mammalia Primates Hominidae Homo Homo sapiens SPECIES Homo sapiens Linnaeus, 1758 Homo sapiens Linnaeus, 1758 Linnaeus, 1758 US Chicago Illinois PRESENT 1 41.881832 -87.623177 5 12 2022 1 1 HUMAN_OBSERVATION | ||
Animalia Chordata Mammalia Primates Hominidae Homo Homo sapiens SPECIES Homo sapiens Linnaeus, 1758 Homo sapiens Linnaeus, 1758 Linnaeus, 1758 US Chicago Illinois PRESENT 1 41.881832 -87.623177 5 -450 4 2022 1 1 HUMAN_OBSERVATION | ||
Animalia Chordata Mammalia Primates Hominidae Homo Homo sapiens SPECIES Homo sapiens Linnaeus, 1758 Homo sapiens Linnaeus, 1758 Linnaeus, 1758 US Chicago Illinois PRESENT 1 41.881832 -87.623177 5 2022-04-16T10:13:35.123Z 16 4 2022 1 1 HUMAN_OBSERVATION |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
#!/usr/bin/env python3 | ||
|
||
# Copyright (c) Microsoft Corporation. All rights reserved. | ||
# Licensed under the MIT License. | ||
|
||
import pandas as pd | ||
|
||
# Output file, written relative to the current working directory.
filename = "0123456-012345678901234.csv"

# Number of occurrence rows in the fixture. Every list below has this length.
size = 6

# Column name -> per-row values. Dict order defines the column order of the
# written file. Empty strings stand in for missing values.
#
# Rows differ only in longitude and in the temporal columns:
#   row 0: no longitude; day, month and year present
#   row 1: no temporal information at all
#   row 2: year only
#   row 3: month and year
#   row 4: month and year, with a bare "-450" eventDate
#   row 5: full eventDate plus day, month and year
data = {
    "gbifID": [""] * size,
    "datasetKey": [""] * size,
    "occurrenceID": [""] * size,
    "kingdom": ["Animalia"] * size,
    "phylum": ["Chordata"] * size,
    "class": ["Mammalia"] * size,
    "order": ["Primates"] * size,
    "family": ["Hominidae"] * size,
    "genus": ["Homo"] * size,
    "species": ["Homo sapiens"] * size,
    "infraspecificEpithet": [""] * size,
    "taxonRank": ["SPECIES"] * size,
    "scientificName": ["Homo sapiens Linnaeus, 1758"] * size,
    "verbatimScientificName": ["Homo sapiens Linnaeus, 1758"] * size,
    "verbatimScientificNameAuthorship": ["Linnaeus, 1758"] * size,
    "countryCode": ["US"] * size,
    "locality": ["Chicago"] * size,
    "stateProvince": ["Illinois"] * size,
    "occurrenceStatus": ["PRESENT"] * size,
    "individualCount": [1] * size,
    "publishingOrgKey": [""] * size,
    "decimalLatitude": [41.881832] * size,
    # First row deliberately has no longitude.
    "decimalLongitude": [""] + [-87.623177] * (size - 1),
    "coordinateUncertaintyInMeters": [5] * size,
    "coordinatePrecision": [""] * size,
    "elevation": [""] * size,
    "elevationAccuracy": [""] * size,
    "depth": [""] * size,
    "depthAccuracy": [""] * size,
    "eventDate": ["", "", "", "", -450, "2022-04-16T10:13:35.123Z"],
    "day": [16, "", "", "", "", 16],
    "month": [4, "", "", 12, 4, 4],
    "year": [2022, "", 2022, 2022, 2022, 2022],
    "taxonKey": [1] * size,
    "speciesKey": [1] * size,
    "basisOfRecord": ["HUMAN_OBSERVATION"] * size,
    "institutionCode": [""] * size,
    "collectionCode": [""] * size,
    "catalogNumber": [""] * size,
    "recordNumber": [""] * size,
    "identifiedBy": [""] * size,
    "dateIdentified": [""] * size,
    "license": [""] * size,
    "rightsHolder": [""] * size,
    "recordedBy": [""] * size,
    "typeStatus": [""] * size,
    "establishmentMeans": [""] * size,
    "lastInterpreted": [""] * size,
    "mediaType": [""] * size,
    "issue": [""] * size,
}

# Despite the ".csv" extension the file is tab-delimited. The row index is
# omitted so that only the columns above are written.
df = pd.DataFrame(data)
df.to_csv(filename, sep="\t", index=False)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
# Copyright (c) Microsoft Corporation. All rights reserved. | ||
# Licensed under the MIT License. | ||
|
||
import builtins | ||
import os | ||
from pathlib import Path | ||
from typing import Any | ||
|
||
import pytest | ||
from _pytest.monkeypatch import MonkeyPatch | ||
|
||
from torchgeo.datasets import GBIF, BoundingBox, IntersectionDataset, UnionDataset | ||
|
||
pytest.importorskip("pandas", minversion="0.23.2") | ||
|
||
|
||
class TestGBIF:
    """Tests for the GBIF dataset, run against the fixture in tests/data/gbif."""

    @pytest.fixture(scope="class")
    def dataset(self) -> GBIF:
        """Build one GBIF dataset from the test fixture, shared by the class."""
        root = os.path.join("tests", "data", "gbif")
        return GBIF(root)

    def test_getitem(self, dataset: GBIF) -> None:
        """Querying the dataset's full bounds yields a dict sample."""
        x = dataset[dataset.bounds]
        assert isinstance(x, dict)

    def test_len(self, dataset: GBIF) -> None:
        """The fixture is expected to contribute five indexed records."""
        assert len(dataset) == 5

    def test_and(self, dataset: GBIF) -> None:
        """``&`` composes two datasets into an IntersectionDataset."""
        ds = dataset & dataset
        assert isinstance(ds, IntersectionDataset)

    def test_or(self, dataset: GBIF) -> None:
        """``|`` composes two datasets into a UnionDataset."""
        ds = dataset | dataset
        assert isinstance(ds, UnionDataset)

    def test_no_data(self, tmp_path: Path) -> None:
        """An empty root directory raises FileNotFoundError."""
        with pytest.raises(FileNotFoundError, match="Dataset not found"):
            GBIF(str(tmp_path))

    @pytest.fixture
    def mock_missing_module(self, monkeypatch: MonkeyPatch) -> None:
        """Make every ``import pandas`` fail for the duration of one test."""
        import_orig = builtins.__import__

        def mocked_import(name: str, *args: Any, **kwargs: Any) -> Any:
            # Only pandas is blocked; all other imports go to the real hook.
            if name == "pandas":
                raise ImportError()
            return import_orig(name, *args, **kwargs)

        monkeypatch.setattr(builtins, "__import__", mocked_import)

    def test_mock_missing_module(
        self, dataset: GBIF, mock_missing_module: None
    ) -> None:
        """Constructing GBIF without pandas raises a descriptive ImportError."""
        with pytest.raises(
            ImportError,
            match="pandas is not installed and is required to use this dataset",
        ):
            GBIF(dataset.root)

    def test_invalid_query(self, dataset: GBIF) -> None:
        """A query that matches no record raises IndexError."""
        query = BoundingBox(0, 0, 0, 0, 0, 0)
        with pytest.raises(
            IndexError, match="query: .* not found in index with bounds:"
        ):
            dataset[query]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,153 @@ | ||
# Copyright (c) Microsoft Corporation. All rights reserved. | ||
# Licensed under the MIT License. | ||
|
||
"""Dataset for the Global Biodiversity Information Facility.""" | ||
|
||
import glob | ||
import os | ||
import sys | ||
from datetime import datetime, timedelta | ||
from typing import Any, Dict, Tuple | ||
|
||
import numpy as np | ||
from rasterio.crs import CRS | ||
|
||
from .geo import GeoDataset | ||
from .utils import BoundingBox | ||
|
||
|
||
def _disambiguate_timestamps(
    year: float, month: float, day: float
) -> Tuple[float, float]:
    """Disambiguate partial timestamps.

    Based on :func:`torchgeo.datasets.utils.disambiguate_timestamps`.

    The coarsest missing component decides the resolution: a missing year
    means no temporal information at all, a missing month means the whole
    year, and a missing day means the whole month.

    Args:
        year: year, possibly nan
        month: month, possibly nan
        day: day, possibly nan

    Returns:
        minimum and maximum possible time range
    """
    if np.isnan(year):
        # No temporal info
        return 0, sys.maxsize
    elif np.isnan(month):
        # Year resolution
        mint = datetime(int(year), 1, 1)
        maxt = datetime(int(year) + 1, 1, 1)
    elif np.isnan(day):
        # Month resolution
        mint = datetime(int(year), int(month), 1)
        if month == 12:
            # December rolls over into January of the following year
            maxt = datetime(int(year) + 1, 1, 1)
        else:
            maxt = datetime(int(year), int(month) + 1, 1)
    else:
        # Day resolution
        mint = datetime(int(year), int(month), int(day))
        maxt = mint + timedelta(days=1)

    # maxt currently marks the start of the *next* period; step back one
    # microsecond so the returned range ends inside the current period.
    maxt -= timedelta(microseconds=1)

    return mint.timestamp(), maxt.timestamp()
|
||
|
||
class GBIF(GeoDataset):
    """Dataset for the Global Biodiversity Information Facility.

    `GBIF <https://www.gbif.org/>`_, the Global Biodiversity Information Facility,
    is an international network and data infrastructure funded by the world's
    governments and aimed at providing anyone, anywhere, open access to data about
    all types of life on Earth.

    This dataset is intended for use with GBIF's
    `occurrence records <https://www.gbif.org/dataset/search>`_. It may or may not work
    for other GBIF `datasets <https://www.gbif.org/dataset/search>`_. Data for a
    particular species or region of interest can be downloaded from the above link.

    If you use a GBIF dataset in your research, please cite it according to:

    * https://www.gbif.org/citation-guidelines

    .. note::
       This dataset requires the following additional library to be installed:

       * `pandas <https://pypi.org/project/pandas/>`_ to load CSV files

    .. versionadded:: 0.3
    """

    res = 0
    _crs = CRS.from_epsg(4326)  # Lat/Lon

    def __init__(self, root: str = "data") -> None:
        """Initialize a new Dataset instance.

        Args:
            root: root directory where dataset can be found

        Raises:
            FileNotFoundError: if no files are found in ``root``
            ImportError: if pandas is not installed
        """
        super().__init__()

        self.root = root

        files = glob.glob(os.path.join(root, "**.csv"))
        if not files:
            raise FileNotFoundError(f"Dataset not found in `root={self.root}`")

        try:
            import pandas as pd  # noqa: F401
        except ImportError:
            raise ImportError(
                "pandas is not installed and is required to use this dataset"
            )

        # Only one file is loaded. glob returns matches in arbitrary order, so
        # take the lexicographically first path to keep the choice
        # deterministic when ``root`` contains several CSV files.
        filepath = sorted(files)[0]

        # Read tab-delimited CSV file
        data = pd.read_table(
            filepath,
            engine="c",
            usecols=["decimalLatitude", "decimalLongitude", "day", "month", "year"],
        )

        # Convert from pandas DataFrame to rtree Index
        i = 0
        for y, x, day, month, year in data.itertuples(index=False, name=None):
            # Skip rows without lat/lon
            if np.isnan(y) or np.isnan(x):
                continue

            mint, maxt = _disambiguate_timestamps(year, month, day)

            # Each record is a point, so its box is degenerate in x and y
            coords = (x, x, y, y, mint, maxt)
            self.index.insert(i, coords)
            i += 1

    def __getitem__(self, query: BoundingBox) -> Dict[str, Any]:
        """Retrieve metadata indexed by query.

        Args:
            query: (minx, maxx, miny, maxy, mint, maxt) coordinates to index

        Returns:
            sample of metadata at that index

        Raises:
            IndexError: if query is not found in the index
        """
        hits = self.index.intersection(tuple(query), objects=True)
        bboxes = [hit.bbox for hit in hits]

        if not bboxes:
            raise IndexError(
                f"query: {query} not found in index with bounds: {self.bounds}"
            )

        sample = {"crs": self.crs, "bbox": bboxes}

        return sample