From aede4f4df27c0bdc95e5f5b2202eb4f6a5c73b5f Mon Sep 17 00:00:00 2001 From: Brendan Ward Date: Tue, 5 Sep 2023 20:01:55 -0700 Subject: [PATCH 01/11] Add support for mask parameter to read, read_bounds, read_dataframe, open_arrow --- CHANGES.md | 4 +- docs/source/introduction.md | 27 ++++++++ pyogrio/_io.pyx | 57 +++++++++++++++-- pyogrio/_ogr.pxd | 2 + pyogrio/core.py | 12 +++- pyogrio/geopandas.py | 11 ++++ pyogrio/raw.py | 22 ++++++- pyogrio/tests/test_core.py | 72 ++++++++++++++++++++++ pyogrio/tests/test_geopandas_io.py | 98 +++++++++++++++++++++++++++++- pyogrio/tests/test_raw_io.py | 63 +++++++++++++++++++ pyogrio/util.py | 39 ++++++++++++ 11 files changed, 397 insertions(+), 10 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 50539517..97818bd6 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -11,9 +11,11 @@ unknown count (count is used in `read_info`, `read`, `read_dataframe`) (#271). - Raise error if `read` or `read_dataframe` is called with parameters to read no columns, geometry, or fids (#280) - - Automatically detect supported driver by extension for all available write drivers and addition of `detect_write_driver` (#270) +- Addition of `mask` parameter to `open_arrow`, `read`, `read_dataframe`, + and `read_bounds` functions to select only the features in the dataset that + intersect the mask geometry. ### Bug fixes diff --git a/docs/source/introduction.md b/docs/source/introduction.md index 8f8ba2b6..f722e3aa 100644 --- a/docs/source/introduction.md +++ b/docs/source/introduction.md @@ -198,6 +198,32 @@ will be returned; if GEOS is not available or not used by GDAL, all geometries with bounding boxes that intersect this bbox will be returned. `pyogrio.__gdal_geos_version__` will be `None` if GEOS is not detected. +## Filter records by a geometry + +You can use the `mask` parameter to select only those features that intersect +with a Shapely (>= 2.0) geometry. + +```python +>>> mask = shapely.Polygon(([-80,8], [-80, 10], [-85,10], [-85,8], [-80,8])) +>>> read_dataframe('ne_10m_admin_0_countries.shp', mask) +``` + +Note: the `mask` values must be in the same CRS as the dataset. + +If your mask geometry is in some other representation, such as GeoJSON, you will +need to convert it to a Shapely geometry before using `mask`. + +```python +>>> mask_geojson = '{"type":"Polygon","coordinates":[[[-80.0,8.0],[-80.0,10.0],[-85.0,10.0],[-85.0,8.0],[-80.0,8.0]]]}' +>>> mask = shapely.from_geojson(mask_geojson) +>>> read_dataframe('ne_10m_admin_0_countries.shp', mask) +``` + +Note: if GEOS is present and used by GDAL, only geometries that intersect `mask` +will be returned; if GEOS is not available or not used by GDAL, all geometries +with bounding boxes that intersect the bounding box of `mask` will be returned. +`pyogrio.__gdal_geos_version__` will be `None` if GEOS is not detected. + ## Execute a sql query You can use the `sql` parameter to execute a sql query on a dataset. @@ -322,6 +348,7 @@ This function supports options to subset features from the dataset: - `max_features` - `where` - `bbox` +- `mask` ## Write a GeoPandas GeoDataFrame diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx index 874453f8..39046308 100644 --- a/pyogrio/_io.pyx +++ b/pyogrio/_io.pyx @@ -565,8 +565,8 @@ cdef apply_where_filter(OGRLayerH ogr_layer, str where): raise ValueError(f"Invalid SQL query for layer '{OGR_L_GetName(ogr_layer)}': '{where}'") -cdef apply_spatial_filter(OGRLayerH ogr_layer, bbox): - """Applies spatial filter to layer. +cdef apply_bbox_filter(OGRLayerH ogr_layer, bbox): + """Applies bounding box spatial filter to layer. Parameters ---------- @@ -586,6 +586,29 @@ cdef apply_spatial_filter(OGRLayerH ogr_layer, bbox): OGR_L_SetSpatialFilterRect(ogr_layer, xmin, ymin, xmax, ymax) +cdef apply_geometry_filter(OGRLayerH ogr_layer, wkb): + """Applies geometry spatial filter to layer. + + Parameters + ---------- + ogr_layer : pointer to open OGR layer + wkb: WKB encoding of geometry + """ + + cdef OGRGeometryH ogr_geometry = NULL + cdef unsigned char *wkb_buffer = wkb + + # err = OGR_G_ImportFromWkb(ogr_geometry, wkb_buffer, len(wkb)) + err = OGR_G_CreateFromWkb(wkb_buffer, OGR_L_GetSpatialRef(ogr_layer), &ogr_geometry, len(wkb)) + if err: + if ogr_geometry != NULL: + OGR_G_DestroyGeometry(ogr_geometry) + raise GeometryError("Could not create mask geometry") from None + + OGR_L_SetSpatialFilter(ogr_layer, ogr_geometry) + OGR_G_DestroyGeometry(ogr_geometry) + + cdef validate_feature_range(OGRLayerH ogr_layer, int skip_features=0, int max_features=0): """Limit skip_features and max_features to bounds available for dataset. @@ -1001,6 +1024,7 @@ def ogr_read( int max_features=0, object where=None, tuple bbox=None, + object mask=None, object fids=None, str sql=None, str sql_dialect=None, @@ -1021,6 +1045,7 @@ def ogr_read( path_c = path_b if fids is not None: + # TODO: probably need to add mask here too if where is not None or bbox is not None or sql is not None or skip_features or max_features: raise ValueError( "cannot set both 'fids' and any of 'where', 'bbox', 'sql', " @@ -1037,6 +1062,9 @@ def ogr_read( "be None or non-empty" ) + if bbox and mask: + raise ValueError("cannot set both 'bbox' and 'mask'") + try: dataset_options = dict_to_options(dataset_kwargs) ogr_dataset = ogr_open(path_c, 0, dataset_options) @@ -1107,7 +1135,10 @@ def ogr_read( # Apply the spatial filter if bbox is not None: - apply_spatial_filter(ogr_layer, bbox) + apply_bbox_filter(ogr_layer, bbox) + + elif mask is not None: + apply_geometry_filter(ogr_layer, mask) # Limit feature range to available range skip_features, num_features = validate_feature_range( @@ -1164,6 +1195,7 @@ def ogr_open_arrow( int max_features=0, object where=None, tuple bbox=None, + object mask=None, object fids=None, str sql=None, str sql_dialect=None, @@ -1205,6 +1237,9 @@ def ogr_open_arrow( "be None or non-empty" ) + if bbox and mask: + raise ValueError("cannot set both 'bbox' and 'mask'") + reader = None try: dataset_options = dict_to_options(dataset_kwargs) @@ -1253,7 +1288,10 @@ def ogr_open_arrow( # Apply the spatial filter if bbox is not None: - apply_spatial_filter(ogr_layer, bbox) + apply_bbox_filter(ogr_layer, bbox) + + elif mask is not None: + apply_geometry_filter(ogr_layer, mask) # Limit to specified columns if ignored_fields: @@ -1331,7 +1369,8 @@ def ogr_read_bounds( int skip_features=0, int max_features=0, object where=None, - tuple bbox=None): + tuple bbox=None, + object mask=None): cdef int err = 0 cdef const char *path_c = NULL @@ -1341,6 +1380,9 @@ def ogr_read_bounds( cdef int feature_count = 0 cdef double xmin, ymin, xmax, ymax + if bbox and mask: + raise ValueError("cannot set both 'bbox' and 'mask'") + path_b = path.encode('utf-8') path_c = path_b @@ -1357,7 +1399,10 @@ def ogr_read_bounds( # Apply the spatial filter if bbox is not None: - apply_spatial_filter(ogr_layer, bbox) + apply_bbox_filter(ogr_layer, bbox) + + elif mask is not None: + apply_geometry_filter(ogr_layer, mask) # Limit feature range to available range skip_features, num_features = validate_feature_range(ogr_layer, skip_features, max_features) diff --git a/pyogrio/_ogr.pxd b/pyogrio/_ogr.pxd index 88f12d9d..6ed7ad20 100644 --- a/pyogrio/_ogr.pxd +++ b/pyogrio/_ogr.pxd @@ -255,6 +255,7 @@ cdef extern from "ogr_api.h": void OGR_Fld_SetSubType(OGRFieldDefnH fielddefn, OGRFieldSubType subtype) OGRGeometryH OGR_G_CreateGeometry(int wkbtypecode) + OGRErr OGR_G_CreateFromWkb(const void *bytes, OGRSpatialReferenceH srs, OGRGeometryH *geometry, int nbytes) void OGR_G_DestroyGeometry(OGRGeometryH geometry) void OGR_G_ExportToWkb(OGRGeometryH geometry, int endianness, unsigned char *buffer) void OGR_G_GetEnvelope(OGRGeometryH geometry, OGREnvelope* envelope) @@ -290,6 +291,7 @@ cdef extern from "ogr_api.h": OGRErr OGR_L_SetNextByIndex(OGRLayerH layer, int nIndex) int OGR_L_GetFeatureCount(OGRLayerH layer, int m) void OGR_L_SetSpatialFilterRect(OGRLayerH layer, double xmin, double ymin, double xmax, double ymax) + void OGR_L_SetSpatialFilter(OGRLayerH layer, OGRGeometryH geometry) OGRErr OGR_L_SetIgnoredFields(OGRLayerH layer, const char** fields) void OGRSetNonLinearGeometriesEnabledFlag(int bFlag) diff --git a/pyogrio/core.py b/pyogrio/core.py index d190c603..189ece8a 100644 --- a/pyogrio/core.py +++ b/pyogrio/core.py @@ -1,5 +1,5 @@ from pyogrio._env import GDALEnv -from pyogrio.util import get_vsi_path, _preprocess_options_key_value +from pyogrio.util import get_vsi_path, _preprocess_options_key_value, _mask_to_wkb with GDALEnv(): @@ -127,6 +127,7 @@ def read_bounds( max_features=None, where=None, bbox=None, + mask=None, ): """Read bounds of each feature. @@ -159,6 +160,14 @@ def read_bounds( and used by GDAL, only geometries that intersect this bbox will be returned; if GEOS is not available or not used by GDAL, all geometries with bounding boxes that intersect this bbox will be returned. + mask : Shapely geometry, optional (default: None) + If present, will be used to filter records whose geometry intersects + this geometry. This must be in the same CRS as the dataset. If GEOS is + present and used by GDAL, only geometries that intersect this geometry + will be returned; if GEOS is not available or not used by GDAL, all + geometries with bounding boxes that intersect the bounding box of this + geometry will be returned. Requires Shapely >= 2.0. + Cannot be combined with ``bbox`` keyword. Returns ------- @@ -177,6 +186,7 @@ def read_bounds( max_features=max_features or 0, where=where, bbox=bbox, + mask=_mask_to_wkb(mask), ) finally: if buffer is not None: diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py index 7b557c8a..0433f802 100644 --- a/pyogrio/geopandas.py +++ b/pyogrio/geopandas.py @@ -37,6 +37,7 @@ def read_dataframe( max_features=None, where=None, bbox=None, + mask=None, fids=None, sql=None, sql_dialect=None, @@ -93,6 +94,15 @@ def read_dataframe( and used by GDAL, only geometries that intersect this bbox will be returned; if GEOS is not available or not used by GDAL, all geometries with bounding boxes that intersect this bbox will be returned. + Cannot be combined with ``mask`` keyword. + mask : Shapely geometry, optional (default: None) + If present, will be used to filter records whose geometry intersects + this geometry. This must be in the same CRS as the dataset. If GEOS is + present and used by GDAL, only geometries that intersect this geometry + will be returned; if GEOS is not available or not used by GDAL, all + geometries with bounding boxes that intersect the bounding box of this + geometry will be returned. Requires Shapely >= 2.0. + Cannot be combined with ``bbox`` keyword. fids : array-like, optional (default: None) Array of integer feature id (FID) values to select. Cannot be combined with other keywords to select a subset (``skip_features``, ``max_features``, @@ -166,6 +176,7 @@ def read_dataframe( max_features=max_features, where=where, bbox=bbox, + mask=mask, fids=fids, sql=sql, sql_dialect=sql_dialect, diff --git a/pyogrio/raw.py b/pyogrio/raw.py index c23cdefa..179c5e56 100644 --- a/pyogrio/raw.py +++ b/pyogrio/raw.py @@ -3,7 +3,12 @@ from pyogrio._env import GDALEnv from pyogrio.core import detect_write_driver from pyogrio.errors import DataSourceError -from pyogrio.util import get_vsi_path, vsi_path, _preprocess_options_key_value +from pyogrio.util import ( + get_vsi_path, + vsi_path, + _preprocess_options_key_value, + _mask_to_wkb, +) with GDALEnv(): from pyogrio._io import ogr_open_arrow, ogr_read, ogr_write @@ -38,6 +43,7 @@ def read( max_features=None, where=None, bbox=None, + mask=None, fids=None, sql=None, sql_dialect=None, @@ -91,6 +97,15 @@ def read( and used by GDAL, only geometries that intersect this bbox will be returned; if GEOS is not available or not used by GDAL, all geometries with bounding boxes that intersect this bbox will be returned. + Cannot be combined with ``mask`` keyword. + mask : Shapely geometry, optional (default: None) + If present, will be used to filter records whose geometry intersects + this geometry. This must be in the same CRS as the dataset. If GEOS is + present and used by GDAL, only geometries that intersect this geometry + will be returned; if GEOS is not available or not used by GDAL, all + geometries with bounding boxes that intersect the bounding box of this + geometry will be returned. Requires Shapely >= 2.0. + Cannot be combined with ``bbox`` keyword. fids : array-like, optional (default: None) Array of integer feature id (FID) values to select. Cannot be combined with other keywords to select a subset (`skip_features`, `max_features`, @@ -137,6 +152,7 @@ def read( max_features=max_features or 0, where=where, bbox=bbox, + mask=_mask_to_wkb(mask), fids=fids, sql=sql, sql_dialect=sql_dialect, @@ -162,6 +178,7 @@ def read_arrow( max_features=None, where=None, bbox=None, + mask=None, fids=None, sql=None, sql_dialect=None, @@ -199,6 +216,7 @@ def read_arrow( max_features=max_features, where=where, bbox=bbox, + mask=mask, fids=fids, sql=sql, sql_dialect=sql_dialect, @@ -223,6 +241,7 @@ def open_arrow( max_features=None, where=None, bbox=None, + mask=None, fids=None, sql=None, sql_dialect=None, @@ -287,6 +306,7 @@ def open_arrow( max_features=max_features or 0, where=where, bbox=bbox, + mask=_mask_to_wkb(mask), fids=fids, sql=sql, sql_dialect=sql_dialect, diff --git a/pyogrio/tests/test_core.py b/pyogrio/tests/test_core.py index 9859301e..d8ccf50c 100644 --- a/pyogrio/tests/test_core.py +++ b/pyogrio/tests/test_core.py @@ -1,3 +1,6 @@ +from packaging.version import Version + +import numpy as np from numpy import array_equal, allclose import pytest @@ -23,6 +26,15 @@ from pyogrio._ogr import ogr_driver_supports_write, has_gdal_data, has_proj_data +try: + import shapely + + if Version(shapely.__version__) < Version("2.0.0"): + shapely = None +except ImportError: + shapely = None + + def test_gdal_data(): # test will fail if GDAL data files cannot be found, indicating an # installation error @@ -240,6 +252,66 @@ def test_read_bounds_bbox(naturalearth_lowres_all_ext): ) +@pytest.mark.skipif(not shapely, reason="Shapely is required for mask functionality") +@pytest.mark.parametrize( + "mask", + [ + {"type": "Point", "coordinates": [0, 0]}, + '{"type": "Point", "coordinates": [0, 0]}', + "invalid", + ], +) +def test_read_bounds_mask_invalid(naturalearth_lowres, mask): + with pytest.raises(ValueError, match="'mask' parameter must be a Shapely geometry"): + read_bounds(naturalearth_lowres, mask=mask) + + +@pytest.mark.skipif(not shapely, reason="Shapely is required for mask functionality") +def test_read_bounds_bbox_mask_invalid(naturalearth_lowres): + with pytest.raises(ValueError, match="cannot set both 'bbox' and 'mask'"): + read_bounds( + naturalearth_lowres, bbox=(-85, 8, -80, 10), mask=shapely.Point(-105, 55) + ) + + +@pytest.mark.skipif(not shapely, reason="Shapely is required for mask functionality") +@pytest.mark.parametrize( + "mask,expected", + [ + (shapely.Point(-105, 55), [3]), + (shapely.box(-85, 8, -80, 10), [33, 34]), + ( + shapely.Polygon( + ( + [6.101929483362767, 50.97085041206964], + [5.773001596839322, 50.90661120482673], + [5.593156133704326, 50.642648747710325], + [6.059271089606312, 50.686051894002475], + [6.374064065737485, 50.851481340346965], + [6.101929483362767, 50.97085041206964], + ) + ), + [121, 129, 130], + ), + ( + shapely.GeometryCollection( + [shapely.Point(-7.7, 53), shapely.box(-85, 8, -80, 10)] + ), + [33, 34, 133], + ), + ], +) +def test_read_bounds_mask(naturalearth_lowres_all_ext, mask, expected): + fids = read_bounds(naturalearth_lowres_all_ext, mask=mask)[0] + + if naturalearth_lowres_all_ext.suffix == ".gpkg": + # fid in gpkg is 1-based + assert array_equal(fids, np.array(expected) + 1) + else: + # fid in other formats is 0-based + assert array_equal(fids, expected) + + @pytest.mark.skipif( __gdal_version__ < (3, 4, 0), reason="Cannot determine if GEOS is present or absent for GDAL < 3.4", diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index d0a33bb3..f475c155 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -23,7 +23,8 @@ from geopandas.array import from_wkt from geopandas.testing import assert_geodataframe_equal - from shapely.geometry import Point + from shapely.geometry import box, GeometryCollection, Point, Polygon + except ImportError: pass @@ -309,6 +310,101 @@ def test_read_bbox(naturalearth_lowres_all_ext): assert np.array_equal(df.iso_a3, ["PAN", "CRI"]) +@pytest.mark.parametrize( + "use_arrow", + [ + pytest.param( + True, + marks=pytest.mark.skipif( + not has_pyarrow or __gdal_version__ < (3, 6, 0), + reason="Arrow tests require pyarrow and GDAL>=3.6", + ), + ), + False, + ], +) +@pytest.mark.parametrize( + "mask", + [ + {"type": "Point", "coordinates": [0, 0]}, + '{"type": "Point", "coordinates": [0, 0]}', + "invalid", + ], +) +def test_read_mask_invalid(naturalearth_lowres, use_arrow, mask): + with pytest.raises(ValueError, match="'mask' parameter must be a Shapely geometry"): + read_dataframe(naturalearth_lowres, use_arrow=use_arrow, mask=mask) + + +@pytest.mark.parametrize( + "use_arrow", + [ + pytest.param( + True, + marks=pytest.mark.skipif( + not has_pyarrow or __gdal_version__ < (3, 6, 0), + reason="Arrow tests require pyarrow and GDAL>=3.6", + ), + ), + False, + ], +) +def test_read_bbox_mask_invalid(naturalearth_lowres, use_arrow): + with pytest.raises(ValueError, match="cannot set both 'bbox' and 'mask'"): + read_dataframe( + naturalearth_lowres, + use_arrow=use_arrow, + bbox=(-85, 8, -80, 10), + mask=Point(-105, 55), + ) + + +@pytest.mark.parametrize( + "use_arrow", + [ + pytest.param( + True, + marks=pytest.mark.skipif( + not has_pyarrow or __gdal_version__ < (3, 6, 0), + reason="Arrow tests require pyarrow and GDAL>=3.6", + ), + ), + False, + ], +) +@pytest.mark.parametrize( + "mask,expected", + [ + (Point(-105, 55), ["CAN"]), + (box(-85, 8, -80, 10), ["PAN", "CRI"]), + ( + Polygon( + ( + [6.101929483362767, 50.97085041206964], + [5.773001596839322, 50.90661120482673], + [5.593156133704326, 50.642648747710325], + [6.059271089606312, 50.686051894002475], + [6.374064065737485, 50.851481340346965], + [6.101929483362767, 50.97085041206964], + ) + ), + ["DEU", "BEL", "NLD"], + ), + ( + GeometryCollection([Point(-7.7, 53), box(-85, 8, -80, 10)]), + ["PAN", "CRI", "IRL"], + ), + ], +) +def test_read_mask(naturalearth_lowres_all_ext, use_arrow, mask, expected): + df = read_dataframe(naturalearth_lowres_all_ext, use_arrow=True, mask=mask) + + print(df.iso_a3) + + assert len(df) == len(expected) + assert np.array_equal(df.iso_a3, expected) + + def test_read_fids(naturalearth_lowres_all_ext): # ensure keyword is properly passed through fids = np.array([1, 10, 5], dtype=np.int64) diff --git a/pyogrio/tests/test_raw_io.py b/pyogrio/tests/test_raw_io.py index aa65fb5e..9330a8e3 100644 --- a/pyogrio/tests/test_raw_io.py +++ b/pyogrio/tests/test_raw_io.py @@ -1,6 +1,7 @@ import contextlib import json import os +from packaging.version import Version import sys import numpy as np @@ -18,6 +19,14 @@ from pyogrio.errors import DataSourceError, DataLayerError, FeatureError from pyogrio.tests.conftest import prepare_testfile, DRIVERS, DRIVER_EXT +try: + import shapely + + if Version(shapely.__version__) < Version("2.0.0"): + shapely = None +except ImportError: + shapely = None + def test_read(naturalearth_lowres): meta, _, geometry, fields = read(naturalearth_lowres) @@ -200,6 +209,60 @@ def test_read_bbox(naturalearth_lowres_all_ext): assert np.array_equal(fields[3], ["PAN", "CRI"]) +@pytest.mark.skipif(not shapely, reason="Shapely is required for mask functionality") +@pytest.mark.parametrize( + "mask", + [ + {"type": "Point", "coordinates": [0, 0]}, + '{"type": "Point", "coordinates": [0, 0]}', + "invalid", + ], +) +def test_read_mask_invalid(naturalearth_lowres, mask): + with pytest.raises(ValueError, match="'mask' parameter must be a Shapely geometry"): + read(naturalearth_lowres, mask=mask) + + +@pytest.mark.skipif(not shapely, reason="Shapely is required for mask functionality") +def test_read_bbox_mask_invalid(naturalearth_lowres): + with pytest.raises(ValueError, match="cannot set both 'bbox' and 'mask'"): + read(naturalearth_lowres, bbox=(-85, 8, -80, 10), mask=shapely.Point(-105, 55)) + + +@pytest.mark.skipif(not shapely, reason="Shapely is required for mask functionality") +@pytest.mark.parametrize( + "mask,expected", + [ + (shapely.Point(-105, 55), ["CAN"]), + (shapely.box(-85, 8, -80, 10), ["PAN", "CRI"]), + ( + shapely.Polygon( + ( + [6.101929483362767, 50.97085041206964], + [5.773001596839322, 50.90661120482673], + [5.593156133704326, 50.642648747710325], + [6.059271089606312, 50.686051894002475], + [6.374064065737485, 50.851481340346965], + [6.101929483362767, 50.97085041206964], + ) + ), + ["DEU", "BEL", "NLD"], + ), + ( + shapely.GeometryCollection( + [shapely.Point(-7.7, 53), shapely.box(-85, 8, -80, 10)] + ), + ["PAN", "CRI", "IRL"], + ), + ], +) +def test_read_mask(naturalearth_lowres_all_ext, mask, expected): + geometry, fields = read(naturalearth_lowres_all_ext, mask=mask)[2:] + + assert np.array_equal(fields[3], expected) + assert len(geometry) == len(expected) + + def test_read_fids(naturalearth_lowres): expected_fids, expected_geometry, expected_fields = read( naturalearth_lowres, return_fids=True diff --git a/pyogrio/util.py b/pyogrio/util.py index 1448657b..7b97efa1 100644 --- a/pyogrio/util.py +++ b/pyogrio/util.py @@ -1,3 +1,4 @@ +from packaging.version import Version import re import sys from urllib.parse import urlparse @@ -161,3 +162,41 @@ def _preprocess_options_key_value(options): v = str(v) result[k] = v return result + + +def _mask_to_wkb(mask): + """Convert a Shapely mask geometry to WKB. + + Parameters + ---------- + mask : Shapely geometry + + Returns + ------- + WKB bytes or None + + Raises + ------ + ValueError + raised if Shapely >= 2.0 is not available or mask is not a Shapely + Geometry object + """ + + if mask is None: + return mask + + try: + import shapely + + if Version(shapely.__version__) < Version("2.0.0"): + shapely = None + except ImportError: + shapely = None + + if not shapely: + raise ValueError("'mask' parameter requires Shapely >= 2.0") + + if not isinstance(mask, shapely.Geometry): + raise ValueError("'mask' parameter must be a Shapely geometry") + + return shapely.to_wkb(mask) From 5efa9e9072ad412dafec166f0564c4d8555f071e Mon Sep 17 00:00:00 2001 From: Brendan Ward Date: Wed, 6 Sep 2023 18:30:58 -0700 Subject: [PATCH 02/11] add xfail for arrow / gpkg when using mask or bbox --- CHANGES.md | 4 +- docs/source/known_issues.md | 10 +++++ pyogrio/_io.pyx | 10 ++--- pyogrio/geopandas.py | 11 +++--- pyogrio/raw.py | 11 +++--- pyogrio/tests/test_geopandas_io.py | 62 ++++++++++++++++++++++++------ pyogrio/tests/test_raw_io.py | 6 +++ 7 files changed, 86 insertions(+), 28 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 97818bd6..3ce3c908 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -15,7 +15,9 @@ write drivers and addition of `detect_write_driver` (#270) - Addition of `mask` parameter to `open_arrow`, `read`, `read_dataframe`, and `read_bounds` functions to select only the features in the dataset that - intersect the mask geometry. + intersect the mask geometry (#285). Note: there is a known GDAL bug when + using the Arrow interface that causes it to return too many features; a fix + is expected in GDAL 3.8.0. ### Bug fixes diff --git a/docs/source/known_issues.md b/docs/source/known_issues.md index a35bbe1d..3d84215e 100644 --- a/docs/source/known_issues.md +++ b/docs/source/known_issues.md @@ -107,3 +107,13 @@ We recommend the following to sidestep performance issues: - the `use_arrow=True` option may speed up reading from OSM files - if possible, use a different tool such as `ogr2ogr` to translate the OSM data source into a more performant format for reading by layer, such as GPKG + +## Incorrect results when using a spatial filter and Arrow interface + +Due to [a bug in GDAL](https://github.com/OSGeo/gdal/issues/8347), when using +the Arrow interface (e.g., via `use_arrow` on `read_dataframe`) certain drivers +(e.g., GPKG, FlatGeobuf, Arrow, Parquet) returned features whose bounding boxes +intersected the bounding box specified by `bbox` or `mask` geometry instead of +those whose geometry intersected the `bbox` or `mask`. + +A fix is expected in GDAL 3.8.0. diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx index 39046308..c26499f0 100644 --- a/pyogrio/_io.pyx +++ b/pyogrio/_io.pyx @@ -598,8 +598,7 @@ cdef apply_geometry_filter(OGRLayerH ogr_layer, wkb): cdef OGRGeometryH ogr_geometry = NULL cdef unsigned char *wkb_buffer = wkb - # err = OGR_G_ImportFromWkb(ogr_geometry, wkb_buffer, len(wkb)) - err = OGR_G_CreateFromWkb(wkb_buffer, OGR_L_GetSpatialRef(ogr_layer), &ogr_geometry, len(wkb)) + err = OGR_G_CreateFromWkb(wkb_buffer, NULL, &ogr_geometry, len(wkb)) if err: if ogr_geometry != NULL: OGR_G_DestroyGeometry(ogr_geometry) @@ -1045,11 +1044,10 @@ def ogr_read( path_c = path_b if fids is not None: - # TODO: probably need to add mask here too - if where is not None or bbox is not None or sql is not None or skip_features or max_features: + if where is not None or bbox is not None or mask is not None or sql is not None or skip_features or max_features: raise ValueError( - "cannot set both 'fids' and any of 'where', 'bbox', 'sql', " - "'skip_features' or 'max_features'" + "cannot set both 'fids' and any of 'where', 'bbox', 'mask', " + "'sql', 'skip_features' or 'max_features'" ) fids = np.asarray(fids, dtype=np.intc) diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py index 0433f802..a3fa9656 100644 --- a/pyogrio/geopandas.py +++ b/pyogrio/geopandas.py @@ -105,11 +105,12 @@ def read_dataframe( Cannot be combined with ``bbox`` keyword. fids : array-like, optional (default: None) Array of integer feature id (FID) values to select. Cannot be combined - with other keywords to select a subset (``skip_features``, ``max_features``, - ``where``, ``bbox`` or ``sql``). Note that the starting index is driver and file - specific (e.g. typically 0 for Shapefile and 1 for GeoPackage, but can - still depend on the specific file). The performance of reading a large - number of features usings FIDs is also driver specific. + with other keywords to select a subset (``skip_features``, + ``max_features``, ``where``, ``bbox``, ``mask``, or ``sql``). Note that + the starting index is driver and file specific (e.g. typically 0 for + Shapefile and 1 for GeoPackage, but can still depend on the specific + file). The performance of reading a large number of features usings FIDs + is also driver specific. sql : str, optional (default: None) The sql statement to execute. Look at the sql_dialect parameter for more information on the syntax to use for the query. When combined diff --git a/pyogrio/raw.py b/pyogrio/raw.py index 179c5e56..1dcf63e2 100644 --- a/pyogrio/raw.py +++ b/pyogrio/raw.py @@ -108,11 +108,12 @@ def read( Cannot be combined with ``bbox`` keyword. fids : array-like, optional (default: None) Array of integer feature id (FID) values to select. Cannot be combined - with other keywords to select a subset (`skip_features`, `max_features`, - `where` or `bbox`). Note that the starting index is driver and file - specific (e.g. typically 0 for Shapefile and 1 for GeoPackage, but can - still depend on the specific file). The performance of reading a large - number of features usings FIDs is also driver specific. + with other keywords to select a subset (``skip_features``, + ``max_features``, ``where``, ``bbox``, or ``mask``). Note that the + starting index is driver and file specific (e.g. typically 0 for + Shapefile and 1 for GeoPackage, but can still depend on the specific + file). The performance of reading a large number of features usings FIDs + is also driver specific. return_fids : bool, optional (default: False) If True, will return the FIDs of the feature that were read. **kwargs diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index f475c155..ea3fd6c6 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -298,16 +298,46 @@ def test_read_bbox_invalid(naturalearth_lowres_all_ext, bbox): read_dataframe(naturalearth_lowres_all_ext, bbox=bbox) -def test_read_bbox(naturalearth_lowres_all_ext): - # should return no features - with pytest.warns(UserWarning, match="does not have any features to read"): - df = read_dataframe(naturalearth_lowres_all_ext, bbox=(0, 0, 0.00001, 0.00001)) - assert len(df) == 0 +@pytest.mark.parametrize( + "use_arrow", + [ + pytest.param( + True, + marks=pytest.mark.skipif( + not has_pyarrow or __gdal_version__ < (3, 6, 0), + reason="Arrow tests require pyarrow and GDAL>=3.6", + ), + ), + False, + ], +) +@pytest.mark.parametrize( + "bbox,expected", + [ + ((0, 0, 0.00001, 0.00001), []), + ((-85, 8, -80, 10), ["PAN", "CRI"]), + ((-104, 54, -105, 55), ["CAN"]), + ], +) +def test_read_bbox(naturalearth_lowres_all_ext, use_arrow, bbox, expected): + if ( + use_arrow + and __gdal_version__ < (3, 8, 0) + and os.path.splitext(naturalearth_lowres_all_ext)[1] == ".gpkg" + ): + pytest.xfail(reason="GDAL bug: https://github.com/OSGeo/gdal/issues/8347") - df = read_dataframe(naturalearth_lowres_all_ext, bbox=(-85, 8, -80, 10)) - assert len(df) == 2 + if len(expected) == 0 and not use_arrow: + # should return no features + with pytest.warns(UserWarning, match="does not have any features to read"): + df = read_dataframe( + naturalearth_lowres_all_ext, use_arrow=use_arrow, bbox=bbox + ) + + else: + df = read_dataframe(naturalearth_lowres_all_ext, use_arrow=use_arrow, bbox=bbox) - assert np.array_equal(df.iso_a3, ["PAN", "CRI"]) + assert np.array_equal(df.iso_a3, expected) @pytest.mark.parametrize( @@ -396,10 +426,20 @@ def test_read_bbox_mask_invalid(naturalearth_lowres, use_arrow): ), ], ) -def test_read_mask(naturalearth_lowres_all_ext, use_arrow, mask, expected): - df = read_dataframe(naturalearth_lowres_all_ext, use_arrow=True, mask=mask) +def test_read_mask( + naturalearth_lowres_all_ext, + use_arrow, + mask, + expected, +): + if ( + use_arrow + and __gdal_version__ < (3, 8, 0) + and os.path.splitext(naturalearth_lowres_all_ext)[1] == ".gpkg" + ): + pytest.xfail(reason="GDAL bug: https://github.com/OSGeo/gdal/issues/8347") - print(df.iso_a3) + df = read_dataframe(naturalearth_lowres_all_ext, use_arrow=use_arrow, mask=mask) assert len(df) == len(expected) assert np.array_equal(df.iso_a3, expected) diff --git a/pyogrio/tests/test_raw_io.py b/pyogrio/tests/test_raw_io.py index 9330a8e3..08d154d3 100644 --- a/pyogrio/tests/test_raw_io.py +++ b/pyogrio/tests/test_raw_io.py @@ -310,6 +310,12 @@ def test_read_fids_unsupported_keywords(naturalearth_lowres): with pytest.raises(ValueError, match="cannot set both 'fids' and any of"): read(naturalearth_lowres, fids=[1], max_features=5) + with pytest.raises(ValueError, match="cannot set both 'fids' and any of"): + read(naturalearth_lowres, fids=[1], bbox=(0, 0, 0.0001, 0.0001)) + + with pytest.raises(ValueError, match="cannot set both 'fids' and any of"): + read(naturalearth_lowres, fids=[1], mask=shapely.Point(0, 0)) + def test_read_return_fids(naturalearth_lowres): # default is to not return fids From 472f232aacd1c9e88bb4e9aac3eaae9cbfc06211 Mon Sep 17 00:00:00 2001 From: Brendan Ward Date: Wed, 6 Sep 2023 18:43:08 -0700 Subject: [PATCH 03/11] Skip mask param if shapely not available --- pyogrio/tests/test_raw_io.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pyogrio/tests/test_raw_io.py b/pyogrio/tests/test_raw_io.py index 08d154d3..d585e7d4 100644 --- a/pyogrio/tests/test_raw_io.py +++ b/pyogrio/tests/test_raw_io.py @@ -313,8 +313,9 @@ def test_read_fids_unsupported_keywords(naturalearth_lowres): with pytest.raises(ValueError, match="cannot set both 'fids' and any of"): read(naturalearth_lowres, fids=[1], bbox=(0, 0, 0.0001, 0.0001)) - with pytest.raises(ValueError, match="cannot set both 'fids' and any of"): - read(naturalearth_lowres, fids=[1], mask=shapely.Point(0, 0)) + if shapely is not None: + with pytest.raises(ValueError, match="cannot set both 'fids' and any of"): + read(naturalearth_lowres, fids=[1], mask=shapely.Point(0, 0)) def test_read_return_fids(naturalearth_lowres): From 5ac6f644355a132a9f40662717b65c74c0489cbc Mon Sep 17 00:00:00 2001 From: Brendan Ward Date: Tue, 26 Sep 2023 11:12:27 -0700 Subject: [PATCH 04/11] PR feedback --- docs/source/introduction.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/introduction.md b/docs/source/introduction.md index 97338870..f2d81434 100644 --- a/docs/source/introduction.md +++ b/docs/source/introduction.md @@ -207,7 +207,7 @@ with a Shapely (>= 2.0) geometry. ```python >>> mask = shapely.Polygon(([-80,8], [-80, 10], [-85,10], [-85,8], [-80,8])) ->>> read_dataframe('ne_10m_admin_0_countries.shp', mask) +>>> read_dataframe('ne_10m_admin_0_countries.shp', mask=mask) ``` Note: the `mask` values must be in the same CRS as the dataset. @@ -218,7 +218,7 @@ need to convert it to a Shapely geometry before using `mask`. ```python >>> mask_geojson = '{"type":"Polygon","coordinates":[[[-80.0,8.0],[-80.0,10.0],[-85.0,10.0],[-85.0,8.0],[-80.0,8.0]]]}' >>> mask = shapely.from_geojson(mask_geojson) ->>> read_dataframe('ne_10m_admin_0_countries.shp', mask) +>>> read_dataframe('ne_10m_admin_0_countries.shp', mask=mask) ``` Note: if GEOS is present and used by GDAL, only geometries that intersect `mask` From 5bc521bd350e4254eaee62b1b8866733c75b5a38 Mon Sep 17 00:00:00 2001 From: Brendan Ward Date: Tue, 26 Sep 2023 11:51:14 -0700 Subject: [PATCH 05/11] Refactor tests that require shapely --- pyogrio/tests/test_raw_io.py | 47 ++++++++++++++++++++---------------- 1 file changed, 26 insertions(+), 21 deletions(-) diff --git a/pyogrio/tests/test_raw_io.py b/pyogrio/tests/test_raw_io.py index c5df4786..0d5369a9 100644 --- a/pyogrio/tests/test_raw_io.py +++ b/pyogrio/tests/test_raw_io.py @@ -215,7 +215,9 @@ def test_read_bbox(naturalearth_lowres_all_ext): assert np.array_equal(fields[3], ["PAN", "CRI"]) -@pytest.mark.skipif(not shapely, reason="Shapely is required for mask functionality") +@pytest.mark.skipif( + not HAS_SHAPELY, reason="Shapely is required for mask functionality" +) @pytest.mark.parametrize( "mask", [ @@ -229,40 +231,45 @@ def test_read_mask_invalid(naturalearth_lowres, mask): read(naturalearth_lowres, mask=mask) -@pytest.mark.skipif(not shapely, reason="Shapely is required for mask functionality") +@pytest.mark.skipif( + not HAS_SHAPELY, reason="Shapely is required for mask functionality" +) def test_read_bbox_mask_invalid(naturalearth_lowres): with pytest.raises(ValueError, match="cannot set both 'bbox' and 'mask'"): read(naturalearth_lowres, bbox=(-85, 8, -80, 10), mask=shapely.Point(-105, 55)) -@pytest.mark.skipif(not shapely, reason="Shapely is required for mask functionality") +@pytest.mark.skipif( + not HAS_SHAPELY, reason="Shapely is required for mask functionality" +) @pytest.mark.parametrize( "mask,expected", [ - (shapely.Point(-105, 55), ["CAN"]), - (shapely.box(-85, 8, -80, 10), ["PAN", "CRI"]), + ("POINT (-105 55)", ["CAN"]), + ("POLYGON ((-80 8, -80 10, -85 10, -85 8, -80 8))", ["PAN", "CRI"]), ( - shapely.Polygon( - ( - [6.101929483362767, 50.97085041206964], - [5.773001596839322, 50.90661120482673], - [5.593156133704326, 50.642648747710325], - [6.059271089606312, 50.686051894002475], - [6.374064065737485, 50.851481340346965], - [6.101929483362767, 50.97085041206964], - ) - ), + """POLYGON (( + 6.101929 50.97085, + 5.773002 50.906611, + 5.593156 50.642649, + 6.059271 50.686052, + 6.374064 50.851481, + 6.101929 50.97085 + ))""", ["DEU", "BEL", "NLD"], ), ( - shapely.GeometryCollection( - [shapely.Point(-7.7, 53), shapely.box(-85, 8, -80, 10)] - ), + """GEOMETRYCOLLECTION ( + POINT (-7.7 53), + POLYGON ((-80 8, -80 10, -85 10, -85 8, -80 8)) + )""", ["PAN", "CRI", "IRL"], ), ], ) def test_read_mask(naturalearth_lowres_all_ext, mask, expected): + mask = shapely.from_wkt(mask) + geometry, fields = read(naturalearth_lowres_all_ext, mask=mask)[2:] assert np.array_equal(fields[3], expected) @@ -319,7 +326,7 @@ def test_read_fids_unsupported_keywords(naturalearth_lowres): with pytest.raises(ValueError, match="cannot set both 'fids' and any of"): read(naturalearth_lowres, fids=[1], bbox=(0, 0, 0.0001, 0.0001)) - if shapely is not None: + if HAS_SHAPELY: with pytest.raises(ValueError, match="cannot set both 'fids' and any of"): read(naturalearth_lowres, fids=[1], mask=shapely.Point(0, 0)) @@ -661,8 +668,6 @@ def assert_equal_result(result1, result2): assert all([np.array_equal(f1, f2) for f1, f2 in zip(field_data1, field_data2)]) if HAS_SHAPELY: - import shapely - # a plain `assert np.array_equal(geometry1, geometry2)` doesn't work # because the WKB values are not exactly equal, therefore parsing with # shapely to compare with tolerance From acf4a5accc172ea2fabda02a5b2dc2996e3b36bc Mon Sep 17 00:00:00 2001 From: Brendan Ward Date: Tue, 26 Sep 2023 11:59:09 -0700 Subject: [PATCH 06/11] Refactor other tests that require shapely --- pyogrio/tests/test_core.py | 52 +++++++++++++++++++----------------- pyogrio/tests/test_raw_io.py | 6 +---- 2 files changed, 28 insertions(+), 30 deletions(-) diff --git a/pyogrio/tests/test_core.py b/pyogrio/tests/test_core.py index c2889910..c65d4739 100644 --- a/pyogrio/tests/test_core.py +++ b/pyogrio/tests/test_core.py @@ -1,5 +1,3 @@ -from packaging.version import Version - import numpy as np from numpy import array_equal, allclose import pytest @@ -17,7 +15,7 @@ ) from pyogrio.core import detect_write_driver from pyogrio.errors import DataSourceError, DataLayerError -from pyogrio.tests.conftest import prepare_testfile +from pyogrio.tests.conftest import HAS_SHAPELY, prepare_testfile from pyogrio._env import GDALEnv @@ -29,11 +27,8 @@ try: import shapely - - if Version(shapely.__version__) < Version("2.0.0"): - shapely = None except ImportError: - shapely = None + pass def test_gdal_data(): @@ -253,7 +248,9 @@ def test_read_bounds_bbox(naturalearth_lowres_all_ext): ) -@pytest.mark.skipif(not shapely, reason="Shapely is required for mask functionality") +@pytest.mark.skipif( + not HAS_SHAPELY, reason="Shapely is required for mask functionality" +) @pytest.mark.parametrize( "mask", [ @@ -267,7 +264,9 @@ def test_read_bounds_mask_invalid(naturalearth_lowres, mask): read_bounds(naturalearth_lowres, mask=mask) -@pytest.mark.skipif(not shapely, reason="Shapely is required for mask functionality") +@pytest.mark.skipif( + not HAS_SHAPELY, reason="Shapely is required for mask functionality" +) def test_read_bounds_bbox_mask_invalid(naturalearth_lowres): with pytest.raises(ValueError, match="cannot set both 'bbox' and 'mask'"): read_bounds( @@ -275,34 +274,37 @@ def test_read_bounds_bbox_mask_invalid(naturalearth_lowres): ) -@pytest.mark.skipif(not shapely, reason="Shapely is required for mask functionality") +@pytest.mark.skipif( + not HAS_SHAPELY, reason="Shapely is required for mask functionality" +) @pytest.mark.parametrize( "mask,expected", [ - (shapely.Point(-105, 55), [3]), - (shapely.box(-85, 8, -80, 10), [33, 34]), + ("POINT (-105 55)", [3]), + ("POLYGON ((-80 8, -80 10, -85 10, -85 8, -80 8))", [33, 34]), ( - shapely.Polygon( - ( - [6.101929483362767, 50.97085041206964], - [5.773001596839322, 50.90661120482673], - [5.593156133704326, 50.642648747710325], - [6.059271089606312, 50.686051894002475], - [6.374064065737485, 50.851481340346965], - [6.101929483362767, 50.97085041206964], - ) - ), + """POLYGON (( + 6.101929 50.97085, + 5.773002 50.906611, + 5.593156 50.642649, + 6.059271 50.686052, + 6.374064 50.851481, + 6.101929 50.97085 + ))""", [121, 129, 130], ), ( - shapely.GeometryCollection( - [shapely.Point(-7.7, 53), shapely.box(-85, 8, -80, 10)] - ), + """GEOMETRYCOLLECTION ( + POINT (-7.7 53), + POLYGON ((-80 8, -80 10, -85 10, -85 8, -80 8)) + )""", [33, 34, 133], ), ], ) def test_read_bounds_mask(naturalearth_lowres_all_ext, mask, expected): + mask = shapely.from_wkt(mask) + fids = read_bounds(naturalearth_lowres_all_ext, mask=mask)[0] if naturalearth_lowres_all_ext.suffix == ".gpkg": diff --git a/pyogrio/tests/test_raw_io.py b/pyogrio/tests/test_raw_io.py index 0d5369a9..91663c5b 100644 --- a/pyogrio/tests/test_raw_io.py +++ b/pyogrio/tests/test_raw_io.py @@ -1,7 +1,6 @@ import contextlib import json import os -from packaging.version import Version import sys import numpy as np @@ -27,11 +26,8 @@ try: import shapely - - if Version(shapely.__version__) < Version("2.0.0"): - shapely = None except ImportError: - shapely = None + pass def test_read(naturalearth_lowres): From 6f126f0e15c0c846647196726a57de0696599aee Mon Sep 17 00:00:00 2001 From: Brendan Ward Date: Tue, 26 Sep 2023 12:36:22 -0700 Subject: [PATCH 07/11] Update changes --- CHANGES.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 69f0cf66..05660082 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -17,9 +17,9 @@ write drivers and addition of `detect_write_driver` (#270) - Addition of `mask` parameter to `open_arrow`, `read`, `read_dataframe`, and `read_bounds` functions to select only the features in the dataset that - intersect the mask geometry (#285). Note: there is a known GDAL bug when - using the Arrow interface that causes it to return too many features; a fix - is expected in GDAL 3.8.0. + intersect the mask geometry (#285). Note: GDAL < 3.8.0 returns features that + intersect the bounding box of the mask when using the Arrow interface for + some drivers; this has been fixed in GDAL 3.8.0. ### Other changes @@ -41,7 +41,7 @@ - the `features` property in the result will now be -1 if calculating the feature count is an expensive operation for this driver. You can force it to be calculated using the `force_feature_count` parameter. - - for boolean values in the `capabilities` property, the values will now be + - for boolean values in the `capabilities` property, the values will now be booleans instead of 1 or 0. ## 0.6.0 (2023-04-27) From 7b94721fb2c62c3c1f070aa85501b52aad8cde5a Mon Sep 17 00:00:00 2001 From: Brendan Ward Date: Thu, 28 Sep 2023 09:41:44 -0700 Subject: [PATCH 08/11] Add tests of bbox and mask with sql & where --- pyogrio/geopandas.py | 3 +- pyogrio/raw.py | 5 ++-- pyogrio/tests/test_geopandas_io.py | 44 ++++++++++++++++++++++++++++++ pyogrio/tests/test_raw_io.py | 43 +++++++++++++++++++++++++++++ 4 files changed, 90 insertions(+), 5 deletions(-) diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py index 930037b1..b4de7c64 100644 --- a/pyogrio/geopandas.py +++ b/pyogrio/geopandas.py @@ -121,8 +121,7 @@ def read_dataframe( with other keywords like ``columns``, ``skip_features``, ``max_features``, ``where`` or ``bbox``, those are applied after the SQL query. Be aware that this can have an impact on performance, - (e.g. filtering with the ``bbox`` keyword may not use - spatial indexes). + (e.g. filtering with the ``bbox`` keyword may not use spatial indexes). Cannot be combined with the ``layer`` or ``fids`` keywords. sql_dialect : str, optional (default: None) The SQL dialect the SQL statement is written in. Possible values: diff --git a/pyogrio/raw.py b/pyogrio/raw.py index b116719c..6ffae007 100644 --- a/pyogrio/raw.py +++ b/pyogrio/raw.py @@ -113,7 +113,7 @@ def read( Array of integer feature id (FID) values to select. Cannot be combined with other keywords to select a subset (``skip_features``, ``max_features``, ``where``, ``bbox``, or ``mask``). Note that the - starting index is driver and file specific (e.g. typically 0 for + starting index is driver and file specific (e.g. typically 0 for Shapefile and 1 for GeoPackage, but can still depend on the specific file). The performance of reading a large number of features usings FIDs is also driver specific. @@ -123,8 +123,7 @@ def read( with other keywords like ``columns``, ``skip_features``, ``max_features``, ``where`` or ``bbox``, those are applied after the SQL query. Be aware that this can have an impact on performance, - (e.g. filtering with the ``bbox`` keyword may not use - spatial indexes). + (e.g. filtering with the ``bbox`` keyword may not use spatial indexes). Cannot be combined with the ``layer`` or ``fids`` keywords. sql_dialect : str, optional (default: None) The SQL dialect the ``sql`` statement is written in. Possible values: diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index d0e51cac..fa6c553a 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -291,6 +291,28 @@ def test_read_bbox(naturalearth_lowres_all_ext, use_arrow, bbox, expected): assert np.array_equal(df.iso_a3, expected) +def test_read_bbox_sql(naturalearth_lowres_all_ext, use_arrow): + df = read_dataframe( + naturalearth_lowres_all_ext, + use_arrow=use_arrow, + bbox=(-180, 50, -100, 90), + sql="SELECT * from naturalearth_lowres where iso_a3 not in ('USA', 'RUS')", + ) + assert len(df) == 1 + assert np.array_equal(df.iso_a3, ["CAN"]) + + +def test_read_bbox_where(naturalearth_lowres_all_ext, use_arrow): + df = read_dataframe( + naturalearth_lowres_all_ext, + use_arrow=use_arrow, + bbox=(-180, 50, -100, 90), + where="iso_a3 not in ('USA', 'RUS')", + ) + assert len(df) == 1 + assert np.array_equal(df.iso_a3, ["CAN"]) + + @pytest.mark.parametrize( "mask", [ @@ -359,6 +381,28 @@ def test_read_mask( assert np.array_equal(df.iso_a3, expected) +def test_read_mask_sql(naturalearth_lowres_all_ext, use_arrow): + df = read_dataframe( + naturalearth_lowres_all_ext, + use_arrow=use_arrow, + mask=shapely.box(-180, 50, -100, 90), + sql="SELECT * from naturalearth_lowres where iso_a3 not in ('USA', 'RUS')", + ) + assert len(df) == 1 + assert np.array_equal(df.iso_a3, ["CAN"]) + + +def test_read_mask_where(naturalearth_lowres_all_ext, use_arrow): + df = read_dataframe( + naturalearth_lowres_all_ext, + use_arrow=use_arrow, + mask=shapely.box(-180, 50, -100, 90), + where="iso_a3 not in ('USA', 'RUS')", + ) + assert len(df) == 1 + assert np.array_equal(df.iso_a3, ["CAN"]) + + def test_read_fids(naturalearth_lowres_all_ext): # ensure keyword is properly passed through fids = np.array([1, 10, 5], dtype=np.int64) diff --git a/pyogrio/tests/test_raw_io.py b/pyogrio/tests/test_raw_io.py index 91663c5b..35052803 100644 --- a/pyogrio/tests/test_raw_io.py +++ b/pyogrio/tests/test_raw_io.py @@ -211,6 +211,26 @@ def test_read_bbox(naturalearth_lowres_all_ext): assert np.array_equal(fields[3], ["PAN", "CRI"]) +def test_read_bbox_sql(naturalearth_lowres_all_ext): + fields = read( + naturalearth_lowres_all_ext, + bbox=(-180, 50, -100, 90), + sql="SELECT * from naturalearth_lowres where iso_a3 not in ('USA', 'RUS')", + )[3] + assert len(fields[3]) == 1 + assert np.array_equal(fields[3], ["CAN"]) + + +def test_read_bbox_where(naturalearth_lowres_all_ext): + fields = read( + naturalearth_lowres_all_ext, + bbox=(-180, 50, -100, 90), + where="iso_a3 not in ('USA', 'RUS')", + )[3] + assert len(fields[3]) == 1 + assert np.array_equal(fields[3], ["CAN"]) + + @pytest.mark.skipif( not HAS_SHAPELY, reason="Shapely is required for mask functionality" ) @@ -272,6 +292,29 @@ def test_read_mask(naturalearth_lowres_all_ext, mask, expected): assert len(geometry) == len(expected) +@pytest.mark.skipif( + not HAS_SHAPELY, reason="Shapely is required for mask functionality" +) +def test_read_mask_sql(naturalearth_lowres_all_ext): + fields = read( + naturalearth_lowres_all_ext, + mask=shapely.box(-180, 50, -100, 90), + sql="SELECT * from naturalearth_lowres where iso_a3 not in ('USA', 'RUS')", + )[3] + assert len(fields[3]) == 1 + assert np.array_equal(fields[3], ["CAN"]) + + +def test_read_mask_where(naturalearth_lowres_all_ext): + fields = read( + naturalearth_lowres_all_ext, + mask=shapely.box(-180, 50, -100, 90), + where="iso_a3 not in ('USA', 'RUS')", + )[3] + assert len(fields[3]) == 1 + assert np.array_equal(fields[3], ["CAN"]) + + def test_read_fids(naturalearth_lowres): expected_fids, expected_geometry, expected_fields = read( naturalearth_lowres, return_fids=True From d277c87e6cbc56e02daf29e16eb075b7ec6ce629 Mon Sep 17 00:00:00 2001 From: Brendan Ward Date: Thu, 28 Sep 2023 09:49:18 -0700 Subject: [PATCH 09/11] fix missing skipif for shapely --- pyogrio/tests/test_raw_io.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pyogrio/tests/test_raw_io.py b/pyogrio/tests/test_raw_io.py index 35052803..99d06c79 100644 --- a/pyogrio/tests/test_raw_io.py +++ b/pyogrio/tests/test_raw_io.py @@ -305,6 +305,9 @@ def test_read_mask_sql(naturalearth_lowres_all_ext): assert np.array_equal(fields[3], ["CAN"]) +@pytest.mark.skipif( + not HAS_SHAPELY, reason="Shapely is required for mask functionality" +) def test_read_mask_where(naturalearth_lowres_all_ext): fields = read( naturalearth_lowres_all_ext, From 6560c7ae9a947fc3ec67396c0bad90fb8b984573 Mon Sep 17 00:00:00 2001 From: Brendan Ward Date: Fri, 29 Sep 2023 08:35:35 -0700 Subject: [PATCH 10/11] Update docstrings --- pyogrio/geopandas.py | 12 ++++++------ pyogrio/raw.py | 12 ++++++------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py index b4de7c64..8566c94b 100644 --- a/pyogrio/geopandas.py +++ b/pyogrio/geopandas.py @@ -116,12 +116,12 @@ def read_dataframe( file). The performance of reading a large number of features usings FIDs is also driver specific. sql : str, optional (default: None) - The SQL statement to execute. Look at the sql_dialect parameter for - more information on the syntax to use for the query. When combined - with other keywords like ``columns``, ``skip_features``, - ``max_features``, ``where`` or ``bbox``, those are applied after the - SQL query. Be aware that this can have an impact on performance, - (e.g. filtering with the ``bbox`` keyword may not use spatial indexes). + The SQL statement to execute. Look at the sql_dialect parameter for more + information on the syntax to use for the query. When combined with other + keywords like ``columns``, ``skip_features``, ``max_features``, + ``where``, ``bbox``, or ``mask``, those are applied after the SQL query. + Be aware that this can have an impact on performance, (e.g. filtering + with the ``bbox`` or ``mask`` keywords may not use spatial indexes). Cannot be combined with the ``layer`` or ``fids`` keywords. sql_dialect : str, optional (default: None) The SQL dialect the SQL statement is written in. Possible values: diff --git a/pyogrio/raw.py b/pyogrio/raw.py index 6ffae007..ba88d29e 100644 --- a/pyogrio/raw.py +++ b/pyogrio/raw.py @@ -118,12 +118,12 @@ def read( file). The performance of reading a large number of features usings FIDs is also driver specific. sql : str, optional (default: None) - The SQL statement to execute. See the sql_dialect parameter for - more information on the syntax to use for the query. When combined - with other keywords like ``columns``, ``skip_features``, - ``max_features``, ``where`` or ``bbox``, those are applied after the - SQL query. Be aware that this can have an impact on performance, - (e.g. filtering with the ``bbox`` keyword may not use spatial indexes). + The SQL statement to execute. Look at the sql_dialect parameter for more + information on the syntax to use for the query. When combined with other + keywords like ``columns``, ``skip_features``, ``max_features``, + ``where``, ``bbox``, or ``mask``, those are applied after the SQL query. + Be aware that this can have an impact on performance, (e.g. filtering + with the ``bbox`` or ``mask`` keywords may not use spatial indexes). Cannot be combined with the ``layer`` or ``fids`` keywords. sql_dialect : str, optional (default: None) The SQL dialect the ``sql`` statement is written in. Possible values: From 64afe8213fac35f5a9907989bebc998f4cfe0f0d Mon Sep 17 00:00:00 2001 From: Brendan Ward Date: Fri, 29 Sep 2023 14:37:36 -0700 Subject: [PATCH 11/11] Merge branch 'main' into add_mask --- .github/workflows/docker-gdal.yml | 2 +- .github/workflows/release.yml | 11 +-- CHANGES.md | 10 +++ ci/manylinux2014_x86_64-vcpkg-gdal.Dockerfile | 2 +- ...nylinux_2_28_aarch64-vcpkg-gdal.Dockerfile | 2 +- ci/requirements-wheel-test.txt | 6 +- docs/source/introduction.md | 10 +++ pyogrio/_io.pyx | 32 ++++++--- pyogrio/geopandas.py | 14 ++-- pyogrio/raw.py | 65 ++++++++++++++--- pyogrio/tests/test_arrow.py | 70 ++++++++++++++++++- pyogrio/tests/test_core.py | 7 +- pyogrio/tests/test_geopandas_io.py | 33 +++++---- pyogrio/tests/test_raw_io.py | 32 +++++---- 14 files changed, 223 insertions(+), 73 deletions(-) diff --git a/.github/workflows/docker-gdal.yml b/.github/workflows/docker-gdal.yml index e379819b..1ab42ee1 100644 --- a/.github/workflows/docker-gdal.yml +++ b/.github/workflows/docker-gdal.yml @@ -43,7 +43,7 @@ jobs: - name: Install pyarrow # GDAL>=3.6 required to use Arrow API - if: matrix.container != 'osgeo/gdal:ubuntu-small-3.5.3' + if: matrix.container != 'osgeo/gdal:ubuntu-small-3.5.3' && matrix.container != 'osgeo/gdal:ubuntu-small-3.4.3' run: | python3 -m pip install pyarrow diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index ab92e99d..2430fcc6 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -110,7 +110,7 @@ jobs: BUILDKIT_PROGRESS: plain - name: Build wheels - uses: pypa/cibuildwheel@v2.14.1 + uses: pypa/cibuildwheel@v2.16.0 - uses: actions/upload-artifact@v3 with: @@ -190,7 +190,7 @@ jobs: path: ${{ matrix.vcpkg_logs }} - name: Build wheels - uses: pypa/cibuildwheel@v2.14.1 + uses: pypa/cibuildwheel@v2.16.0 env: # CIBW needs to know triplet for the correct install path VCPKG_DEFAULT_TRIPLET: ${{ matrix.triplet }} @@ -208,7 +208,7 @@ jobs: fail-fast: false matrix: os: ["ubuntu-20.04", "windows-latest", "macos-11", "macos-latest"] - python-version: ["3.8", "3.9", "3.10", "3.11"] + python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] steps: - name: Checkout @@ -218,6 +218,7 @@ jobs: uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} + allow-prereleases: true cache: "pip" cache-dependency-path: "ci/requirements-wheel-test.txt" @@ -230,7 +231,9 @@ jobs: shell: bash run: | python -m pip install -r ci/requirements-wheel-test.txt - python -m pip install --no-deps geopandas + if [ ${{ matrix.python-version }} != "3.12" ]; then + python -m pip install --no-deps geopandas + fi python -m pip install --pre --find-links wheelhouse/artifact pyogrio python -m pip list diff --git a/CHANGES.md b/CHANGES.md index 05660082..c32450b8 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -20,11 +20,21 @@ intersect the mask geometry (#285). Note: GDAL < 3.8.0 returns features that intersect the bounding box of the mask when using the Arrow interface for some drivers; this has been fixed in GDAL 3.8.0. +- Removed warning when no features are read from the data source (#299). ### Other changes - test suite requires Shapely >= 2.0 +- using `skip_features` greater than the number of features available in a data + layer now returns empty arrays for `read` and an empty DataFrame for + `read_dataframe` instead of raising a `ValueError` (#282). +- enabled `skip_features` and `max_features` for `read_arrow` and + `read_dataframe(path, use_arrow=True)`. Note that this incurs overhead + because all features up to the next batch size above `max_features` (or size + of data layer) will be read prior to slicing out the requested range of + features (#282). + ### Bug fixes - Fix int32 overflow when reading int64 columns (#260) diff --git a/ci/manylinux2014_x86_64-vcpkg-gdal.Dockerfile b/ci/manylinux2014_x86_64-vcpkg-gdal.Dockerfile index 555df484..b45b6730 100644 --- a/ci/manylinux2014_x86_64-vcpkg-gdal.Dockerfile +++ b/ci/manylinux2014_x86_64-vcpkg-gdal.Dockerfile @@ -1,4 +1,4 @@ -FROM quay.io/pypa/manylinux2014_x86_64:2023-04-24-82a68e6 +FROM quay.io/pypa/manylinux2014_x86_64:2023-09-23-e243937 # building openssl needs IPC-Cmd (https://github.com/microsoft/vcpkg/issues/24988) RUN yum install -y curl unzip zip tar perl-IPC-Cmd diff --git a/ci/manylinux_2_28_aarch64-vcpkg-gdal.Dockerfile b/ci/manylinux_2_28_aarch64-vcpkg-gdal.Dockerfile index 44e51d0d..759b6fa2 100644 --- a/ci/manylinux_2_28_aarch64-vcpkg-gdal.Dockerfile +++ b/ci/manylinux_2_28_aarch64-vcpkg-gdal.Dockerfile @@ -1,4 +1,4 @@ -FROM quay.io/pypa/manylinux_2_28_aarch64:2023-04-16-157f52a +FROM quay.io/pypa/manylinux_2_28_aarch64:2023-09-23-e243937 # building openssl needs IPC-Cmd (https://github.com/microsoft/vcpkg/issues/24988) RUN dnf -y install curl zip unzip tar ninja-build perl-IPC-Cmd diff --git a/ci/requirements-wheel-test.txt b/ci/requirements-wheel-test.txt index b9000e60..c0f172b8 100644 --- a/ci/requirements-wheel-test.txt +++ b/ci/requirements-wheel-test.txt @@ -1,8 +1,8 @@ pytest # dependencies of geopandas (installed separately with --no-deps to avoid fiona) pandas -pyproj -shapely>=2 +pyproj ; (python_version < '3.12') or (python_full_version >= '3.12.1') +shapely>=2 ; (python_version < '3.12') or (python_full_version >= '3.12.1') packaging # optional test dependencies -pyarrow +pyarrow ; (python_version < '3.12') or (python_full_version >= '3.12.1') diff --git a/docs/source/introduction.md b/docs/source/introduction.md index f2d81434..3fd0c324 100644 --- a/docs/source/introduction.md +++ b/docs/source/introduction.md @@ -170,6 +170,16 @@ processes: >>> read_dataframe('ne_10m_admin_0_countries.shp', skip_features=10, max_features=10) ``` +NOTE: if `use_arrow` is `True`, `skip_features` and `max_features` will incur +additional overhead because all features up to the next batch size above +`max_features` (or size of data layer) will be read prior to slicing out the +requested range of features. If `max_features` is less than the maximum Arrow +batch size (65,536 features) only `max_features` will be read. All features +up to `skip_features` are read from the data source and later discarded because +the Arrow interface does not support randomly seeking a starting feature. This +overhead is in comparison to reading via Arrow without these parameters, which +is generally much faster than not using Arrow. + ## Filter records by attribute value You can use the `where` parameter to filter features in layer by attribute values. If diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx index 75a64547..71e389b5 100644 --- a/pyogrio/_io.pyx +++ b/pyogrio/_io.pyx @@ -402,7 +402,7 @@ cdef get_total_bounds(OGRLayerH ogr_layer, int force): except CPLE_BaseError: bounds = None - + return bounds @@ -654,13 +654,10 @@ cdef validate_feature_range(OGRLayerH ogr_layer, int skip_features=0, int max_fe num_features = max_features if feature_count == 0: - name = OGR_L_GetName(ogr_layer) - warnings.warn(f"Layer '{name}' does not have any features to read") return 0, 0 - # validate skip_features, max_features if skip_features < 0 or skip_features >= feature_count: - raise ValueError(f"'skip_features' must be between 0 and {feature_count-1}") + skip_features = feature_count if max_features < 0: raise ValueError("'max_features' must be >= 0") @@ -845,6 +842,10 @@ cdef get_features( ] field_data_view = [field_data[field_index][:] for field_index in range(n_fields)] + + if num_features == 0: + return fid_data, geometries, field_data + i = 0 while True: try: @@ -1242,6 +1243,9 @@ def ogr_open_arrow( cdef ArrowArrayStream stream cdef ArrowSchema schema + IF CTE_GDAL_VERSION < (3, 6, 0): + raise RuntimeError("Need GDAL>=3.6 for Arrow support") + path_b = path.encode('utf-8') path_c = path_b @@ -1251,9 +1255,15 @@ def ogr_open_arrow( if fids is not None: raise ValueError("reading by FID is not supported for Arrow") - if skip_features or max_features: + IF CTE_GDAL_VERSION < (3, 8, 0): + if skip_features: + raise ValueError( + "specifying 'skip_features' is not supported for Arrow for GDAL<3.8.0" + ) + + if max_features: raise ValueError( - "specifying 'skip_features' or 'max_features' is not supported for Arrow" + "specifying 'max_features' is not supported for Arrow" ) if sql is not None and layer is not None: @@ -1343,14 +1353,16 @@ def ogr_open_arrow( # make sure layer is read from beginning OGR_L_ResetReading(ogr_layer) - IF CTE_GDAL_VERSION < (3, 6, 0): - raise RuntimeError("Need GDAL>=3.6 for Arrow support") - if not OGR_L_GetArrowStream(ogr_layer, &stream, options): raise RuntimeError("Failed to open ArrowArrayStream from Layer") stream_ptr = &stream + if skip_features: + # only supported for GDAL >= 3.8.0; have to do this after getting + # the Arrow stream + OGR_L_SetNextByIndex(ogr_layer, skip_features) + # stream has to be consumed before the Dataset is closed import pyarrow as pa reader = pa.RecordBatchStreamReader._import_from_c(stream_ptr) diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py index 8566c94b..ab0503c5 100644 --- a/pyogrio/geopandas.py +++ b/pyogrio/geopandas.py @@ -77,14 +77,14 @@ def read_dataframe( If the geometry has Z values, setting this to True will cause those to be ignored and 2D geometries to be returned skip_features : int, optional (default: 0) - Number of features to skip from the beginning of the file before returning - features. Must be less than the total number of features in the file. - Using this parameter may incur significant overhead if the driver does - not support the capability to randomly seek to a specific feature, - because it will need to iterate over all prior features. + Number of features to skip from the beginning of the file before + returning features. If greater than available number of features, an + empty DataFrame will be returned. Using this parameter may incur + significant overhead if the driver does not support the capability to + randomly seek to a specific feature, because it will need to iterate + over all prior features. max_features : int, optional (default: None) - Number of features to read from the file. Must be less than the total - number of features in the file minus skip_features (if used). + Number of features to read from the file. where : str, optional (default: None) Where clause to filter features in layer by attribute values. If the data source natively supports SQL, its specific SQL dialect should be used (eg. SQLite and diff --git a/pyogrio/raw.py b/pyogrio/raw.py index ba88d29e..a315b553 100644 --- a/pyogrio/raw.py +++ b/pyogrio/raw.py @@ -79,14 +79,14 @@ def read( If the geometry has Z values, setting this to True will cause those to be ignored and 2D geometries to be returned skip_features : int, optional (default: 0) - Number of features to skip from the beginning of the file before returning - features. Must be less than the total number of features in the file. - Using this parameter may incur significant overhead if the driver does - not support the capability to randomly seek to a specific feature, - because it will need to iterate over all prior features. + Number of features to skip from the beginning of the file before + returning features. If greater than available number of features, an + empty DataFrame will be returned. Using this parameter may incur + significant overhead if the driver does not support the capability to + randomly seek to a specific feature, because it will need to iterate + over all prior features. max_features : int, optional (default: None) - Number of features to read from the file. Must be less than the total - number of features in the file minus skip_features (if used). + Number of features to read from the file. where : str, optional (default: None) Where clause to filter features in layer by attribute values. If the data source natively supports SQL, its specific SQL dialect should be used (eg. SQLite and @@ -248,6 +248,23 @@ def read_arrow( "geometry_name": "", } """ + from pyarrow import Table + + # limit batch size to max_features if set + if "batch_size" in kwargs: + batch_size = kwargs.pop("batch_size") + else: + batch_size = 65_536 + + if max_features is not None and max_features < batch_size: + batch_size = max_features + + # handle skip_features internally within open_arrow if GDAL >= 3.8.0 + gdal_skip_features = 0 + if get_gdal_version() >= (3, 8, 0): + gdal_skip_features = skip_features + skip_features = 0 + with open_arrow( path_or_buffer, layer=layer, @@ -255,8 +272,6 @@ def read_arrow( columns=columns, read_geometry=read_geometry, force_2d=force_2d, - skip_features=skip_features, - max_features=max_features, where=where, bbox=bbox, mask=mask, @@ -264,10 +279,40 @@ def read_arrow( sql=sql, sql_dialect=sql_dialect, return_fids=return_fids, + skip_features=gdal_skip_features, + batch_size=batch_size, **kwargs, ) as source: meta, reader = source - table = reader.read_all() + + if max_features is not None: + batches = [] + count = 0 + while True: + try: + batch = reader.read_next_batch() + batches.append(batch) + + count += len(batch) + if count >= (skip_features + max_features): + break + + except StopIteration: + break + + # use combine_chunks to release the original memory that included + # too many features + table = ( + Table.from_batches(batches, schema=reader.schema) + .slice(skip_features, max_features) + .combine_chunks() + ) + + elif skip_features > 0: + table = reader.read_all().slice(skip_features).combine_chunks() + + else: + table = reader.read_all() return meta, table diff --git a/pyogrio/tests/test_arrow.py b/pyogrio/tests/test_arrow.py index 6d55123b..5f9f6660 100644 --- a/pyogrio/tests/test_arrow.py +++ b/pyogrio/tests/test_arrow.py @@ -2,7 +2,7 @@ import pytest -from pyogrio import read_dataframe +from pyogrio import __gdal_version__, read_dataframe from pyogrio.raw import open_arrow, read_arrow from pyogrio.tests.conftest import requires_arrow_api @@ -31,6 +31,41 @@ def test_read_arrow(naturalearth_lowres_all_ext): assert_geodataframe_equal(result, expected, check_less_precise=check_less_precise) +@pytest.mark.parametrize("skip_features, expected", [(10, 167), (200, 0)]) +def test_read_arrow_skip_features(naturalearth_lowres, skip_features, expected): + table = read_arrow(naturalearth_lowres, skip_features=skip_features)[1] + assert len(table) == expected + + +@pytest.mark.parametrize( + "max_features, expected", [(0, 0), (10, 10), (200, 177), (100000, 177)] +) +def test_read_arrow_max_features(naturalearth_lowres, max_features, expected): + table = read_arrow(naturalearth_lowres, max_features=max_features)[1] + assert len(table) == expected + + +@pytest.mark.parametrize( + "skip_features, max_features, expected", + [ + (0, 0, 0), + (10, 0, 0), + (200, 0, 0), + (1, 200, 176), + (176, 10, 1), + (100, 100, 77), + (100, 100000, 77), + ], +) +def test_read_arrow_skip_features_max_features( + naturalearth_lowres, skip_features, max_features, expected +): + table = read_arrow( + naturalearth_lowres, skip_features=skip_features, max_features=max_features + )[1] + assert len(table) == expected + + def test_read_arrow_fid(naturalearth_lowres_all_ext): kwargs = {"use_arrow": True, "where": "fid >= 2 AND fid <= 3"} @@ -91,3 +126,36 @@ def test_open_arrow_batch_size(naturalearth_lowres): assert count == 2, "Should be two batches given the batch_size parameter" assert len(tables[0]) == batch_size, "First table should match the batch size" + + +@pytest.mark.skipif( + __gdal_version__ >= (3, 8, 0), + reason="skip_features supported by Arrow stream API for GDAL>=3.8.0", +) +@pytest.mark.parametrize("skip_features", [10, 200]) +def test_open_arrow_skip_features_unsupported(naturalearth_lowres, skip_features): + """skip_features are not supported for the Arrow stream interface for + GDAL < 3.8.0""" + with pytest.raises( + ValueError, + match="specifying 'skip_features' is not supported for Arrow for GDAL<3.8.0", + ): + with open_arrow(naturalearth_lowres, skip_features=skip_features) as ( + meta, + reader, + ): + pass + + +@pytest.mark.parametrize("max_features", [10, 200]) +def test_open_arrow_max_features_unsupported(naturalearth_lowres, max_features): + """max_features are not supported for the Arrow stream interface""" + with pytest.raises( + ValueError, + match="specifying 'max_features' is not supported for Arrow", + ): + with open_arrow(naturalearth_lowres, max_features=max_features) as ( + meta, + reader, + ): + pass diff --git a/pyogrio/tests/test_core.py b/pyogrio/tests/test_core.py index c65d4739..0f7536b9 100644 --- a/pyogrio/tests/test_core.py +++ b/pyogrio/tests/test_core.py @@ -220,10 +220,9 @@ def test_read_bounds_bbox_invalid(naturalearth_lowres, bbox): def test_read_bounds_bbox(naturalearth_lowres_all_ext): # should return no features - with pytest.warns(UserWarning, match="does not have any features to read"): - fids, bounds = read_bounds( - naturalearth_lowres_all_ext, bbox=(0, 0, 0.00001, 0.00001) - ) + fids, bounds = read_bounds( + naturalearth_lowres_all_ext, bbox=(0, 0, 0.00001, 0.00001) + ) assert fids.shape == (0,) assert bounds.shape == (4, 0) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index fa6c553a..21af2e63 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -220,7 +220,6 @@ def test_read_fid_as_index_only(naturalearth_lowres, use_arrow): assert len(df.columns) == 0 -@pytest.mark.filterwarnings("ignore:.*Layer .* does not have any features to read") def test_read_where(naturalearth_lowres_all_ext): # empty filter should return full set of records df = read_dataframe(naturalearth_lowres_all_ext, where="") @@ -250,7 +249,6 @@ def test_read_where(naturalearth_lowres_all_ext): assert len(df) == 0 -@pytest.mark.filterwarnings("ignore:.*Layer .* does not have any features to read") def test_read_where_invalid(naturalearth_lowres_all_ext): with pytest.raises(ValueError, match="Invalid SQL"): read_dataframe(naturalearth_lowres_all_ext, where="invalid") @@ -278,15 +276,7 @@ def test_read_bbox(naturalearth_lowres_all_ext, use_arrow, bbox, expected): ): pytest.xfail(reason="GDAL bug: https://github.com/OSGeo/gdal/issues/8347") - if len(expected) == 0 and not use_arrow: - # should return no features - with pytest.warns(UserWarning, match="does not have any features to read"): - df = read_dataframe( - naturalearth_lowres_all_ext, use_arrow=use_arrow, bbox=bbox - ) - - else: - df = read_dataframe(naturalearth_lowres_all_ext, use_arrow=use_arrow, bbox=bbox) + df = read_dataframe(naturalearth_lowres_all_ext, use_arrow=use_arrow, bbox=bbox) assert np.array_equal(df.iso_a3, expected) @@ -424,6 +414,17 @@ def test_read_fids_force_2d(test_fgdb_vsi): assert not df.iloc[0].geometry.has_z +@pytest.mark.parametrize("skip_features, expected", [(10, 167), (200, 0)]) +def test_read_skip_features( + naturalearth_lowres_all_ext, use_arrow, skip_features, expected +): + df = read_dataframe( + naturalearth_lowres_all_ext, skip_features=skip_features, use_arrow=use_arrow + ) + assert len(df) == expected + assert isinstance(df, gp.GeoDataFrame) + + def test_read_non_existent_file(): # ensure consistent error type / message from GDAL with pytest.raises(DataSourceError, match="No such file or directory"): @@ -436,7 +437,6 @@ def test_read_non_existent_file(): read_dataframe("zip:///non-existent.zip") -@pytest.mark.filterwarnings("ignore:.*Layer .* does not have any features to read") def test_read_sql(naturalearth_lowres_all_ext): # The geometry column cannot be specified when using the # default OGRSQL dialect but is returned nonetheless, so 4 columns. @@ -551,10 +551,10 @@ def test_read_sql_skip_max(naturalearth_lowres_all_ext): assert len(df) == 1 sql = "SELECT * FROM naturalearth_lowres LIMIT 1" - with pytest.raises(ValueError, match="'skip_features' must be between 0 and 0"): - _ = read_dataframe( - naturalearth_lowres_all_ext, sql=sql, skip_features=1, sql_dialect="OGRSQL" - ) + df = read_dataframe( + naturalearth_lowres_all_ext, sql=sql, skip_features=1, sql_dialect="OGRSQL" + ) + assert len(df) == 0 @requires_gdal_geos @@ -690,7 +690,6 @@ def test_write_dataframe_no_geom(tmp_path, naturalearth_lowres, ext): ) -@pytest.mark.filterwarnings("ignore:.*Layer .* does not have any features to read") @pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext not in ".geojsonl"]) def test_write_empty_dataframe(tmp_path, ext): expected = gp.GeoDataFrame(geometry=[], crs=4326) diff --git a/pyogrio/tests/test_raw_io.py b/pyogrio/tests/test_raw_io.py index 99d06c79..fe86b249 100644 --- a/pyogrio/tests/test_raw_io.py +++ b/pyogrio/tests/test_raw_io.py @@ -134,16 +134,22 @@ def test_read_columns(naturalearth_lowres): array_equal(meta["fields"], columns[:2]) -def test_read_skip_features(naturalearth_lowres): - expected_geometry, expected_fields = read(naturalearth_lowres)[2:] - geometry, fields = read(naturalearth_lowres, skip_features=10)[2:] +@pytest.mark.parametrize("skip_features", [10, 200]) +def test_read_skip_features(naturalearth_lowres_all_ext, skip_features): + expected_geometry, expected_fields = read(naturalearth_lowres_all_ext)[2:] + geometry, fields = read(naturalearth_lowres_all_ext, skip_features=skip_features)[ + 2: + ] + + # skipping more features than available in layer returns empty arrays + expected_count = max(len(expected_geometry) - skip_features, 0) - assert len(geometry) == len(expected_geometry) - 10 - assert len(fields[0]) == len(expected_fields[0]) - 10 + assert len(geometry) == expected_count + assert len(fields[0]) == expected_count - assert np.array_equal(geometry, expected_geometry[10:]) + assert np.array_equal(geometry, expected_geometry[skip_features:]) # Last field has more variable data - assert np.array_equal(fields[-1], expected_fields[-1][10:]) + assert np.array_equal(fields[-1], expected_fields[-1][skip_features:]) def test_read_max_features(naturalearth_lowres): @@ -180,9 +186,8 @@ def test_read_where(naturalearth_lowres): assert max(fields[0]) < 100000000 # should match no items - with pytest.warns(UserWarning, match="does not have any features to read"): - geometry, fields = read(naturalearth_lowres, where="iso_a3 = 'INVALID'")[2:] - assert len(geometry) == 0 + geometry, fields = read(naturalearth_lowres, where="iso_a3 = 'INVALID'")[2:] + assert len(geometry) == 0 def test_read_where_invalid(naturalearth_lowres): @@ -198,10 +203,9 @@ def test_read_bbox_invalid(naturalearth_lowres, bbox): def test_read_bbox(naturalearth_lowres_all_ext): # should return no features - with pytest.warns(UserWarning, match="does not have any features to read"): - geometry, fields = read( - naturalearth_lowres_all_ext, bbox=(0, 0, 0.00001, 0.00001) - )[2:] + geometry, fields = read(naturalearth_lowres_all_ext, bbox=(0, 0, 0.00001, 0.00001))[ + 2: + ] assert len(geometry) == 0