diff --git a/CHANGES.md b/CHANGES.md index 149ae09b..f5e55300 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -23,6 +23,7 @@ the previous behaviour of returning a `pyarrow.RecordBatchReader`, specify `use_pyarrow=True` (#349). - Warn when reading from a multilayer file without specifying a layer (#362). +- Allow writing to a new in-memory datasource using io.BytesIO object (#397). ### Bug fixes diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx index 4a5da6db..16bc8fa1 100644 --- a/pyogrio/_io.pyx +++ b/pyogrio/_io.pyx @@ -6,16 +6,19 @@ import contextlib import datetime +from io import BytesIO import locale import logging import math import os +from pathlib import Path import sys +from uuid import uuid4 import warnings from libc.stdint cimport uint8_t, uintptr_t from libc.stdlib cimport malloc, free -from libc.string cimport strlen +from libc.string cimport memcpy, strlen from libc.math cimport isnan from cpython.pycapsule cimport PyCapsule_GetPointer @@ -29,7 +32,7 @@ from pyogrio._err cimport * from pyogrio._err import CPLE_BaseError, CPLE_NotSupportedError, NullPointerError from pyogrio._geometry cimport get_geometry_type, get_geometry_type_code from pyogrio.errors import CRSError, DataSourceError, DataLayerError, GeometryError, FieldError, FeatureError - +from pyogrio._ogr import _get_driver_metadata_item log = logging.getLogger(__name__) @@ -173,6 +176,17 @@ cdef const char* override_threadlocal_config_option(str key, str value): cdef void* ogr_open(const char* path_c, int mode, char** options) except NULL: + """Open an existing OGR data source + + Parameters + ---------- + path_c : char * + input path, including an in-memory path (/vsimem/...) 
+ mode : int + set to 1 to allow updating data source + options : char **, optional + dataset open options + """ cdef void* ogr_dataset = NULL # Force linear approximations in all cases @@ -1976,12 +1990,105 @@ cdef infer_field_types(list dtypes): return field_types +cdef str get_ogr_vsimem_write_path(object path_or_fp, str driver): + """ Return the original path or a /vsimem/ path + + If passed a io.BytesIO object, this will return a /vsimem/ path that can be + used to create a new in-memory file with an extension inferred from the driver + if possible. Path will be contained in an in-memory directory to contain + sibling files (though drivers that create sibling files are not supported for + in-memory files). + + Caller is responsible for deleting the directory via delete_vsimem_file() + + Parameters + ---------- + path_or_fp : str or io.BytesIO object + driver : str + """ + + if not isinstance(path_or_fp, BytesIO): + return path_or_fp + + # Create in-memory directory to contain auxiliary files + memfilename = uuid4().hex + VSIMkdir(f"/vsimem/{memfilename}".encode("utf-8"), 0666) + + # file extension is required for some drivers, set it based on driver metadata + ext = '' + recommended_ext = _get_driver_metadata_item(driver, "DMD_EXTENSIONS") + if recommended_ext is not None: + ext = "." + recommended_ext.split(' ')[0] + + path = f"/vsimem/{memfilename}/{memfilename}{ext}" + + # check for existing bytes + if path_or_fp.getbuffer().nbytes > 0: + raise NotImplementedError("writing to existing in-memory object is not supported") + + return path + + +cdef read_vsimem_to_buffer(str path, object out_buffer): + """Copy bytes from in-memory file to buffer + + This will automatically unlink the in-memory file pointed to by path; caller + is still responsible for calling delete_vsimem_file() to cleanup any other + files contained in the in-memory directory. 
+ + Parameters: + ----------- + path : str + path to in-memory file + out_buffer : BytesIO object + """ + + cdef unsigned char *vsi_buffer = NULL + cdef vsi_l_offset vsi_buffer_size = 0 + + try: + # Take ownership of the buffer to avoid a copy; GDAL will automatically + # unlink the memory file + vsi_buffer = VSIGetMemFileBuffer(path.encode("UTF-8"), &vsi_buffer_size, 1) + if vsi_buffer == NULL: + raise RuntimeError("could not read bytes from in-memory file") + + # write bytes to buffer + out_buffer.write(vsi_buffer[:vsi_buffer_size]) + # rewind to beginning to allow caller to read + out_buffer.seek(0) + + finally: + if vsi_buffer != NULL: + CPLFree(vsi_buffer) + + +cdef delete_vsimem_file(str path): + """ Delete in-memory directory containing path + + Parameters: + ----------- + path : str + path to in-memory file + """ + VSIRmdirRecursive(str(Path(path).parent).encode("UTF-8")) + cdef create_ogr_dataset_layer( - str path, str layer, str driver, str crs, str geometry_type, str encoding, - object dataset_kwargs, object layer_kwargs, bint append, - dataset_metadata, layer_metadata, - OGRDataSourceH* ogr_dataset_out, OGRLayerH* ogr_layer_out, + str path, + bint is_vsi, + str layer, + str driver, + str crs, + str geometry_type, + str encoding, + object dataset_kwargs, + object layer_kwargs, + bint append, + dataset_metadata, + layer_metadata, + OGRDataSourceH* ogr_dataset_out, + OGRLayerH* ogr_layer_out, ): """ Construct the OGRDataSource and OGRLayer objects based on input @@ -2030,18 +2137,22 @@ cdef create_ogr_dataset_layer( driver_b = driver.encode('UTF-8') driver_c = driver_b + # in-memory dataset is always created from scratch + path_exists = os.path.exists(path) if not is_vsi else False + if not layer: layer = os.path.splitext(os.path.split(path)[1])[0] # if shapefile, GeoJSON, or FlatGeobuf, always delete first # for other types, check if we can create layers # GPKG might be the only multi-layer writeable type. 
TODO: check this - if driver in ('ESRI Shapefile', 'GeoJSON', 'GeoJSONSeq', 'FlatGeobuf') and os.path.exists(path): + if driver in ('ESRI Shapefile', 'GeoJSON', 'GeoJSONSeq', 'FlatGeobuf') and path_exists: if not append: os.unlink(path) + path_exists = False layer_exists = False - if os.path.exists(path): + if path_exists: try: ogr_dataset = ogr_open(path_c, 1, NULL) @@ -2063,7 +2174,11 @@ cdef create_ogr_dataset_layer( raise exc # otherwise create from scratch - os.unlink(path) + if is_vsi: + VSIUnlink(path_c) + else: + os.unlink(path) + ogr_dataset = NULL # either it didn't exist or could not open it in write mode @@ -2154,15 +2269,29 @@ cdef create_ogr_dataset_layer( ogr_dataset_out[0] = ogr_dataset ogr_layer_out[0] = ogr_layer + return create_layer # TODO: set geometry and field data as memory views? def ogr_write( - str path, str layer, str driver, geometry, fields, field_data, field_mask, - str crs, str geometry_type, str encoding, object dataset_kwargs, - object layer_kwargs, bint promote_to_multi=False, bint nan_as_null=True, - bint append=False, dataset_metadata=None, layer_metadata=None, + object path_or_fp, + str layer, + str driver, + geometry, + fields, + field_data, + field_mask, + str crs, + str geometry_type, + str encoding, + object dataset_kwargs, + object layer_kwargs, + bint promote_to_multi=False, + bint nan_as_null=True, + bint append=False, + dataset_metadata=None, + layer_metadata=None, gdal_tz_offsets=None ): cdef OGRDataSourceH ogr_dataset = NULL @@ -2179,6 +2308,7 @@ def ogr_write( cdef int num_records = -1 cdef int num_field_data = len(field_data) if field_data is not None else 0 cdef int num_fields = len(fields) if fields is not None else 0 + cdef bint is_vsi = False if num_fields != num_field_data: raise ValueError("field_data array needs to be same length as fields array") @@ -2218,9 +2348,13 @@ def ogr_write( gdal_tz_offsets = {} try: - ### Setup up dataset and layer + # Setup in-memory handler if needed + path = 
get_ogr_vsimem_write_path(path_or_fp, driver) + is_vsi = path.startswith('/vsimem/') + + # Setup dataset and layer layer_created = create_ogr_dataset_layer( - path, layer, driver, crs, geometry_type, encoding, + path, is_vsi, layer, driver, crs, geometry_type, encoding, dataset_kwargs, layer_kwargs, append, dataset_metadata, layer_metadata, &ogr_dataset, &ogr_layer, @@ -2418,6 +2552,16 @@ def ogr_write( log.info(f"Created {num_records:,} records" ) + # close dataset to force driver to flush data + exc = ogr_close(ogr_dataset) + ogr_dataset = NULL + if exc: + raise DataSourceError(f"Failed to write features to dataset {path}; {exc}") + + # copy in-memory file back to path_or_fp object + if is_vsi: + read_vsimem_to_buffer(path, path_or_fp) + finally: ### Final cleanup # make sure that all objects allocated above are released if exceptions @@ -2434,13 +2578,15 @@ def ogr_write( OGR_G_DestroyGeometry(ogr_geometry) ogr_geometry = NULL - exc = ogr_close(ogr_dataset) - if exc: - raise DataSourceError(f"Failed to write features to dataset {path}; {exc}") + if ogr_dataset != NULL: + ogr_close(ogr_dataset) + + if is_vsi: + delete_vsimem_file(path) def ogr_write_arrow( - str path, + object path_or_fp, str layer, str driver, object arrow_obj, @@ -2460,6 +2606,7 @@ def ogr_write_arrow( cdef OGRDataSourceH ogr_dataset = NULL cdef OGRLayerH ogr_layer = NULL cdef char **options = NULL + cdef bint is_vsi = False cdef ArrowArrayStream* stream = NULL cdef ArrowSchema schema cdef ArrowArray array @@ -2468,8 +2615,11 @@ def ogr_write_arrow( array.release = NULL try: + path = get_ogr_vsimem_write_path(path_or_fp, driver) + is_vsi = path.startswith('/vsimem/') + layer_created = create_ogr_dataset_layer( - path, layer, driver, crs, geometry_type, encoding, + path, is_vsi, layer, driver, crs, geometry_type, encoding, dataset_kwargs, layer_kwargs, append, dataset_metadata, layer_metadata, &ogr_dataset, &ogr_layer, @@ -2523,6 +2673,16 @@ def ogr_write_arrow( if array.release != NULL: 
array.release(&array) + # close dataset to force driver to flush data + exc = ogr_close(ogr_dataset) + ogr_dataset = NULL + if exc: + raise DataSourceError(f"Failed to write features to dataset {path}; {exc}") + + # copy in-memory file back to path_or_fp object + if is_vsi: + read_vsimem_to_buffer(path, path_or_fp) + finally: if stream != NULL and stream.release != NULL: stream.release(stream) @@ -2537,9 +2697,11 @@ def ogr_write_arrow( CSLDestroy(options) options = NULL - exc = ogr_close(ogr_dataset) - if exc: - raise DataSourceError(f"Failed to write features to dataset {path}; {exc}") + if ogr_dataset != NULL: + ogr_close(ogr_dataset) + + if is_vsi: + delete_vsimem_file(path) cdef get_arrow_extension_metadata(const ArrowSchema* schema): diff --git a/pyogrio/_ogr.pxd b/pyogrio/_ogr.pxd index 03b376a1..9c53c5c8 100644 --- a/pyogrio/_ogr.pxd +++ b/pyogrio/_ogr.pxd @@ -45,13 +45,23 @@ cdef extern from "cpl_string.h": cdef extern from "cpl_vsi.h" nogil: - + int VSI_STAT_EXISTS_FLAG + ctypedef int vsi_l_offset ctypedef FILE VSILFILE + ctypedef struct VSIStatBufL: + long st_size + long st_mode + int st_mtime + + int VSIFCloseL(VSILFILE *fp) + int VSIFFlushL(VSILFILE *fp) + int VSIUnlink(const char *path) + + VSILFILE *VSIFileFromMemBuffer(const char *path, void *data, vsi_l_offset data_len, int take_ownership) + unsigned char *VSIGetMemFileBuffer(const char *path, vsi_l_offset *data_len, int take_ownership) - VSILFILE *VSIFileFromMemBuffer(const char *path, void *data, - int data_len, int take_ownership) - int VSIFCloseL(VSILFILE *fp) - int VSIUnlink(const char *path) + int VSIMkdir(const char *path, long mode) + int VSIRmdirRecursive(const char *pszDirname) cdef extern from "ogr_core.h": diff --git a/pyogrio/_ogr.pyx b/pyogrio/_ogr.pyx index 55d19080..6bcd79f5 100644 --- a/pyogrio/_ogr.pyx +++ b/pyogrio/_ogr.pyx @@ -108,6 +108,14 @@ def ogr_driver_supports_write(driver): return False +def ogr_driver_supports_vsi(driver): + # check metadata for driver to see if it 
supports virtual IO + if _get_driver_metadata_item(driver, "DCAP_VIRTUALIO") == 'YES': + return True + + return False + + def ogr_list_drivers(): cdef OGRSFDriverH driver = NULL cdef int i diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py index 4c9cdf47..2b1bf654 100644 --- a/pyogrio/geopandas.py +++ b/pyogrio/geopandas.py @@ -6,10 +6,10 @@ from pyogrio.raw import ( DRIVERS_NO_MIXED_SINGLE_MULTI, DRIVERS_NO_MIXED_DIMENSIONS, - detect_write_driver, read, read_arrow, write, + _get_write_path_driver, ) from pyogrio.errors import DataSourceError import warnings @@ -349,13 +349,16 @@ def write_dataframe( all values will be converted to strings to be written to the output file, except None and np.nan, which will be set to NULL in the output file. - path : str - path to file + path : str or io.BytesIO + path to output file on writeable file system or an io.BytesIO object to + allow writing to memory + NOTE: support for writing to memory is limited to specific drivers. layer : str, optional (default: None) - layer name + layer name to create. If writing to memory and layer name is not + provided, the layer name will be set to a UUID4 value. driver : string, optional (default: None) - The OGR format driver used to write the vector file. By default write_dataframe - attempts to infer driver from path. + The OGR format driver used to write the vector file. By default attempts + to infer driver from path. Must be provided to write to memory. encoding : str, optional (default: None) If present, will be used as the encoding for writing string values to the file. Use with caution, only certain drivers support encodings @@ -391,7 +394,8 @@ def write_dataframe( append : bool, optional (default: False) If True, the data source specified by path already exists, and the driver supports appending to an existing data source, will cause the - data to be appended to the existing records in the data source. + data to be appended to the existing records in the data source. 
Not + supported for writing to in-memory files. NOTE: append support is limited to specific drivers and GDAL versions. use_arrow : bool, optional (default: False) Whether to use Arrow as the transfer mechanism of the data to write @@ -436,16 +440,12 @@ def write_dataframe( import pandas as pd from pyproj.enums import WktVersion # if geopandas is available so is pyproj - path = str(path) - if not isinstance(df, pd.DataFrame): raise ValueError("'df' must be a DataFrame or GeoDataFrame") if use_arrow is None: use_arrow = bool(int(os.environ.get("PYOGRIO_USE_ARROW", "0"))) - - if driver is None: - driver = detect_write_driver(path) + path, driver = _get_write_path_driver(path, driver, append=append) geometry_columns = df.columns[df.dtypes == "geometry"] if len(geometry_columns) > 1: diff --git a/pyogrio/raw.py b/pyogrio/raw.py index 948f3a95..d365e46c 100644 --- a/pyogrio/raw.py +++ b/pyogrio/raw.py @@ -1,3 +1,4 @@ +from io import BytesIO import warnings from pyogrio._env import GDALEnv @@ -18,6 +19,7 @@ get_gdal_version, get_gdal_version_string, ogr_driver_supports_write, + ogr_driver_supports_vsi, remove_virtual_file, ) @@ -527,6 +529,67 @@ def _preprocess_options_kwargs(driver, dataset_options, layer_options, kwargs): return dataset_kwargs, layer_kwargs +def _get_write_path_driver(path, driver, append=False): + """Validate and return path and driver + + Parameters + ---------- + path : str or io.BytesIO + path to output file on writeable file system or an io.BytesIO object to + allow writing to memory + driver : str, optional (default: None) + The OGR format driver used to write the vector file. By default attempts + to infer driver from path. Must be provided to write to a file-like + object. 
+ append : bool, optional (default: False) + True if path and driver is being tested for append support + + Returns + ------- + (path, driver) + """ + + if isinstance(path, BytesIO): + if driver is None: + raise ValueError("driver must be provided to write to in-memory file") + + # blacklist certain drivers known not to work in current memory implementation + # because they create multiple files + if driver in {"ESRI Shapefile", "OpenFileGDB"}: + raise ValueError(f"writing to in-memory file is not supported for {driver}") + + # verify that driver supports VSI methods + if not ogr_driver_supports_vsi(driver): + raise DataSourceError( + f"{driver} does not support ability to write in-memory in GDAL " + f"{get_gdal_version_string()}" + ) + + if append: + raise NotImplementedError("append is not supported for in-memory files") + + else: + path = vsi_path(str(path)) + + if driver is None: + driver = detect_write_driver(path) + + # verify that driver supports writing + if not ogr_driver_supports_write(driver): + raise DataSourceError( + f"{driver} does not support write functionality in GDAL " + f"{get_gdal_version_string()}" + ) + + # prevent segfault from: https://github.com/OSGeo/gdal/issues/5739 + if append and driver == "FlatGeobuf" and get_gdal_version() <= (3, 5, 0): + raise RuntimeError( + "append to FlatGeobuf is not supported for GDAL <= 3.5.0 due to segfault" + ) + + return path, driver + + def write( path, geometry, @@ -550,26 +613,90 @@ def write( gdal_tz_offsets=None, **kwargs, ): + """Write geometry and field data to an OGR file format. + + Parameters + ---------- + path : str or io.BytesIO + path to output file on writeable file system or an io.BytesIO object to + allow writing to memory + NOTE: support for writing to memory is limited to specific drivers. 
+ geometry : ndarray of WKB encoded geometries or None + If None, geometries will not be written to output file + field_data : list-like of shape (num_fields, num_records) + contains one record per field to be written in same order as fields + fields : list-like + contains field names + field_mask : list-like of ndarrays or None, optional (default: None) + contains mask arrays indicating null values of the field at the same + position in the outer list, or None to indicate field does not have + a mask array + layer : str, optional (default: None) + layer name to create. If writing to memory and layer name is not + provided, the layer name will be set to a UUID4 value. + driver : string, optional (default: None) + The OGR format driver used to write the vector file. By default attempts + to infer driver from path. Must be provided to write to memory. + geometry_type : str, optional (default: None) + Possible values are: "Unknown", "Point", "LineString", "Polygon", + "MultiPoint", "MultiLineString", "MultiPolygon" or "GeometryCollection". + + This parameter does not modify the geometry, but it will try to force + the layer type of the output file to this value. Use this parameter with + caution because using a wrong layer geometry type may result in errors + when writing the file, may be ignored by the driver, or may result in + invalid files. + crs : str, optional (default: None) + WKT-encoded CRS of the geometries to be written. + encoding : str, optional (default: None) + If present, will be used as the encoding for writing string values to + the file. Use with caution, only certain drivers support encodings + other than UTF-8. + promote_to_multi : bool, optional (default: None) + If True, will convert singular geometry types in the data to their + corresponding multi geometry type for writing. By default, will convert + mixed singular and multi geometry types to multi geometry types for + drivers that do not support mixed singular and multi geometry types. 
If + False, geometry types will not be promoted, which may result in errors + or invalid files when attempting to write mixed singular and multi + geometry types to drivers that do not support such combinations. + nan_as_null : bool, default True + For floating point columns (float32 / float64), whether NaN values are + written as "null" (missing value). Defaults to True because in pandas + NaNs are typically used as missing value. Note that when set to False, + behaviour is format specific: some formats don't support NaNs by + default (e.g. GeoJSON will skip this property) or might treat them as + null anyway (e.g. GeoPackage). + append : bool, optional (default: False) + If True, the data source specified by path already exists, and the + driver supports appending to an existing data source, will cause the + data to be appended to the existing records in the data source. Not + supported for writing to in-memory files. + NOTE: append support is limited to specific drivers and GDAL versions. + dataset_metadata : dict, optional (default: None) + Metadata to be stored at the dataset level in the output file; limited + to drivers that support writing metadata, such as GPKG, and silently + ignored otherwise. Keys and values must be strings. + layer_metadata : dict, optional (default: None) + Metadata to be stored at the layer level in the output file; limited to + drivers that support writing metadata, such as GPKG, and silently + ignored otherwise. Keys and values must be strings. + metadata : dict, optional (default: None) + alias of layer_metadata + dataset_options : dict, optional + Dataset creation options (format specific) passed to OGR. Specify as + a key-value dictionary. + layer_options : dict, optional + Layer creation options (format specific) passed to OGR. Specify as + a key-value dictionary. + gdal_tz_offsets : dict, optional (default: None) + Used to handle GDAL timezone offsets for each field contained in dict. 
+ """ # if dtypes is given, remove it from kwargs (dtypes is included in meta returned by # read, and it is convenient to pass meta directly into write for round trip tests) kwargs.pop("dtypes", None) - path = vsi_path(str(path)) - - if driver is None: - driver = detect_write_driver(path) - # verify that driver supports writing - if not ogr_driver_supports_write(driver): - raise DataSourceError( - f"{driver} does not support write functionality in GDAL " - f"{get_gdal_version_string()}" - ) - - # prevent segfault from: https://github.com/OSGeo/gdal/issues/5739 - if append and driver == "FlatGeobuf" and get_gdal_version() <= (3, 5, 0): - raise RuntimeError( - "append to FlatGeobuf is not supported for GDAL <= 3.5.0 due to segfault" - ) + path, driver = _get_write_path_driver(path, driver, append=append) dataset_metadata, layer_metadata = _validate_metadata( dataset_metadata, layer_metadata, metadata @@ -644,13 +771,16 @@ def write_arrow( object that implements the `Arrow PyCapsule Protocol`_ (i.e. has an ``__arrow_c_stream__`` method), for example a pyarrow Table or RecordBatchReader. - path : str - Path to file. + path : str or io.BytesIO + path to output file on writeable file system or an io.BytesIO object to + allow writing to memory + NOTE: support for writing to memory is limited to specific drivers. layer : str, optional (default: None) - layer name - driver : str, optional (default: None) - The OGR format driver used to write the vector file. By default write_arrow - attempts to infer driver from path. + layer name to create. If writing to memory and layer name is not + provided, the layer name will be set to a UUID4 value. + driver : string, optional (default: None) + The OGR format driver used to write the vector file. By default attempts + to infer driver from path. Must be provided to write to memory. geometry_name : str, optional (default: None) The name of the column in the input data that will be written as the geometry field. 
Will be inferred from the input data if the geometry @@ -672,7 +802,8 @@ def write_arrow( append : bool, optional (default: False) If True, the data source specified by path already exists, and the driver supports appending to an existing data source, will cause the - data to be appended to the existing records in the data source. + data to be appended to the existing records in the data source. Not + supported for writing to in-memory files. NOTE: append support is limited to specific drivers and GDAL versions. dataset_metadata : dict, optional (default: None) Metadata to be stored at the dataset level in the output file; limited @@ -710,17 +841,7 @@ def write_arrow( "'__arrow_c_stream__' method)." ) - path = vsi_path(str(path)) - - if driver is None: - driver = detect_write_driver(path) - - # verify that driver supports writing - if not ogr_driver_supports_write(driver): - raise DataSourceError( - f"{driver} does not support write functionality in GDAL " - f"{get_gdal_version_string()}" - ) + path, driver = _get_write_path_driver(path, driver, append=append) if "promote_to_multi" in kwargs: raise ValueError( diff --git a/pyogrio/tests/conftest.py b/pyogrio/tests/conftest.py index d0773a03..e5456fd8 100644 --- a/pyogrio/tests/conftest.py +++ b/pyogrio/tests/conftest.py @@ -114,7 +114,7 @@ def naturalearth_lowres_vsi(tmp_path, naturalearth_lowres): path = tmp_path / f"{naturalearth_lowres.name}.zip" with ZipFile(path, mode="w", compression=ZIP_DEFLATED, compresslevel=5) as out: - for ext in ["dbf", "prj", "shp", "shx"]: + for ext in ["dbf", "prj", "shp", "shx", "cpg"]: filename = f"{naturalearth_lowres.stem}.{ext}" out.write(naturalearth_lowres.parent / filename, filename) diff --git a/pyogrio/tests/test_arrow.py b/pyogrio/tests/test_arrow.py index 3ea4610b..db2f7b2f 100644 --- a/pyogrio/tests/test_arrow.py +++ b/pyogrio/tests/test_arrow.py @@ -1,4 +1,5 @@ import contextlib +from io import BytesIO import json import math import os @@ -155,6 +156,11 @@ def 
test_read_arrow_raw(naturalearth_lowres): assert isinstance(table, pyarrow.Table) +def test_read_arrow_vsi(naturalearth_lowres_vsi): + table = read_arrow(naturalearth_lowres_vsi[1])[1] + assert len(table) == 177 + + def test_open_arrow_pyarrow(naturalearth_lowres): with open_arrow(naturalearth_lowres, use_pyarrow=True) as (meta, reader): assert isinstance(meta, dict) @@ -802,6 +808,120 @@ def test_write_schema_error_message(tmpdir): ) +@requires_arrow_write_api +@pytest.mark.filterwarnings("ignore:File /vsimem:RuntimeWarning") +@pytest.mark.parametrize("driver", ["GeoJSON", "GPKG"]) +def test_write_memory(naturalearth_lowres, driver): + meta, table = read_arrow(naturalearth_lowres) + meta["geometry_type"] = "MultiPolygon" + + buffer = BytesIO() + write_arrow( + table.slice(0, 1), + buffer, + driver=driver, + layer="test", + crs=meta["crs"], + geometry_type=meta["geometry_type"], + geometry_name=meta["geometry_name"] or "wkb_geometry", + ) + + assert len(buffer.getbuffer()) > 0 + assert list_layers(buffer)[0][0] == "test" + + # TODO: enable; not yet working via Arrow + # actual_meta, actual_table = read_arrow(buffer) + # assert len(actual_table) == len(table) + # assert np.array_equal(actual_meta["fields"], meta["fields"]) + + +@requires_arrow_write_api +def test_write_memory_driver_required(naturalearth_lowres): + meta, table = read_arrow(naturalearth_lowres) + + buffer = BytesIO() + with pytest.raises( + ValueError, + match="driver must be provided to write to in-memory file", + ): + write_arrow( + table.slice(0, 1), + buffer, + driver=None, + layer="test", + crs=meta["crs"], + geometry_type=meta["geometry_type"], + geometry_name=meta["geometry_name"] or "wkb_geometry", + ) + + +@requires_arrow_write_api +@pytest.mark.parametrize("driver", ["ESRI Shapefile", "OpenFileGDB"]) +def test_write_memory_unsupported_driver(naturalearth_lowres, driver): + if driver == "OpenFileGDB" and __gdal_version__ < (3, 6, 0): + pytest.skip("OpenFileGDB write support only available 
for GDAL >= 3.6.0") + + meta, table = read_arrow(naturalearth_lowres) + + buffer = BytesIO() + + with pytest.raises( + ValueError, match=f"writing to in-memory file is not supported for {driver}" + ): + write_arrow( + table.slice(0, 1), + buffer, + driver=driver, + layer="test", + crs=meta["crs"], + geometry_type=meta["geometry_type"], + geometry_name=meta["geometry_name"] or "wkb_geometry", + ) + + +@requires_arrow_write_api +@pytest.mark.parametrize("driver", ["GeoJSON", "GPKG"]) +def test_write_memory_append_unsupported(naturalearth_lowres, driver): + meta, table = read_arrow(naturalearth_lowres) + meta["geometry_type"] = "MultiPolygon" + + buffer = BytesIO() + with pytest.raises( + NotImplementedError, match="append is not supported for in-memory files" + ): + write_arrow( + table.slice(0, 1), + buffer, + driver=driver, + layer="test", + crs=meta["crs"], + geometry_type=meta["geometry_type"], + geometry_name=meta["geometry_name"] or "wkb_geometry", + append=True, + ) + + +@requires_arrow_write_api +def test_write_memory_existing_unsupported(naturalearth_lowres): + meta, table = read_arrow(naturalearth_lowres) + meta["geometry_type"] = "MultiPolygon" + + buffer = BytesIO(b"0000") + with pytest.raises( + NotImplementedError, + match="writing to existing in-memory object is not supported", + ): + write_arrow( + table.slice(0, 1), + buffer, + driver="GeoJSON", + layer="test", + crs=meta["crs"], + geometry_type=meta["geometry_type"], + geometry_name=meta["geometry_name"] or "wkb_geometry", + ) + + @requires_arrow_write_api def test_non_utf8_encoding_io_shapefile(tmp_path, encoded_text): encoding, text = encoded_text diff --git a/pyogrio/tests/test_core.py b/pyogrio/tests/test_core.py index 331470aa..4f52b4cf 100644 --- a/pyogrio/tests/test_core.py +++ b/pyogrio/tests/test_core.py @@ -467,8 +467,10 @@ def test_read_info_force_feature_count(data_dir, layer, force, expected): def test_read_info_force_total_bounds( tmpdir, naturalearth_lowres, force_total_bounds, 
expected_total_bounds ): - # Geojson files don't hava a fast way to determine total_bounds - geojson_path = prepare_testfile(naturalearth_lowres, dst_dir=tmpdir, ext=".geojson") + geojson_path = prepare_testfile( + naturalearth_lowres, dst_dir=tmpdir, ext=".geojsonl" + ) + info = read_info(geojson_path, force_total_bounds=force_total_bounds) if expected_total_bounds is not None: assert allclose(info["total_bounds"], expected_total_bounds) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 3f613abc..ba528497 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -1,5 +1,6 @@ import contextlib from datetime import datetime +from io import BytesIO import locale import os @@ -136,8 +137,8 @@ def test_read_dataframe(naturalearth_lowres_all_ext): ] -def test_read_dataframe_vsi(naturalearth_lowres_vsi): - df = read_dataframe(naturalearth_lowres_vsi[1]) +def test_read_dataframe_vsi(naturalearth_lowres_vsi, use_arrow): + df = read_dataframe(naturalearth_lowres_vsi[1], use_arrow=use_arrow) assert len(df) == 177 @@ -1856,6 +1857,80 @@ def test_arrow_bool_exception(tmpdir, ext): _ = read_dataframe(filename, use_arrow=True) +@pytest.mark.filterwarnings("ignore:File /vsimem:RuntimeWarning") +@pytest.mark.parametrize("driver", ["GeoJSON", "GPKG"]) +def test_write_memory(naturalearth_lowres, driver): + df = read_dataframe(naturalearth_lowres) + + buffer = BytesIO() + write_dataframe(df, buffer, driver=driver, layer="test") + + assert len(buffer.getbuffer()) > 0 + + actual = read_dataframe(buffer) + assert len(actual) == len(df) + + is_json = driver == "GeoJSON" + + assert_geodataframe_equal( + actual, + df, + check_less_precise=is_json, + check_index_type=False, + check_dtype=not is_json, + ) + + +def test_write_memory_driver_required(naturalearth_lowres): + df = read_dataframe(naturalearth_lowres) + + buffer = BytesIO() + + with pytest.raises( + ValueError, + match="driver must be provided to write to 
in-memory file", + ): + write_dataframe(df.head(1), buffer, driver=None, layer="test") + + +@pytest.mark.parametrize("driver", ["ESRI Shapefile", "OpenFileGDB"]) +def test_write_memory_unsupported_driver(naturalearth_lowres, driver): + if driver == "OpenFileGDB" and __gdal_version__ < (3, 6, 0): + pytest.skip("OpenFileGDB write support only available for GDAL >= 3.6.0") + + df = read_dataframe(naturalearth_lowres) + + buffer = BytesIO() + + with pytest.raises( + ValueError, match=f"writing to in-memory file is not supported for {driver}" + ): + write_dataframe(df, buffer, driver=driver, layer="test") + + +@pytest.mark.parametrize("driver", ["GeoJSON", "GPKG"]) +def test_write_memory_append_unsupported(naturalearth_lowres, driver): + df = read_dataframe(naturalearth_lowres) + + buffer = BytesIO() + + with pytest.raises( + NotImplementedError, match="append is not supported for in-memory files" + ): + write_dataframe(df.head(1), buffer, driver=driver, layer="test", append=True) + + +def test_write_memory_existing_unsupported(naturalearth_lowres): + df = read_dataframe(naturalearth_lowres) + + buffer = BytesIO(b"0000") + with pytest.raises( + NotImplementedError, + match="writing to existing in-memory object is not supported", + ): + write_dataframe(df.head(1), buffer, driver="GeoJSON", layer="test") + + @pytest.mark.parametrize("ext", ["gpkg", "geojson"]) def test_non_utf8_encoding_io(tmp_path, ext, encoded_text): """Verify that we write non-UTF data to the data source diff --git a/pyogrio/tests/test_path.py b/pyogrio/tests/test_path.py index 95746b79..c1417758 100644 --- a/pyogrio/tests/test_path.py +++ b/pyogrio/tests/test_path.py @@ -1,4 +1,5 @@ import os +from pathlib import Path import contextlib from zipfile import ZipFile, ZIP_DEFLATED @@ -6,7 +7,7 @@ import pyogrio import pyogrio.raw -from pyogrio.util import vsi_path +from pyogrio.util import vsi_path, get_vsi_path try: import geopandas # NOQA @@ -330,3 +331,18 @@ def test_uri_s3(aws_env_setup): def 
test_uri_s3_dataframe(aws_env_setup): df = pyogrio.read_dataframe("zip+s3://fiona-testing/coutwildrnp.zip") assert len(df) == 67 + + +def test_get_vsi_path_obj_to_string(): + path = Path("/tmp/test.gpkg") + assert get_vsi_path(path) == (str(path), None) + + +def test_get_vsi_path_fixtures_to_string(tmpdir, tmp_path): + # tmpdir uses a private class LocalPath in pytest so we have to test it using + # the fixture instead of making an instance + path = tmpdir / "test.gpkg" + assert get_vsi_path(path) == (str(path), None) + + path = tmp_path / "test.gpkg" + assert get_vsi_path(path) == (str(path), None) diff --git a/pyogrio/tests/test_raw_io.py b/pyogrio/tests/test_raw_io.py index 915c9438..36737bfa 100644 --- a/pyogrio/tests/test_raw_io.py +++ b/pyogrio/tests/test_raw_io.py @@ -1,5 +1,6 @@ import contextlib import ctypes +from io import BytesIO import json import os import sys @@ -1061,6 +1062,91 @@ def test_write_float_nan_null_arrow(tmp_path): assert pc.is_nan(table["col"]).to_pylist() == [False, True] +@pytest.mark.filterwarnings("ignore:File /vsimem:RuntimeWarning") +@pytest.mark.parametrize("driver", ["GeoJSON", "GPKG"]) +def test_write_memory(naturalearth_lowres, driver): + meta, _, geometry, field_data = read(naturalearth_lowres) + meta.update({"geometry_type": "MultiPolygon"}) + + buffer = BytesIO() + write(buffer, geometry, field_data, driver=driver, layer="test", **meta) + + assert len(buffer.getbuffer()) > 0 + assert list_layers(buffer)[0][0] == "test" + + actual_meta, _, actual_geometry, actual_field_data = read(buffer) + + assert np.array_equal(actual_meta["fields"], meta["fields"]) + assert np.array_equal(actual_field_data, field_data) + assert len(actual_geometry) == len(geometry) + + +def test_write_memory_driver_required(naturalearth_lowres): + meta, _, geometry, field_data = read(naturalearth_lowres) + + buffer = BytesIO() + with pytest.raises( + ValueError, + match="driver must be provided to write to in-memory file", + ): + write(buffer, geometry, 
field_data, driver=None, layer="test", **meta) + + +@pytest.mark.parametrize("driver", ["ESRI Shapefile", "OpenFileGDB"]) +def test_write_memory_unsupported_driver(naturalearth_lowres, driver): + if driver == "OpenFileGDB" and __gdal_version__ < (3, 6, 0): + pytest.skip("OpenFileGDB write support only available for GDAL >= 3.6.0") + + meta, _, geometry, field_data = read(naturalearth_lowres) + + buffer = BytesIO() + + with pytest.raises( + ValueError, match=f"writing to in-memory file is not supported for {driver}" + ): + write( + buffer, + geometry, + field_data, + driver=driver, + layer="test", + append=True, + **meta, + ) + + +@pytest.mark.parametrize("driver", ["GeoJSON", "GPKG"]) +def test_write_memory_append_unsupported(naturalearth_lowres, driver): + meta, _, geometry, field_data = read(naturalearth_lowres) + meta.update({"geometry_type": "MultiPolygon"}) + + buffer = BytesIO() + + with pytest.raises( + NotImplementedError, match="append is not supported for in-memory files" + ): + write( + buffer, + geometry, + field_data, + driver=driver, + layer="test", + append=True, + **meta, + ) + + +def test_write_memory_existing_unsupported(naturalearth_lowres): + meta, _, geometry, field_data = read(naturalearth_lowres) + + buffer = BytesIO(b"0000") + with pytest.raises( + NotImplementedError, + match="writing to existing in-memory object is not supported", + ): + write(buffer, geometry, field_data, driver="GeoJSON", layer="test", **meta) + + @pytest.mark.parametrize("ext", ["fgb", "gpkg", "geojson"]) @pytest.mark.parametrize( "read_encoding,write_encoding", diff --git a/pyogrio/util.py b/pyogrio/util.py index 12b5ae10..6bd15f7f 100644 --- a/pyogrio/util.py +++ b/pyogrio/util.py @@ -1,3 +1,4 @@ +from pathlib import Path import re import sys from urllib.parse import urlparse @@ -11,8 +12,22 @@ def get_vsi_path(path_or_buffer): + # force path objects to string to specifically ignore their read method + if ( + isinstance(path_or_buffer, Path) + # TODO: check for pytest 
LocalPath can be removed when all instances of tmpdir in fixtures are removed + or "_pytest._py.path.LocalPath" in str(type(path_or_buffer)) + ): + path_or_buffer = str(path_or_buffer) + if hasattr(path_or_buffer, "read"): - path_or_buffer = path_or_buffer.read() + bytes_read = path_or_buffer.read() + + # rewind buffer if possible so that subsequent operations do not need to rewind + if hasattr(path_or_buffer, "seek"): + path_or_buffer.seek(0) + + path_or_buffer = bytes_read buffer = None if isinstance(path_or_buffer, bytes):