diff --git a/.github/workflows/tests-conda.yml b/.github/workflows/tests-conda.yml
index 097692c4..efcb18d4 100644
--- a/.github/workflows/tests-conda.yml
+++ b/.github/workflows/tests-conda.yml
@@ -66,4 +66,4 @@ jobs:
     - name: Test
       run: |
-        pytest -v -r s pyogrio/tests
+        pytest -v --color=yes -r s pyogrio/tests
diff --git a/CHANGES.md b/CHANGES.md
index 664172d8..5b5c5caf 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -4,6 +4,7 @@

 ### Improvements

+- Support reading and writing datetimes with timezones (#253).
 - Support writing dataframes without geometry column (#267).
 - Calculate feature count by iterating over features if GDAL returns an unknown
   count for a data layer (e.g., OSM driver); this may have significant
diff --git a/docs/source/introduction.md b/docs/source/introduction.md
index 3fd0c324..e7ac8cf1 100644
--- a/docs/source/introduction.md
+++ b/docs/source/introduction.md
@@ -464,6 +464,27 @@ You can also read from a URL with this syntax:
 >>> read_dataframe("zip+https://s3.amazonaws.com/bucket/shapefile.zip")
 ```

+## Reading and writing DateTimes
+
+GDAL only supports datetimes at millisecond resolution, so reading data will
+give at most millisecond resolution (`datetime64[ms]` data type). With pandas 2.0,
+`pyogrio.read_dataframe()` will return datetime data as `datetime64[ms]`
+accordingly. For earlier versions of pandas, `datetime64[ns]` is used, as
+ms precision was not supported. When writing, only precision up to
+ms is retained.
+
+Not all file formats have dedicated support for storing datetime data; ESRI
+Shapefile, for example, does not. For such formats, or if you require precision
+greater than ms, a workaround is to convert the datetimes to string.
+
+Timezone information is preserved where possible; however, GDAL only represents
+time zones as UTC offsets, whereas pandas uses IANA time zones (via `pytz` or
+`zoneinfo`). This means that dataframes with columns containing multiple offsets
+(e.g. when switching from standard time to summer time) will be written correctly,
+but when read back via `pyogrio.read_dataframe()` they will be returned as a UTC
+datetime column, as there is no way to reconstruct the original timezone from the
+individual offsets present.
+
 ## Dataset and layer creation options

 It is possible to use dataset and layer creation options available for a given
diff --git a/docs/source/known_issues.md b/docs/source/known_issues.md
index 3d84215e..29d390ee 100644
--- a/docs/source/known_issues.md
+++ b/docs/source/known_issues.md
@@ -52,20 +52,6 @@ Pyogrio does not currently validate attribute values or geometry types before
 attempting to write to the output file. Invalid types may crash during
 writing with obscure error messages.

-## Support for reading and writing DateTimes
-
-GDAL only supports datetimes at a millisecond resolution. Reading data will thus
-give at most millisecond resolution (`datetime64[ms]` data type), even though
-the data is cast `datetime64[ns]` data type when reading into a data frame
-using `pyogrio.read_dataframe()`. When writing, only precision up to ms is retained.
-
-Not all file formats have dedicated support to store datetime data, like ESRI
-Shapefile. For such formats, or if you require precision > ms, a workaround is to
-convert the datetimes to string.
-
-Timezone information is ignored at the moment, both when reading and when writing
-datetime columns.
-
 ## Support for OpenStreetMap (OSM) data

 OpenStreetMap data do not natively support calculating the feature count by data
diff --git a/pyogrio/_compat.py b/pyogrio/_compat.py
index 596abeb0..9e932f3f 100644
--- a/pyogrio/_compat.py
+++ b/pyogrio/_compat.py
@@ -18,11 +18,18 @@
 except ImportError:
     geopandas = None

+try:
+    import pandas
+except ImportError:
+    pandas = None
+
 HAS_ARROW_API = __gdal_version__ >= (3, 6, 0) and pyarrow is not None

 HAS_GEOPANDAS = geopandas is not None

+PANDAS_GE_20 = pandas is not None and Version(pandas.__version__) >= Version("2.0.0")
+
 HAS_GDAL_GEOS = __gdal_geos_version__ is not None

 HAS_SHAPELY = shapely is not None and Version(shapely.__version__) >= Version("2.0.0")
diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx
index a9402536..12fb85a3 100644
--- a/pyogrio/_io.pyx
+++ b/pyogrio/_io.pyx
@@ -724,7 +724,8 @@ cdef process_fields(
    object field_data_view,
    object field_indexes,
    object field_ogr_types,
-    encoding
+    encoding,
+    bint datetime_as_string
 ):
    cdef int j
    cdef int success
@@ -756,7 +757,7 @@ cdef process_fields(
            else:
                data[i] = np.nan

-        elif field_type in ( OFTDate, OFTDateTime):
+        elif field_type in ( OFTDate, OFTDateTime) and not datetime_as_string:
            data[i] = np.datetime64('NaT')

        else:
@@ -782,22 +783,28 @@ cdef process_fields(
            data[i] = bin_value[:ret_length]

        elif field_type == OFTDateTime or field_type == OFTDate:
-            success = OGR_F_GetFieldAsDateTimeEx(
-                ogr_feature, field_index, &year, &month, &day, &hour, &minute, &fsecond, &timezone)
+
+            if datetime_as_string:
+                # defer datetime parsing to the user / pandas layer
+                # switch to OGR_F_GetFieldAsISO8601DateTime once GDAL 3.7+ is the minimum
+                data[i] = get_string(OGR_F_GetFieldAsString(ogr_feature, field_index), encoding=encoding)
+            else:
+                success = OGR_F_GetFieldAsDateTimeEx(
+                    ogr_feature, field_index, &year, &month, &day, &hour, &minute, &fsecond, &timezone)

-            ms, ss = math.modf(fsecond)
-            second = int(ss)
-            # fsecond has millisecond accuracy
-            microsecond = round(ms * 1000) * 1000
+                ms, ss = math.modf(fsecond)
+                second = int(ss)
+                # fsecond has millisecond accuracy
+                microsecond = round(ms * 1000) * 1000

-            if not success:
-                data[i] = np.datetime64('NaT')
+                if not success:
+                    data[i] = np.datetime64('NaT')

-            elif field_type == OFTDate:
-                data[i] = datetime.date(year, month, day).isoformat()
+                elif field_type == OFTDate:
+                    data[i] = datetime.date(year, month, day).isoformat()

-            elif field_type == OFTDateTime:
-                data[i] = datetime.datetime(year, month, day, hour, minute, second, microsecond).isoformat()
+                elif field_type == OFTDateTime:
+                    data[i] = datetime.datetime(year, month, day, hour, minute, second, microsecond).isoformat()


 @cython.boundscheck(False)  # Deactivate bounds checking
@@ -810,7 +817,8 @@ cdef get_features(
    uint8_t force_2d,
    int skip_features,
    int num_features,
-    uint8_t return_fids
+    uint8_t return_fids,
+    bint datetime_as_string
 ):

    cdef OGRFeatureH ogr_feature = NULL
@@ -843,7 +851,9 @@ cdef get_features(
    field_data = [
        np.empty(shape=(num_features, ),
-        dtype=fields[field_index,3]) for field_index in range(n_fields)
+        dtype=("object" if datetime_as_string and
+               fields[field_index,3].startswith("datetime") else fields[field_index,3])
+        ) for field_index in range(n_fields)
    ]

    field_data_view = [field_data[field_index][:] for field_index in range(n_fields)]
@@ -884,7 +894,7 @@ cdef get_features(
                process_fields(
                    ogr_feature, i, n_fields, field_data, field_data_view,
-                    field_indexes, field_ogr_types, encoding
+                    field_indexes, field_ogr_types, encoding, datetime_as_string
                )
                i += 1
    finally:
@@
-914,7 +924,8 @@ cdef get_features_by_fid(
    object[:,:] fields,
    encoding,
    uint8_t read_geometry,
-    uint8_t force_2d
+    uint8_t force_2d,
+    bint datetime_as_string
 ):

    cdef OGRFeatureH ogr_feature = NULL
@@ -937,10 +948,11 @@ cdef get_features_by_fid(
    n_fields = fields.shape[0]
    field_indexes = fields[:,0]
    field_ogr_types = fields[:,1]
-
    field_data = [
        np.empty(shape=(count, ),
-        dtype=fields[field_index,3]) for field_index in range(n_fields)
+        dtype=("object" if datetime_as_string and fields[field_index,3].startswith("datetime")
+               else fields[field_index,3]))
+        for field_index in range(n_fields)
    ]

    field_data_view = [field_data[field_index][:] for field_index in range(n_fields)]
@@ -963,7 +975,7 @@ cdef get_features_by_fid(
            process_fields(
                ogr_feature, i, n_fields, field_data, field_data_view,
-                field_indexes, field_ogr_types, encoding
+                field_indexes, field_ogr_types, encoding, datetime_as_string
            )
    finally:
        if ogr_feature != NULL:
@@ -1063,7 +1075,9 @@ def ogr_read(
    object fids=None,
    str sql=None,
    str sql_dialect=None,
-    int return_fids=False):
+    int return_fids=False,
+    bint datetime_as_string=False
+    ):

    cdef int err = 0
    cdef const char *path_c = NULL
@@ -1161,6 +1175,7 @@ def ogr_read(
            encoding,
            read_geometry=read_geometry and geometry_type is not None,
            force_2d=force_2d,
+            datetime_as_string=datetime_as_string
        )

        # bypass reading fids since these should match fids used for read
@@ -1193,13 +1208,15 @@ def ogr_read(
            force_2d=force_2d,
            skip_features=skip_features,
            num_features=num_features,
-            return_fids=return_fids
+            return_fids=return_fids,
+            datetime_as_string=datetime_as_string
        )

    meta = {
        'crs': crs,
        'encoding': encoding,
        'fields': fields[:,2],  # return only names
+        'dtypes': fields[:,3],
        'geometry_type': geometry_type,
    }
@@ -1667,7 +1684,8 @@ def ogr_write(
    str path, str layer, str driver, geometry, fields, field_data, field_mask,
    str crs, str geometry_type, str encoding, object dataset_kwargs,
    object layer_kwargs, bint promote_to_multi=False, bint nan_as_null=True,
-    bint append=False, dataset_metadata=None, layer_metadata=None
+    bint append=False, dataset_metadata=None, layer_metadata=None,
+    gdal_tz_offsets=None
 ):
    cdef const char *path_c = NULL
    cdef const char *layer_c = NULL
@@ -1738,6 +1756,9 @@ def ogr_write(
    if not layer:
        layer = os.path.splitext(os.path.split(path)[1])[0]

+    if gdal_tz_offsets is None:
+        gdal_tz_offsets = {}
+
    # if shapefile, GeoJSON, or FlatGeobuf, always delete first
    # for other types, check if we can create layers

@@ -2010,8 +2031,12 @@ def ogr_write(
                    if np.isnat(field_value):
                        OGR_F_SetFieldNull(ogr_feature, field_idx)
                    else:
-                        # TODO: add support for timezones
                        datetime = field_value.astype("datetime64[ms]").item()
+                        tz_array = gdal_tz_offsets.get(fields[field_idx], None)
+                        if tz_array is None:
+                            gdal_tz = 0
+                        else:
+                            gdal_tz = tz_array[i]
                        OGR_F_SetFieldDateTimeEx(
                            ogr_feature,
                            field_idx,
@@ -2021,7 +2046,7 @@
                            datetime.hour,
                            datetime.minute,
                            datetime.second + datetime.microsecond / 10**6,
-                            0
+                            gdal_tz
                        )

                else:
diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py
index 0c72a55a..a0f30f3a 100644
--- a/pyogrio/geopandas.py
+++ b/pyogrio/geopandas.py
@@ -2,7 +2,7 @@

 import numpy as np

-from pyogrio._compat import HAS_GEOPANDAS
+from pyogrio._compat import HAS_GEOPANDAS, PANDAS_GE_20
 from pyogrio.raw import (
    DRIVERS_NO_MIXED_SINGLE_MULTI,
    DRIVERS_NO_MIXED_DIMENSIONS,
@@ -12,6 +12,7 @@
    write,
 )
 from pyogrio.errors import DataSourceError
+import warnings


 def _stringify_path(path):
@@ -29,6 +30,40 @@ def _stringify_path(path):
    return path


+def
_try_parse_datetime(ser):
+    import pandas as pd  # only called when pandas is known to be installed
+
+    if PANDAS_GE_20:
+        datetime_kwargs = dict(format="ISO8601", errors="ignore")
+    else:
+        datetime_kwargs = dict(yearfirst=True)
+    with warnings.catch_warnings():
+        warnings.filterwarnings(
+            "ignore",
+            ".*parsing datetimes with mixed time zones will raise.*",
+            FutureWarning,
+        )
+        # pre-emptive try/except for when pandas will raise
+        # (can tighten the exception type in future when it does)
+        try:
+            res = pd.to_datetime(ser, **datetime_kwargs)
+        except Exception:
+            res = ser
+    # if object dtype, try parse as utc instead
+    if res.dtype == "object":
+        res = pd.to_datetime(ser, utc=True, **datetime_kwargs)
+
+    if res.dtype != "object":
+        # GDAL only supports ms precision; convert outputs to match.
+        # Pandas 2.0 supports datetime64[ms] directly; for prior versions, which
+        # only support [ns], round the values to [ms] precision instead.
+        if PANDAS_GE_20:
+            res = res.dt.as_unit("ms")
+        else:
+            res = res.dt.round(freq="ms")
+    return res
+
+
 def read_dataframe(
    path_or_buffer,
    /,
@@ -196,6 +231,11 @@
    read_func = read_arrow if use_arrow else read
    gdal_force_2d = False if use_arrow else force_2d
+    if not use_arrow:
+        # For arrow, datetimes are read as-is.
+        # For numpy IO, datetimes are read as string values to preserve timezone
+        # info, as numpy does not directly support timezones.
+        kwargs["datetime_as_string"] = True
    result = read_func(
        path_or_buffer,
        layer=layer,
@@ -250,8 +290,10 @@
            index = pd.Index(index, name="fid")
        else:
            index = None
-
        df = pd.DataFrame(data, columns=columns, index=index)
+        for dtype, c in zip(meta["dtypes"], df.columns):
+            if dtype.startswith("datetime"):
+                df[c] = _try_parse_datetime(df[c])

    if geometry is None or not read_geometry:
        return df
@@ -393,19 +435,37 @@
    # TODO: may need to fill in pd.NA, etc
    field_data = []
    field_mask = []
+    # dict[str, np.array(int)]; special case for tz-aware datetime fields
+    gdal_tz_offsets = {}
    for name in fields:
-        col = df[name].values
-        if isinstance(col, pd.api.extensions.ExtensionArray):
+        col = df[name]
+        if isinstance(col.dtype, pd.DatetimeTZDtype):
+            # Deal with datetimes with timezones by passing the timezone down
+            # separately and passing down naive datetime values
+            naive = col.dt.tz_localize(None)
+            values = naive.values
+            # compute offset relative to UTC explicitly
+            tz_offset = naive - col.dt.tz_convert("UTC").dt.tz_localize(None)
+            # Convert to GDAL timezone offset representation.
+            # GMT is represented as 100 and offsets are represented by adding /
+            # subtracting 1 for every 15 minutes different from GMT.
+ # https://gdal.org/development/rfc/rfc56_millisecond_precision.html#core-changes + # Convert each row offset to a signed multiple of 15m and add to GMT value + gdal_offset_representation = tz_offset // pd.Timedelta("15m") + 100 + gdal_tz_offsets[name] = gdal_offset_representation + else: + values = col.values + if isinstance(values, pd.api.extensions.ExtensionArray): from pandas.arrays import IntegerArray, FloatingArray, BooleanArray - if isinstance(col, (IntegerArray, FloatingArray, BooleanArray)): - field_data.append(col._data) - field_mask.append(col._mask) + if isinstance(values, (IntegerArray, FloatingArray, BooleanArray)): + field_data.append(values._data) + field_mask.append(values._mask) else: - field_data.append(np.asarray(col)) - field_mask.append(np.asarray(col.isna())) + field_data.append(np.asarray(values)) + field_mask.append(np.asarray(values.isna())) else: - field_data.append(col) + field_data.append(values) field_mask.append(None) # Determine geometry_type and/or promote_to_multi @@ -500,5 +560,6 @@ def write_dataframe( metadata=metadata, dataset_options=dataset_options, layer_options=layer_options, + gdal_tz_offsets=gdal_tz_offsets, **kwargs, ) diff --git a/pyogrio/raw.py b/pyogrio/raw.py index b19484fc..4384ee9a 100644 --- a/pyogrio/raw.py +++ b/pyogrio/raw.py @@ -49,6 +49,7 @@ def read( sql=None, sql_dialect=None, return_fids=False, + datetime_as_string=False, **kwargs, ): """Read OGR data source into numpy arrays. @@ -141,6 +142,11 @@ def read( return_fids : bool, optional (default: False) If True, will return the FIDs of the feature that were read. + datetime_as_string : bool, optional (default: False) + If True, will return datetime dtypes as detected by GDAL as a string + array (which can be used to extract timezone info), instead of + a datetime64 array. + **kwargs Additional driver-specific dataset open options passed to OGR. Invalid options will trigger a warning. 
@@ -158,6 +164,7 @@ def read(
    Meta is: {
        "crs": "<crs>",
        "fields": <ndarray of field names>,
+        "dtypes": <ndarray of numpy dtypes corresponding to fields>,
        "encoding": "<encoding>",
        "geometry_type": "<geometry type>"
    }
@@ -201,6 +208,7 @@ def read(
            sql_dialect=sql_dialect,
            return_fids=return_fids,
            dataset_kwargs=dataset_kwargs,
+            datetime_as_string=datetime_as_string,
        )
    finally:
        if buffer is not None:
@@ -448,8 +456,12 @@ def write(
    metadata=None,
    dataset_options=None,
    layer_options=None,
+    gdal_tz_offsets=None,
    **kwargs,
 ):
+    # if dtypes is given, remove it from kwargs (dtypes is included in meta returned
+    # by read, and it is convenient to pass meta directly into write for round-trip tests)
+    kwargs.pop("dtypes", None)
    path = vsi_path(str(path))

    if driver is None:
@@ -533,4 +545,5 @@ def write(
        layer_metadata=layer_metadata,
        dataset_kwargs=dataset_kwargs,
        layer_kwargs=layer_kwargs,
+        gdal_tz_offsets=gdal_tz_offsets,
    )
diff --git a/pyogrio/tests/conftest.py b/pyogrio/tests/conftest.py
index 775004ea..76327b4f 100644
--- a/pyogrio/tests/conftest.py
+++ b/pyogrio/tests/conftest.py
@@ -128,3 +128,8 @@ def test_ogr_types_list():
 @pytest.fixture(scope="session")
 def test_datetime():
    return _data_dir / "test_datetime.geojson"
+
+
+@pytest.fixture(scope="session")
+def test_datetime_tz():
+    return _data_dir / "test_datetime_tz.geojson"
diff --git a/pyogrio/tests/fixtures/test_datetime_tz.geojson b/pyogrio/tests/fixtures/test_datetime_tz.geojson
new file mode 100644
index 00000000..e6b39206
--- /dev/null
+++ b/pyogrio/tests/fixtures/test_datetime_tz.geojson
@@ -0,0 +1,8 @@
+{
+"type": "FeatureCollection",
+"crs": { "type": "name", "properties": { "name": "urn:ogc:def:crs:OGC:1.3:CRS84" } },
+"features": [
+{ "type": "Feature", "properties": { "datetime_col": "2020-01-01T09:00:00.123-05:00" }, "geometry": { "type": "Point", "coordinates": [ 1.0, 1.0 ] } },
+{ "type": "Feature", "properties": { "datetime_col": "2020-01-01T10:00:00-05:00" }, "geometry": { "type": "Point", "coordinates": [ 2.0, 2.0 ] } }
+]
+}
diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py
index 1fe69e51..1c7ddfc5 100644
--- a/pyogrio/tests/test_geopandas_io.py
+++ b/pyogrio/tests/test_geopandas_io.py
@@ -1,14 +1,12 @@
 import contextlib
 from datetime import datetime
 import os
-from packaging.version import Version
-
 import numpy as np
 import pytest

 from pyogrio import list_layers, read_info, __gdal_version__
 from pyogrio.errors import DataLayerError, DataSourceError, FeatureError, GeometryError
-from pyogrio.geopandas import read_dataframe, write_dataframe
+from pyogrio.geopandas import read_dataframe, write_dataframe, PANDAS_GE_20
 from pyogrio.raw import (
    DRIVERS_NO_MIXED_DIMENSIONS,
    DRIVERS_NO_MIXED_SINGLE_MULTI,
@@ -22,13 +20,18 @@
 try:
    import pandas as pd
-    from pandas.testing import assert_frame_equal, assert_index_equal
+    from pandas.testing import (
+        assert_frame_equal,
+        assert_index_equal,
+        assert_series_equal,
+    )

    import geopandas as gp
    from geopandas.array import from_wkt
    from geopandas.testing import assert_geodataframe_equal

    import shapely  # if geopandas is present, shapely is expected to be present
+    from shapely.geometry import Point
 except ImportError:
    pass
@@ -175,13 +178,67 @@ def test_read_datetime(test_fgdb_vsi, use_arrow):
    df = read_dataframe(
        test_fgdb_vsi, layer="test_lines", use_arrow=use_arrow, max_features=1
    )
-    if Version(pd.__version__) >= Version("2.0.0"):
+    if PANDAS_GE_20:
        # starting with pandas 2.0, it preserves the passed datetime resolution
        assert df.SURVEY_DAT.dtype.name == "datetime64[ms]"
    else:
        assert df.SURVEY_DAT.dtype.name == "datetime64[ns]"


+def
test_read_datetime_tz(test_datetime_tz, tmp_path): + df = read_dataframe(test_datetime_tz) + raw_expected = ["2020-01-01T09:00:00.123-05:00", "2020-01-01T10:00:00-05:00"] + + if PANDAS_GE_20: + expected = pd.to_datetime(raw_expected, format="ISO8601").as_unit("ms") + else: + expected = pd.to_datetime(raw_expected) + expected = pd.Series(expected, name="datetime_col") + assert_series_equal(df.datetime_col, expected) + # test write and read round trips + fpath = tmp_path / "test.gpkg" + write_dataframe(df, fpath) + df_read = read_dataframe(fpath) + assert_series_equal(df_read.datetime_col, expected) + + +def test_write_datetime_mixed_offset(tmp_path): + # Australian Summer Time AEDT (GMT+11), Standard Time AEST (GMT+10) + dates = ["2023-01-01 11:00:01.111", "2023-06-01 10:00:01.111"] + naive_col = pd.Series(pd.to_datetime(dates), name="dates") + localised_col = naive_col.dt.tz_localize("Australia/Sydney") + utc_col = localised_col.dt.tz_convert("UTC") + if PANDAS_GE_20: + utc_col = utc_col.dt.as_unit("ms") + + df = gp.GeoDataFrame( + {"dates": localised_col, "geometry": [Point(1, 1), Point(1, 1)]}, + crs="EPSG:4326", + ) + fpath = tmp_path / "test.gpkg" + write_dataframe(df, fpath) + result = read_dataframe(fpath) + # GDAL tz only encodes offsets, not timezones + # check multiple offsets are read as utc datetime instead of string values + assert_series_equal(result["dates"], utc_col) + + +def test_read_write_datetime_tz_with_nulls(tmp_path): + dates_raw = ["2020-01-01T09:00:00.123-05:00", "2020-01-01T10:00:00-05:00", pd.NaT] + if PANDAS_GE_20: + dates = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms") + else: + dates = pd.to_datetime(dates_raw) + df = gp.GeoDataFrame( + {"dates": dates, "geometry": [Point(1, 1), Point(1, 1), Point(1, 1)]}, + crs="EPSG:4326", + ) + fpath = tmp_path / "test.gpkg" + write_dataframe(df, fpath) + result = read_dataframe(fpath) + assert_geodataframe_equal(df, result) + + def test_read_null_values(test_fgdb_vsi, use_arrow): df = read_dataframe(test_fgdb_vsi, use_arrow=use_arrow, read_geometry=False) diff --git a/pyogrio/tests/test_raw_io.py b/pyogrio/tests/test_raw_io.py index a2984b8d..ca0cf5a3 100644 --- a/pyogrio/tests/test_raw_io.py +++ b/pyogrio/tests/test_raw_io.py @@ -929,6 +929,19 @@ def test_read_unsupported_ext_with_prefix(tmp_path): assert field_data[0] == "data1" +def test_read_datetime_as_string(test_datetime_tz): + field = read(test_datetime_tz)[3][0] + assert field.dtype == "datetime64[ms]" + # timezone is ignored in numpy layer + assert field[0] == np.datetime64("2020-01-01 09:00:00.123") + assert field[1] == np.datetime64("2020-01-01 10:00:00.000") + field = read(test_datetime_tz, datetime_as_string=True)[3][0] + assert field.dtype == "object" + # GDAL doesn't return strings in ISO format (yet) + assert field[0] == "2020/01/01 09:00:00.123-05" + assert field[1] == "2020/01/01 10:00:00-05" + + @pytest.mark.parametrize("ext", ["gpkg", "geojson"]) def test_read_write_null_geometry(tmp_path, ext): # Point(0, 0), null
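
A minimal sketch of the round-trip behaviour this diff implements, mirroring `test_read_datetime_tz` above. The output path `example.gpkg` is hypothetical; `read_dataframe` / `write_dataframe` are pyogrio's public API:

```python
import geopandas as gp
import pandas as pd
from shapely.geometry import Point

from pyogrio import read_dataframe, write_dataframe

# Two timestamps sharing one fixed UTC offset (-05:00), as in the
# test_datetime_tz.geojson fixture added above.
ts = pd.Series(
    pd.to_datetime(["2020-01-01T09:00:00.123-05:00", "2020-01-01T10:00:00-05:00"]),
    name="datetime_col",
)
df = gp.GeoDataFrame(
    {"datetime_col": ts, "geometry": [Point(1, 1), Point(2, 2)]}, crs="EPSG:4326"
)

# offsets are passed to GDAL via the gdal_tz_offsets mechanism added in this diff
write_dataframe(df, "example.gpkg")
result = read_dataframe("example.gpkg")

# A single constant offset survives the round trip as a tz-aware column
# (datetime64[ms] on pandas >= 2.0). Columns mixing offsets, e.g. standard
# plus summer time, come back as UTC instead, since GDAL stores only
# per-value offsets and the IANA zone cannot be reconstructed.
print(result["datetime_col"].dtype)
```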
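And a sketch of the RFC 56 timezone-flag arithmetic used by `write_dataframe` / `ogr_write` above (100 means GMT, one unit per 15 minutes of offset); the sample values are illustrative only:

```python
import pandas as pd

# Wall-clock values in a zone with DST: January is AEDT (+11:00), June is AEST (+10:00)
ts = pd.Series(
    pd.to_datetime(["2023-01-01 11:00:01.111", "2023-06-01 10:00:01.111"])
).dt.tz_localize("Australia/Sydney")

# As in write_dataframe: naive wall time minus naive UTC time gives the
# per-row offset from UTC...
naive = ts.dt.tz_localize(None)
tz_offset = naive - ts.dt.tz_convert("UTC").dt.tz_localize(None)

# ...which GDAL encodes as 100 for GMT, +/-1 per 15 minutes of offset
gdal_tz = tz_offset // pd.Timedelta("15m") + 100
print(list(gdal_tz))  # [144, 140] -> +11:00 and +10:00
```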