From e075091377a37c4341bd0d4471ac950b161b8676 Mon Sep 17 00:00:00 2001 From: Matt Richards Date: Sun, 7 May 2023 22:07:49 +1000 Subject: [PATCH 01/51] minimal working pandas layer without timezones --- pyogrio/_io.pyx | 34 +++++++++++++++++++++------------- pyogrio/geopandas.py | 6 ++++++ 2 files changed, 27 insertions(+), 13 deletions(-) diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx index 9065904c..5f6c8410 100644 --- a/pyogrio/_io.pyx +++ b/pyogrio/_io.pyx @@ -657,22 +657,26 @@ cdef process_fields( data[i] = bin_value[:ret_length] elif field_type == OFTDateTime or field_type == OFTDate: - success = OGR_F_GetFieldAsDateTimeEx( - ogr_feature, field_index, &year, &month, &day, &hour, &minute, &fsecond, &timezone) + # defer datetime parsing to pandas layer (TODO this would be a regression in the numpy, implement datetime_as_string toggle as suggested) + value = get_string(OGR_F_GetFieldAsString(ogr_feature, field_index), encoding=encoding) + data[i] = value - ms, ss = math.modf(fsecond) - second = int(ss) - # fsecond has millisecond accuracy - microsecond = round(ms * 1000) * 1000 + # success = OGR_F_GetFieldAsDateTimeEx( + # ogr_feature, field_index, &year, &month, &day, &hour, &minute, &fsecond, &timezone) - if not success: - data[i] = np.datetime64('NaT') + # ms, ss = math.modf(fsecond) + # second = int(ss) + # # fsecond has millisecond accuracy + # microsecond = round(ms * 1000) * 1000 - elif field_type == OFTDate: - data[i] = datetime.date(year, month, day).isoformat() + # if not success: + # data[i] = np.datetime64('NaT') - elif field_type == OFTDateTime: - data[i] = datetime.datetime(year, month, day, hour, minute, second, microsecond).isoformat() + # elif field_type == OFTDate: + # data[i] = datetime.date(year, month, day).isoformat() + + # elif field_type == OFTDateTime: + # data[i] = datetime.datetime(year, month, day, hour, minute, second, microsecond).isoformat() @cython.boundscheck(False) # Deactivate bounds checking @@ -718,8 +722,11 @@ cdef get_features( field_data = [ np.empty(shape=(num_features, ), - dtype=fields[field_index,3]) for field_index in range(n_fields) + # TODO cython has walrus? + dtype=(fields[field_index,3] if not fields[field_index,3].startswith("datetime") else "object")) for field_index in range(n_fields) ] + types = [(fields[field_index,2],fields[field_index,3]) for field_index in range(n_fields)] + print("types are", types) field_data_view = [field_data[field_index][:] for field_index in range(n_fields)] i = 0 @@ -1058,6 +1065,7 @@ def ogr_read( 'crs': crs, 'encoding': encoding, 'fields': fields[:,2], # return only names + 'dtypes':fields[:,3], 'geometry_type': geometry_type, } diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py index a69ddb14..ea9e52d2 100644 --- a/pyogrio/geopandas.py +++ b/pyogrio/geopandas.py @@ -165,6 +165,7 @@ def read_dataframe( ) if use_arrow: + # TODO what happens to datetimes with arrow? meta, table = result df = table.to_pandas() geometry_name = meta["geometry_name"] or "wkb_geometry" @@ -175,6 +176,8 @@ def read_dataframe( return df meta, index, geometry, field_data = result + print("meta is") + print(meta) columns = meta["fields"].tolist() data = {columns[i]: field_data[i] for i in range(len(columns))} @@ -184,6 +187,9 @@ def read_dataframe( index = None df = pd.DataFrame(data, columns=columns, index=index) + for dtype, c in zip(meta["dtypes"], df.columns): + if dtype.startswith("datetime"): + df[c] = pd.to_datetime(df[c], format="%Y/%m/%d %H:%M:%S%z") if geometry is None or not read_geometry: return df From 3df7936109b64b39a9d8af75d0c5b8abbb0cd75c Mon Sep 17 00:00:00 2001 From: Matt Richards Date: Sun, 7 May 2023 22:25:23 +1000 Subject: [PATCH 02/51] implement datetime_as_string toggle to get numpy layer working --- pyogrio/_io.pyx | 58 +++++++++++++++++++++++++------------------- pyogrio/geopandas.py | 4 +++ pyogrio/raw.py | 6 +++++ 3 files changed, 43 insertions(+), 25 deletions(-) diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx index 5f6c8410..fe311792 100644 --- a/pyogrio/_io.pyx +++ b/pyogrio/_io.pyx @@ -599,7 +599,8 @@ cdef process_fields( object field_data_view, object field_indexes, object field_ogr_types, - encoding + encoding, + bint datetime_as_string ): cdef int j cdef int success @@ -657,26 +658,27 @@ cdef process_fields( data[i] = bin_value[:ret_length] elif field_type == OFTDateTime or field_type == OFTDate: - # defer datetime parsing to pandas layer (TODO this would be a regression in the numpy, implement datetime_as_string toggle as suggested) - value = get_string(OGR_F_GetFieldAsString(ogr_feature, field_index), encoding=encoding) - data[i] = value - - # success = OGR_F_GetFieldAsDateTimeEx( - # ogr_feature, field_index, &year, &month, &day, &hour, &minute, &fsecond, &timezone) + # defer datetime parsing to pandas layer + if datetime_as_string: + value = get_string(OGR_F_GetFieldAsString(ogr_feature, field_index), encoding=encoding) + data[i] = value + else: + success = OGR_F_GetFieldAsDateTimeEx( + ogr_feature, field_index, &year, &month, &day, &hour, &minute, &fsecond, &timezone) - # ms, ss = math.modf(fsecond) - # second = int(ss) - # # fsecond has millisecond accuracy - # microsecond = round(ms * 1000) * 1000 + ms, ss = math.modf(fsecond) + second = int(ss) + # fsecond has millisecond accuracy + microsecond = round(ms * 1000) * 1000 - # if not success: - # data[i] = np.datetime64('NaT') + if not success: + data[i] = np.datetime64('NaT') - # elif field_type == OFTDate: - # data[i] = datetime.date(year, month, day).isoformat() + elif field_type == OFTDate: + data[i] = datetime.date(year, month, day).isoformat() - # elif field_type == OFTDateTime: - # data[i] = datetime.datetime(year, month, day, hour, minute, second, microsecond).isoformat() + elif field_type == OFTDateTime: + data[i] = datetime.datetime(year, month, day, hour, minute, second, microsecond).isoformat() @cython.boundscheck(False) # Deactivate bounds checking @@ -689,7 +691,8 @@ cdef get_features( uint8_t force_2d, int skip_features, int num_features, - uint8_t return_fids + uint8_t return_fids, + bint datetime_as_string ): cdef OGRFeatureH ogr_feature = NULL @@ -723,7 +726,7 @@ cdef get_features( field_data = [ np.empty(shape=(num_features, ), # TODO cython has walrus? - dtype=(fields[field_index,3] if not fields[field_index,3].startswith("datetime") else "object")) for field_index in range(n_fields) + dtype=("object" if datetime_as_string and fields[field_index,3].startswith("datetime") else fields[field_index,3])) for field_index in range(n_fields) ] types = [(fields[field_index,2],fields[field_index,3]) for field_index in range(n_fields)] print("types are", types) @@ -765,7 +768,7 @@ cdef get_features( process_fields( ogr_feature, i, n_fields, field_data, field_data_view, - field_indexes, field_ogr_types, encoding + field_indexes, field_ogr_types, encoding, datetime_as_string ) i += 1 finally: @@ -795,7 +798,8 @@ cdef get_features_by_fid( object[:,:] fields, encoding, uint8_t read_geometry, - uint8_t force_2d + uint8_t force_2d, + bint datetime_as_string ): cdef OGRFeatureH ogr_feature = NULL @@ -818,7 +822,7 @@ cdef get_features_by_fid( n_fields = fields.shape[0] field_indexes = fields[:,0] field_ogr_types = fields[:,1] - + # TODO missing datetime dtype hack on this path? field_data = [ np.empty(shape=(count, ), dtype=fields[field_index,3]) for field_index in range(n_fields) @@ -844,7 +848,7 @@ cdef get_features_by_fid( process_fields( ogr_feature, i, n_fields, field_data, field_data_view, - field_indexes, field_ogr_types, encoding + field_indexes, field_ogr_types, encoding, datetime_as_string ) finally: if ogr_feature != NULL: @@ -946,7 +950,9 @@ def ogr_read( object fids=None, str sql=None, str sql_dialect=None, - int return_fids=False): + int return_fids=False, + bint datetime_as_string=False + ): cdef int err = 0 cdef const char *path_c = NULL @@ -1029,6 +1035,7 @@ def ogr_read( encoding, read_geometry=read_geometry and geometry_type is not None, force_2d=force_2d, + datetime_as_string=datetime_as_string ) # bypass reading fids since these should match fids used for read @@ -1058,7 +1065,8 @@ def ogr_read( force_2d=force_2d, skip_features=skip_features, num_features=num_features, - return_fids=return_fids + return_fids=return_fids, + datetime_as_string=datetime_as_string ) meta = { diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py index ea9e52d2..c996cb3e 100644 --- a/pyogrio/geopandas.py +++ b/pyogrio/geopandas.py @@ -146,6 +146,10 @@ def read_dataframe( path_or_buffer = _stringify_path(path_or_buffer) read_func = read_arrow if use_arrow else read + if not read_arrow and "datetime_as_string" not in kwargs: + kwargs["datetime_as_string"] = True + # TODO work out what to do with the arrow api + result = read_func( path_or_buffer, layer=layer, diff --git a/pyogrio/raw.py b/pyogrio/raw.py index 27bc5ed4..30b73931 100644 --- a/pyogrio/raw.py +++ b/pyogrio/raw.py @@ -53,6 +53,7 @@ def read( sql=None, sql_dialect=None, return_fids=False, + datetime_as_string=False, **kwargs, ): """Read OGR data source into numpy arrays. @@ -108,6 +109,10 @@ def read( number of features usings FIDs is also driver specific. return_fids : bool, optional (default: False) If True, will return the FIDs of the feature that were read. + datetime_as_string : bool, optional (default: False) + If True, will return datetime dtypes as detected by GDAL as a string + array, instead of a datetime64 array. + **kwargs Additional driver-specific dataset open options passed to OGR. Invalid options will trigger a warning. @@ -150,6 +155,7 @@ def read( sql_dialect=sql_dialect, return_fids=return_fids, dataset_kwargs=dataset_kwargs, + datetime_as_string=datetime_as_string, ) finally: if buffer is not None: From d68b47301113749c554854ed6aff0b9c850ddfc1 Mon Sep 17 00:00:00 2001 From: Matt Richards Date: Mon, 8 May 2023 20:02:46 +1000 Subject: [PATCH 03/51] make tests pass --- pyogrio/tests/conftest.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pyogrio/tests/conftest.py b/pyogrio/tests/conftest.py index 6f69fd92..44b66012 100644 --- a/pyogrio/tests/conftest.py +++ b/pyogrio/tests/conftest.py @@ -43,6 +43,7 @@ def prepare_testfile(testfile_path, dst_dir, ext): # For .gpkg, spatial_index=False to avoid the rows being reordered meta["spatial_index"] = False meta["geometry_type"] = "MultiPolygon" + meta.pop("dtypes") write(dst_path, geometry, field_data, **meta) return dst_path From 9aa5a8ca028c6c0264f522a0e73703dac221cd91 Mon Sep 17 00:00:00 2001 From: Matt Richards Date: Mon, 8 May 2023 20:16:04 +1000 Subject: [PATCH 04/51] add tests showing existing behaviour no tz --- pyogrio/tests/conftest.py | 5 +++++ pyogrio/tests/test_geopandas_io.py | 14 ++++++++++++++ pyogrio/tests/test_raw_io.py | 14 ++++++++++++++ 3 files changed, 33 insertions(+) diff --git a/pyogrio/tests/conftest.py b/pyogrio/tests/conftest.py index 44b66012..ef47fc96 100644 --- a/pyogrio/tests/conftest.py +++ b/pyogrio/tests/conftest.py @@ -98,3 +98,8 @@ def test_ogr_types_list(): @pytest.fixture(scope="session") def test_datetime(): return _data_dir / "test_datetime.geojson" + + +@pytest.fixture(scope="session") +def test_datetime_tz(): + return _data_dir / "test_datetime_tz.geojson" diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 8e9fd23a..9a14d1cf 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -139,6 +139,20 @@ def test_read_datetime(test_fgdb_vsi): assert df.SURVEY_DAT.dtype.name == "datetime64[ns]" +def test_read_datetime_tz(test_datetime_tz): + df = read_dataframe(test_datetime_tz) + if Version(pd.__version__) >= Version("2.0.0"): + # starting with pandas 2.0, it preserves the passed datetime resolution + assert df.col.dtype.name == "datetime64[ms]" + else: + assert df.col.dtype.name == "datetime64[ns]" + + df.col = df.col.astype("datetime64[ms]") + print(df.col) + print(df.col.dtype) + # TODO assertion + + def test_read_null_values(test_fgdb_vsi): df = read_dataframe(test_fgdb_vsi, read_geometry=False) diff --git a/pyogrio/tests/test_raw_io.py b/pyogrio/tests/test_raw_io.py index cb342618..5498bee1 100644 --- a/pyogrio/tests/test_raw_io.py +++ b/pyogrio/tests/test_raw_io.py @@ -613,6 +613,20 @@ def test_read_datetime_millisecond(test_datetime): assert field[1] == np.datetime64("2020-01-01 10:00:00.000") +def test_read_datetime_tz(test_datetime_tz): + field = read(test_datetime_tz)[3][0] + assert field.dtype == "datetime64[ms]" + assert field[0] == np.datetime64( + "2020-01-01 09:00:00.123" + ) # timezone is ignored TODO should this be fixed as well? + assert field[1] == np.datetime64("2020-01-01 10:00:00.000") + field = read(test_datetime_tz, datetime_as_string=True)[3][0] + assert field.dtype == "object" + # GDAL doesn't return strings in ISO format (yet) + assert field[0] == "2020/01/01 09:00:00.123-05" + assert field[1] == "2020/01/01 10:00:00-05" + + @pytest.mark.parametrize("ext", ["gpkg", "geojson"]) def test_read_write_null_geometry(tmp_path, ext): # Point(0, 0), null From 1a2af4d179f2fa813325850cc278504352ede059 Mon Sep 17 00:00:00 2001 From: Matt Richards Date: Mon, 8 May 2023 22:29:33 +1000 Subject: [PATCH 05/51] working read --- pyogrio/geopandas.py | 6 ++--- pyogrio/tests/test_geopandas_io.py | 40 ++++++++++++++++++++++++------ 2 files changed, 36 insertions(+), 10 deletions(-) diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py index c996cb3e..fa064679 100644 --- a/pyogrio/geopandas.py +++ b/pyogrio/geopandas.py @@ -146,10 +146,9 @@ def read_dataframe( path_or_buffer = _stringify_path(path_or_buffer) read_func = read_arrow if use_arrow else read - if not read_arrow and "datetime_as_string" not in kwargs: + if not use_arrow and "datetime_as_string" not in kwargs: kwargs["datetime_as_string"] = True # TODO work out what to do with the arrow api - result = read_func( path_or_buffer, layer=layer, @@ -189,8 +188,9 @@ def read_dataframe( index = pd.Index(index, name="fid") else: index = None - + print(data) df = pd.DataFrame(data, columns=columns, index=index) + print(df) for dtype, c in zip(meta["dtypes"], df.columns): if dtype.startswith("datetime"): df[c] = pd.to_datetime(df[c], format="%Y/%m/%d %H:%M:%S%z") diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 9a14d1cf..4d1ec7c9 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -18,7 +18,11 @@ try: import pandas as pd - from pandas.testing import assert_frame_equal, assert_index_equal + from pandas.testing import ( + assert_frame_equal, + assert_index_equal, + assert_series_equal, + ) import geopandas as gp from geopandas.array import from_wkt @@ -143,14 +147,36 @@ def test_read_datetime_tz(test_datetime_tz): df = read_dataframe(test_datetime_tz) if Version(pd.__version__) >= Version("2.0.0"): # starting with pandas 2.0, it preserves the passed datetime resolution - assert df.col.dtype.name == "datetime64[ms]" + assert df.col.dtype.name == "datetime64[ms, pytz.FixedOffset(-300)]" + else: + assert df.col.dtype.name == "datetime64[ns, pytz.FixedOffset(-300)]" + + assert_series_equal( + df.col, + pd.Series( + pd.to_datetime( + ["2020-01-01T09:00:00.123-05:00", "2020-01-01T10:00:00-05:00"] + ), + name="col", + ), + ) + + df = read_dataframe(test_datetime_tz, use_arrow=True) + if Version(pd.__version__) >= Version("2.0.0"): + # starting with pandas 2.0, it preserves the passed datetime resolution + assert df.col.dtype.name == "datetime64[ms, pytz.FixedOffset(-300)]" else: - assert df.col.dtype.name == "datetime64[ns]" + assert df.col.dtype.name == "datetime64[ns, pytz.FixedOffset(-300)]" - df.col = df.col.astype("datetime64[ms]") - print(df.col) - print(df.col.dtype) - # TODO assertion + assert_series_equal( + df.col, + pd.Series( + pd.to_datetime( + ["2020-01-01T09:00:00.123-05:00", "2020-01-01T10:00:00-05:00"] + ), + name="col", + ), + ) def test_read_null_values(test_fgdb_vsi): From fbd289892737e18d8f763d546dbb7593224233d2 Mon Sep 17 00:00:00 2001 From: Matt Richards Date: Tue, 9 May 2023 21:20:20 +1000 Subject: [PATCH 06/51] commit my test file --- pyogrio/tests/fixtures/test_datetime_tz.geojson | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 pyogrio/tests/fixtures/test_datetime_tz.geojson diff --git a/pyogrio/tests/fixtures/test_datetime_tz.geojson b/pyogrio/tests/fixtures/test_datetime_tz.geojson new file mode 100644 index 00000000..2fb72759 --- /dev/null +++ b/pyogrio/tests/fixtures/test_datetime_tz.geojson @@ -0,0 +1,7 @@ +{ +"type": "FeatureCollection", +"features": [ +{ "type": "Feature", "properties": { "col": "2020-01-01T09:00:00.123-05:00" }, "geometry": { "type": "Point", "coordinates": [ 1.0, 1.0 ] } }, +{ "type": "Feature", "properties": { "col": "2020-01-01T10:00:00-05:00" }, "geometry": { "type": "Point", "coordinates": [ 2.0, 2.0 ] } } +] +} From 127d0a7d5daadd8abd52e901b0213659a3c0c5c0 Mon Sep 17 00:00:00 2001 From: Matt Richards Date: Thu, 11 May 2023 09:08:05 +1000 Subject: [PATCH 07/51] actually fix tests with read working --- pyogrio/raw.py | 1 + pyogrio/tests/test_geopandas_io.py | 63 ++++++++++++++++++------------ pyogrio/tests/test_raw_io.py | 3 ++ 3 files changed, 41 insertions(+), 26 deletions(-) diff --git a/pyogrio/raw.py b/pyogrio/raw.py index 30b73931..38f429e3 100644 --- a/pyogrio/raw.py +++ b/pyogrio/raw.py @@ -393,6 +393,7 @@ def write( layer_options=None, **kwargs, ): + kwargs.pop("dtypes", None) if geometry_type is None: raise ValueError("geometry_type must be provided") diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 4d1ec7c9..44d22a01 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -143,40 +143,51 @@ def test_read_datetime(test_fgdb_vsi): assert df.SURVEY_DAT.dtype.name == "datetime64[ns]" -def test_read_datetime_tz(test_datetime_tz): +def test_read_datetime_tz(test_datetime_tz, tmp_path): df = read_dataframe(test_datetime_tz) if Version(pd.__version__) >= Version("2.0.0"): # starting with pandas 2.0, it preserves the passed datetime resolution assert df.col.dtype.name == "datetime64[ms, pytz.FixedOffset(-300)]" else: assert df.col.dtype.name == "datetime64[ns, pytz.FixedOffset(-300)]" - - assert_series_equal( - df.col, - pd.Series( - pd.to_datetime( - ["2020-01-01T09:00:00.123-05:00", "2020-01-01T10:00:00-05:00"] - ), - name="col", - ), + expected_dt_col = pd.Series( + pd.to_datetime(["2020-01-01T09:00:00.123-05:00", "2020-01-01T10:00:00-05:00"]), + name="col", ) - - df = read_dataframe(test_datetime_tz, use_arrow=True) - if Version(pd.__version__) >= Version("2.0.0"): - # starting with pandas 2.0, it preserves the passed datetime resolution - assert df.col.dtype.name == "datetime64[ms, pytz.FixedOffset(-300)]" - else: - assert df.col.dtype.name == "datetime64[ns, pytz.FixedOffset(-300)]" - - assert_series_equal( - df.col, - pd.Series( - pd.to_datetime( - ["2020-01-01T09:00:00.123-05:00", "2020-01-01T10:00:00-05:00"] - ), - name="col", - ), + assert_series_equal(df.col, expected_dt_col) + # test write and read round trips + fpath = tmp_path / "test.gpkg" + write_dataframe(df, fpath) + df_read = read_dataframe(fpath) + print("good\n", df.col) + import pytz + + print( + "bad\n", + df_read.col, + "\n", + df_read.col.dt.tz_localize("UTC").dt.tz_convert(pytz.FixedOffset(-300)), ) + assert_series_equal(df_read.col, expected_dt_col) + + # assert_geodataframe_equal(df, df_read) + + # df = read_dataframe(test_datetime_tz, use_arrow=True) + # if Version(pd.__version__) >= Version("2.0.0"): + # # starting with pandas 2.0, it preserves the passed datetime resolution + # assert df.col.dtype.name == "datetime64[ms, pytz.FixedOffset(-300)]" + # else: + # assert df.col.dtype.name == "datetime64[ns, pytz.FixedOffset(-300)]" + + # assert_series_equal( + # df.col, + # pd.Series( + # pd.to_datetime( + # ["2020-01-01T09:00:00.123-05:00", "2020-01-01T10:00:00-05:00"] + # ), + # name="col", + # ), + # ) def test_read_null_values(test_fgdb_vsi): diff --git a/pyogrio/tests/test_raw_io.py b/pyogrio/tests/test_raw_io.py index 5498bee1..2d57da90 100644 --- a/pyogrio/tests/test_raw_io.py +++ b/pyogrio/tests/test_raw_io.py @@ -260,6 +260,7 @@ def test_read_fids_unsupported_keywords(naturalearth_lowres): def test_write(tmpdir, naturalearth_lowres): meta, _, geometry, field_data = read(naturalearth_lowres) + meta.pop("dtypes") filename = os.path.join(str(tmpdir), "test.shp") write(filename, geometry, field_data, **meta) @@ -463,6 +464,7 @@ def assert_equal_result(result1, result2): def test_read_from_bytes(tmpdir, naturalearth_lowres, driver, ext): meta, index, geometry, field_data = read(naturalearth_lowres) meta.update({"geometry_type": "Unknown"}) + meta.pop("dtypes") filename = os.path.join(str(tmpdir), f"test.{ext}") write(filename, geometry, field_data, driver=driver, **meta) @@ -489,6 +491,7 @@ def test_read_from_bytes_zipped(tmpdir, naturalearth_lowres_vsi): def test_read_from_file_like(tmpdir, naturalearth_lowres, driver, ext): meta, index, geometry, field_data = read(naturalearth_lowres) meta.update({"geometry_type": "Unknown"}) + meta.pop("dtypes") filename = os.path.join(str(tmpdir), f"test.{ext}") write(filename, geometry, field_data, driver=driver, **meta) From 016778ae8936da4966f2e228cecc4cee5d811e28 Mon Sep 17 00:00:00 2001 From: Matt Richards Date: Sun, 21 May 2023 20:52:22 +1000 Subject: [PATCH 08/51] good enough wip progress for now --- pyogrio/_io.pyx | 13 ++++++--- pyogrio/geopandas.py | 17 ++++++++++-- pyogrio/raw.py | 4 +++ pyogrio/tests/test_geopandas_io.py | 42 ++++++------------------------ 4 files changed, 36 insertions(+), 40 deletions(-) diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx index fe311792..f74438d7 100644 --- a/pyogrio/_io.pyx +++ b/pyogrio/_io.pyx @@ -1488,7 +1488,7 @@ cdef infer_field_types(list dtypes): def ogr_write( str path, str layer, str driver, geometry, fields, field_data, field_mask, str crs, str geometry_type, str encoding, object dataset_kwargs, - object layer_kwargs, bint promote_to_multi=False, bint nan_as_null=True, + object layer_kwargs, tz_offsets, bint promote_to_multi=False, bint nan_as_null=True, bint append=False, dataset_metadata=None, layer_metadata=None ): cdef const char *path_c = NULL @@ -1795,6 +1795,9 @@ def ogr_write( if np.isnat(field_value): OGR_F_SetFieldNull(ogr_feature, field_idx) else: + # TODO check if 0=unknown or 1=localtime is most appropriate + tz = tz_offsets.get(fields[field_idx], 1) + print(tz) datetime = field_value.item() OGR_F_SetFieldDateTimeEx( ogr_feature, @@ -1805,14 +1808,16 @@ def ogr_write( 0, 0, 0.0, - 0 + tz ) elif field_type == OFTDateTime: if np.isnat(field_value): OGR_F_SetFieldNull(ogr_feature, field_idx) else: - # TODO: add support for timezones + # TODO check if 0=unknown or 1=localtime is most appropriate + tz = tz_offsets.get(fields[field_idx], 1) + print(tz, type(tz)) datetime = field_value.astype("datetime64[ms]").item() OGR_F_SetFieldDateTimeEx( ogr_feature, @@ -1823,7 +1828,7 @@ def ogr_write( datetime.hour, datetime.minute, datetime.second + datetime.microsecond / 10**6, - 0 + tz ) else: diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py index fa064679..db70189e 100644 --- a/pyogrio/geopandas.py +++ b/pyogrio/geopandas.py @@ -1,3 +1,4 @@ +import datetime import numpy as np from pyogrio.raw import DRIVERS_NO_MIXED_SINGLE_MULTI, DRIVERS_NO_MIXED_DIMENSIONS from pyogrio.raw import detect_driver, read, read_arrow, write @@ -193,7 +194,7 @@ def read_dataframe( print(df) for dtype, c in zip(meta["dtypes"], df.columns): if dtype.startswith("datetime"): - df[c] = pd.to_datetime(df[c], format="%Y/%m/%d %H:%M:%S%z") + df[c] = pd.to_datetime(df[c], format="ISO8601") if geometry is None or not read_geometry: return df @@ -336,8 +337,19 @@ def write_dataframe( # TODO: may need to fill in pd.NA, etc field_data = [] field_mask = [] + tz_offsets = {} # gdal format timezone offsets by column for name in fields: - col = df[name].values + ser = df[name] + col = ser.values + if isinstance(ser.dtype, pd.DatetimeTZDtype): + if len(ser)>0: + # Assume series has same offset for all rows (enforced by definition in pandas) + tz_offsets[name] = int(ser.iloc[0].utcoffset() / datetime.timedelta(minutes=15)) +100 + # Convert to naive datetime, add offset back per above + col = ser.dt.tz_localize(None).values + + # edge case of length zero ignored, don't have a reference date + if isinstance(col, pd.api.extensions.ExtensionArray): from pandas.arrays import IntegerArray, FloatingArray, BooleanArray @@ -437,5 +449,6 @@ def write_dataframe( metadata=metadata, dataset_options=dataset_options, layer_options=layer_options, + tz_offsets=tz_offsets, **kwargs, ) diff --git a/pyogrio/raw.py b/pyogrio/raw.py index 38f429e3..1228f672 100644 --- a/pyogrio/raw.py +++ b/pyogrio/raw.py @@ -391,6 +391,7 @@ def write( metadata=None, dataset_options=None, layer_options=None, + tz_offsets=None, **kwargs, ): kwargs.pop("dtypes", None) @@ -440,6 +441,8 @@ def write( "projection information defined and may not be usable in other " "systems." ) + if tz_offsets is None: + tz_offsets = {} # preprocess kwargs and split in dataset and layer creation options dataset_kwargs = _preprocess_options_key_value(dataset_options or {}) @@ -478,4 +481,5 @@ def write( layer_metadata=layer_metadata, dataset_kwargs=dataset_kwargs, layer_kwargs=layer_kwargs, + tz_offsets=tz_offsets, ) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 44d22a01..4257e7f8 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -136,6 +136,7 @@ def test_read_layer_invalid(naturalearth_lowres_all_ext): @pytest.mark.filterwarnings("ignore: Measured") def test_read_datetime(test_fgdb_vsi): df = read_dataframe(test_fgdb_vsi, layer="test_lines", max_features=1) + print(df) if Version(pd.__version__) >= Version("2.0.0"): # starting with pandas 2.0, it preserves the passed datetime resolution assert df.SURVEY_DAT.dtype.name == "datetime64[ms]" @@ -145,49 +146,22 @@ def test_read_datetime(test_fgdb_vsi): def test_read_datetime_tz(test_datetime_tz, tmp_path): df = read_dataframe(test_datetime_tz) - if Version(pd.__version__) >= Version("2.0.0"): - # starting with pandas 2.0, it preserves the passed datetime resolution - assert df.col.dtype.name == "datetime64[ms, pytz.FixedOffset(-300)]" - else: - assert df.col.dtype.name == "datetime64[ns, pytz.FixedOffset(-300)]" + + assert df.col.dtype.unit == "ns" # tz aware does not support ms resolution, even in pandas 2 expected_dt_col = pd.Series( - pd.to_datetime(["2020-01-01T09:00:00.123-05:00", "2020-01-01T10:00:00-05:00"]), + pd.to_datetime(["2020-01-01T09:00:00.123-05:00", "2020-01-01T10:00:00-05:00"], format="ISO8601"), name="col", ) assert_series_equal(df.col, expected_dt_col) # test write and read round trips - fpath = tmp_path / "test.gpkg" + fpath = tmp_path / "test.geojson" # TODO gpkg doesn't work here, at least for my local gdal, writes NaT write_dataframe(df, fpath) df_read = read_dataframe(fpath) - print("good\n", df.col) - import pytz - - print( - "bad\n", - df_read.col, - "\n", - df_read.col.dt.tz_localize("UTC").dt.tz_convert(pytz.FixedOffset(-300)), - ) + print("a", df_read.col) + print("b", expected_dt_col) assert_series_equal(df_read.col, expected_dt_col) - # assert_geodataframe_equal(df, df_read) - - # df = read_dataframe(test_datetime_tz, use_arrow=True) - # if Version(pd.__version__) >= Version("2.0.0"): - # # starting with pandas 2.0, it preserves the passed datetime resolution - # assert df.col.dtype.name == "datetime64[ms, pytz.FixedOffset(-300)]" - # else: - # assert df.col.dtype.name == "datetime64[ns, pytz.FixedOffset(-300)]" - - # assert_series_equal( - # df.col, - # pd.Series( - # pd.to_datetime( - # ["2020-01-01T09:00:00.123-05:00", "2020-01-01T10:00:00-05:00"] - # ), - # name="col", - # ), - # ) + def test_read_null_values(test_fgdb_vsi): From faa06311a94bd5a91bbcc9f4c56d222fe466a192 Mon Sep 17 00:00:00 2001 From: Matt Richards Date: Sun, 21 May 2023 21:11:18 +1000 Subject: [PATCH 09/51] make these failures easier to read --- .github/workflows/tests-conda.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/tests-conda.yml b/.github/workflows/tests-conda.yml index e7fe62c0..bf867e7b 100644 --- a/.github/workflows/tests-conda.yml +++ b/.github/workflows/tests-conda.yml @@ -66,4 +66,4 @@ jobs: - name: Test run: | - pytest -v -r s pyogrio/tests + pytest -v --color=yes -r s pyogrio/tests From a8c200e801d50356377ea5a994fd1bf4bc23159d Mon Sep 17 00:00:00 2001 From: Matt Richards Date: Sun, 21 May 2023 21:20:46 +1000 Subject: [PATCH 10/51] fix for non tz --- pyogrio/geopandas.py | 15 ++++++++++++++- pyogrio/tests/test_geopandas_io.py | 13 +++++++------ 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py index db70189e..455e7619 100644 --- a/pyogrio/geopandas.py +++ b/pyogrio/geopandas.py @@ -1,5 +1,7 @@ import datetime +import warnings import numpy as np +import pandas as pd from pyogrio.raw import DRIVERS_NO_MIXED_SINGLE_MULTI, DRIVERS_NO_MIXED_DIMENSIONS from pyogrio.raw import detect_driver, read, read_arrow, write from pyogrio.errors import DataSourceError @@ -19,6 +21,17 @@ def _stringify_path(path): # pass-though other objects return path +def _try_parse_datetime(ser): + # TODO surely a better way of doing this + for format in ["ISO8601", "%y/%m/%d %H:%M:%S", "%y/%m/%d %H:%M:%S.%f", "%y/%m/%d %H:%M:%S.%f%z"]: + try: + return pd.to_datetime(ser, format=format) + except ValueError: + pass + warnings.warn(f"Datetime parsing failed for column {ser.name!r}") + return ser + + def read_dataframe( path_or_buffer, @@ -194,7 +207,7 @@ def read_dataframe( print(df) for dtype, c in zip(meta["dtypes"], df.columns): if dtype.startswith("datetime"): - df[c] = pd.to_datetime(df[c], format="ISO8601") + df[c] = _try_parse_datetime(df[c]) if geometry is None or not read_geometry: return df diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 4257e7f8..8509ddb5 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -136,12 +136,13 @@ def test_read_layer_invalid(naturalearth_lowres_all_ext): @pytest.mark.filterwarnings("ignore: Measured") def test_read_datetime(test_fgdb_vsi): df = read_dataframe(test_fgdb_vsi, layer="test_lines", max_features=1) - print(df) - if Version(pd.__version__) >= Version("2.0.0"): - # starting with pandas 2.0, it preserves the passed datetime resolution - assert df.SURVEY_DAT.dtype.name == "datetime64[ms]" - else: - assert df.SURVEY_DAT.dtype.name == "datetime64[ns]" + # if Version(pd.__version__) >= Version("2.0.0"): + # # starting with pandas 2.0, it preserves the passed datetime resolution + # assert df.SURVEY_DAT.dtype.name == "datetime64[ms]" + # else: + # assert df.SURVEY_DAT.dtype.name == "datetime64[ns]" + # String reading breaks this dtype preservation + assert df.SURVEY_DAT.dtype.name == "datetime64[ns]" def test_read_datetime_tz(test_datetime_tz, tmp_path): From 604737597d10b8ec3f64c829b3c7c088432162b5 Mon Sep 17 00:00:00 2001 From: Matt Richards Date: Mon, 22 May 2023 21:14:58 +1000 Subject: [PATCH 11/51] fix some tests --- pyogrio/_io.pyx | 6 +++--- pyogrio/geopandas.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx index f74438d7..e49b0704 100644 --- a/pyogrio/_io.pyx +++ b/pyogrio/_io.pyx @@ -725,7 +725,7 @@ cdef get_features( field_data = [ np.empty(shape=(num_features, ), - # TODO cython has walrus? + dtype=("object" if datetime_as_string and fields[field_index,3].startswith("datetime") else fields[field_index,3])) for field_index in range(n_fields) ] types = [(fields[field_index,2],fields[field_index,3]) for field_index in range(n_fields)] @@ -822,10 +822,10 @@ cdef get_features_by_fid( n_fields = fields.shape[0] field_indexes = fields[:,0] field_ogr_types = fields[:,1] - # TODO missing datetime dtype hack on this path? field_data = [ np.empty(shape=(count, ), - dtype=fields[field_index,3]) for field_index in range(n_fields) + + dtype=("object" if datetime_as_string and fields[field_index,3].startswith("datetime") else fields[field_index,3])) for field_index in range(n_fields) ] field_data_view = [field_data[field_index][:] for field_index in range(n_fields)] diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py index 455e7619..3cd23898 100644 --- a/pyogrio/geopandas.py +++ b/pyogrio/geopandas.py @@ -1,7 +1,6 @@ import datetime import warnings import numpy as np -import pandas as pd from pyogrio.raw import DRIVERS_NO_MIXED_SINGLE_MULTI, DRIVERS_NO_MIXED_DIMENSIONS from pyogrio.raw import detect_driver, read, read_arrow, write from pyogrio.errors import DataSourceError @@ -22,6 +21,7 @@ def _stringify_path(path): return path def _try_parse_datetime(ser): + import pandas as pd # TODO surely a better way of doing this for format in ["ISO8601", "%y/%m/%d %H:%M:%S", "%y/%m/%d %H:%M:%S.%f", "%y/%m/%d %H:%M:%S.%f%z"]: try: From 606156330e48777dec6ba85d113a50ddbf1a70f3 Mon Sep 17 00:00:00 2001 From: Matt Richards Date: Mon, 22 May 2023 21:44:16 +1000 Subject: [PATCH 12/51] run pre commit --- pyogrio/geopandas.py | 24 ++++++++++++++++-------- pyogrio/tests/test_geopandas_io.py | 18 +++++++++++------- 2 files changed, 27 insertions(+), 15 deletions(-) diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py index 3cd23898..45b18cc4 100644 --- a/pyogrio/geopandas.py +++ b/pyogrio/geopandas.py @@ -20,17 +20,23 @@ def _stringify_path(path): # pass-though other objects return path + def _try_parse_datetime(ser): import pandas as pd + # TODO surely a better way of doing this - for format in ["ISO8601", "%y/%m/%d %H:%M:%S", "%y/%m/%d %H:%M:%S.%f", "%y/%m/%d %H:%M:%S.%f%z"]: - try: + for format in [ + "ISO8601", + "%y/%m/%d %H:%M:%S", + "%y/%m/%d %H:%M:%S.%f", + "%y/%m/%d %H:%M:%S.%f%z", + ]: + try: return pd.to_datetime(ser, format=format) except ValueError: pass warnings.warn(f"Datetime parsing failed for column {ser.name!r}") return ser - def read_dataframe( @@ -350,17 +356,19 @@ def write_dataframe( # TODO: may need to fill in pd.NA, etc field_data = [] field_mask = [] - tz_offsets = {} # gdal format timezone offsets by column + tz_offsets = {} # gdal format timezone offsets by column for name in fields: ser = df[name] col = ser.values if isinstance(ser.dtype, pd.DatetimeTZDtype): - if len(ser)>0: - # Assume series has same offset for all rows (enforced by definition in pandas) - tz_offsets[name] = int(ser.iloc[0].utcoffset() / datetime.timedelta(minutes=15)) +100 + if len(ser) > 0: + # Assume series has same offset for all rows (enforced by pandas) + tz_offsets[name] = ( + int(ser.iloc[0].utcoffset() / datetime.timedelta(minutes=15)) + 100 + ) # Convert to naive datetime, add offset back per above col = ser.dt.tz_localize(None).values - + # edge case of length zero ignored, don't have a reference date if isinstance(col, pd.api.extensions.ExtensionArray): diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 8509ddb5..b8d32d16 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -1,7 +1,6 @@ import contextlib from datetime import datetime import os -from packaging.version import Version import numpy as np import pytest @@ -147,15 +146,22 @@ def test_read_datetime(test_fgdb_vsi): def test_read_datetime_tz(test_datetime_tz, tmp_path): df = read_dataframe(test_datetime_tz) - - assert df.col.dtype.unit == "ns" # tz aware does not support ms resolution, even in pandas 2 + + assert ( + df.col.dtype.unit == "ns" + ) # tz aware does not support ms resolution, even in pandas 2 expected_dt_col = pd.Series( - pd.to_datetime(["2020-01-01T09:00:00.123-05:00", "2020-01-01T10:00:00-05:00"], format="ISO8601"), + pd.to_datetime( + ["2020-01-01T09:00:00.123-05:00", "2020-01-01T10:00:00-05:00"], + format="ISO8601", + ), name="col", ) assert_series_equal(df.col, expected_dt_col) # test write and read round trips - fpath = tmp_path / "test.geojson" # TODO gpkg doesn't work here, at least for my local gdal, writes NaT + fpath = ( + tmp_path / "test.geojson" + ) # TODO gpkg doesn't work here, at least for my local gdal, writes NaT write_dataframe(df, fpath) df_read = read_dataframe(fpath) print("a", df_read.col) @@ -163,8 +169,6 @@ def test_read_datetime_tz(test_datetime_tz, tmp_path): assert_series_equal(df_read.col, expected_dt_col) - - def test_read_null_values(test_fgdb_vsi): df = read_dataframe(test_fgdb_vsi, read_geometry=False) From 3ba42cf37063a5618cb1e58bfc624321ff949afd Mon Sep 17 00:00:00 2001 From: Matt Richards Date: Tue, 23 May 2023 21:12:22 +1000 Subject: [PATCH 13/51] maybe old pandas, can't reproduce locally --- pyogrio/geopandas.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py index 45b18cc4..276a7766 100644 --- a/pyogrio/geopandas.py +++ b/pyogrio/geopandas.py @@ -22,14 +22,24 @@ def _stringify_path(path): def _try_parse_datetime(ser): - import pandas as pd + import pandas as pd # only called in a block where pandas is known to be installed + try: + return pd.to_datetime(ser, format="ISO8601") + except ValueError: + pass + try: + return pd.to_datetime(ser, yearfirst=True) + except ValueError: + pass # TODO surely a better way of doing this for format in [ - "ISO8601", "%y/%m/%d %H:%M:%S", "%y/%m/%d %H:%M:%S.%f", "%y/%m/%d %H:%M:%S.%f%z", + "%y/%m/%dT%H:%M:%S", + "%y/%m/%dT%H:%M:%S.%f", + "%y/%m/%dT%H:%M:%S.%f%z", ]: try: return pd.to_datetime(ser, format=format) From d98314062ac68f865fdc5b6131adc0857b4005ef Mon Sep 17 00:00:00 2001 From: Matt Richards Date: Tue, 23 May 2023 21:26:38 +1000 Subject: [PATCH 14/51] try and find something pandas 1.5 also happy with --- pyogrio/tests/test_geopandas_io.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index b8d32d16..037417fe 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -153,7 +153,9 @@ def test_read_datetime_tz(test_datetime_tz, tmp_path): expected_dt_col = pd.Series( pd.to_datetime( ["2020-01-01T09:00:00.123-05:00", "2020-01-01T10:00:00-05:00"], - format="ISO8601", + format='mixed', + yearfirst=True, + dayfirst=True ), name="col", ) From e9993bdbc4e6726507d57534872ef39cd67a5d55 Mon Sep 17 00:00:00 2001 From: Matt Richards Date: Tue, 23 May 2023 21:27:45 +1000 Subject: [PATCH 15/51] lint --- pyogrio/geopandas.py | 3 ++- pyogrio/tests/test_geopandas_io.py | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py index 276a7766..9182f67c 100644 --- a/pyogrio/geopandas.py +++ b/pyogrio/geopandas.py @@ -22,7 +22,8 @@ def _stringify_path(path): def _try_parse_datetime(ser): - import pandas as pd # only called in a block where pandas is known to be installed + import pandas as pd # only called in a block where pandas is known to be installed + try: return pd.to_datetime(ser, format="ISO8601") except ValueError: diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 037417fe..293a819e 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -153,9 +153,9 @@ def test_read_datetime_tz(test_datetime_tz, tmp_path): expected_dt_col = pd.Series( pd.to_datetime( ["2020-01-01T09:00:00.123-05:00", "2020-01-01T10:00:00-05:00"], - format='mixed', + format="mixed", yearfirst=True, - dayfirst=True + dayfirst=True, ), name="col", ) From b6ca5cf216af1b04097163d4f22e49e90563bcc8 Mon Sep 17 00:00:00 2001 From: Matt Richards Date: Tue, 23 May 2023 21:39:06 +1000 Subject: [PATCH 16/51] simple answer --- pyogrio/tests/test_geopandas_io.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 293a819e..397a85c6 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -1,7 +1,7 @@ import contextlib from datetime import datetime import os - +from packaging.version import Version import numpy as np import pytest @@ -150,12 +150,14 @@ def test_read_datetime_tz(test_datetime_tz, tmp_path): assert ( df.col.dtype.unit == "ns" ) # tz aware does not support ms resolution, even in pandas 2 + if Version(pd.__version__) >= Version("2.0.0"): + format_ = "ISO8601" + else: + format_ = None expected_dt_col = pd.Series( pd.to_datetime( ["2020-01-01T09:00:00.123-05:00", "2020-01-01T10:00:00-05:00"], - format="mixed", - yearfirst=True, - dayfirst=True, + format=format_, ), name="col", ) From 05cc1cf2e36ff34c271b13d73a3c6b4098ee203c Mon Sep 17 00:00:00 2001 From: Matt Richards Date: Thu, 25 May 2023 19:49:42 +1000 Subject: [PATCH 17/51] cleanup --- pyogrio/_io.pyx | 9 +++------ pyogrio/geopandas.py | 8 +++----- pyogrio/tests/test_raw_io.py | 8 ++------ 3 files changed, 8 insertions(+), 17 deletions(-) diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx index e49b0704..c59787c5 100644 --- a/pyogrio/_io.pyx +++ b/pyogrio/_io.pyx @@ -725,11 +725,10 @@ cdef get_features( field_data = [ np.empty(shape=(num_features, ), - - dtype=("object" if datetime_as_string and fields[field_index,3].startswith("datetime") else fields[field_index,3])) for field_index in range(n_fields) + dtype = ("object" if datetime_as_string and + fields[field_index,3].startswith("datetime") else fields[field_index,3]) + ) for field_index in range(n_fields) ] - types = [(fields[field_index,2],fields[field_index,3]) for field_index in range(n_fields)] - print("types are", types) field_data_view = [field_data[field_index][:] for field_index in range(n_fields)] i = 0 @@ -1797,7 +1796,6 @@ def ogr_write( else: # TODO check if 0=unknown or 1=localtime is most appropriate tz = tz_offsets.get(fields[field_idx], 1) - print(tz) datetime = field_value.item() OGR_F_SetFieldDateTimeEx( ogr_feature, @@ -1817,7 +1815,6 @@ def ogr_write( else: # TODO check if 0=unknown or 1=localtime is most appropriate tz = tz_offsets.get(fields[field_idx], 1) - print(tz, type(tz)) datetime = field_value.astype("datetime64[ms]").item() OGR_F_SetFieldDateTimeEx( ogr_feature, diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py index 9182f67c..96fc54da 100644 --- a/pyogrio/geopandas.py +++ b/pyogrio/geopandas.py @@ -210,8 +210,6 @@ def read_dataframe( return df meta, index, geometry, field_data = result - print("meta is") - print(meta) columns = meta["fields"].tolist() data = {columns[i]: field_data[i] for i in range(len(columns))} @@ -219,9 +217,7 @@ def read_dataframe( index = pd.Index(index, name="fid") else: index = None - print(data) df = pd.DataFrame(data, columns=columns, index=index) - print(df) for dtype, c in zip(meta["dtypes"], df.columns): if dtype.startswith("datetime"): df[c] = _try_parse_datetime(df[c]) @@ -375,7 +371,9 @@ def write_dataframe( if len(ser) > 0: # Assume series has same offset for all rows (enforced by pandas) tz_offsets[name] = ( - int(ser.iloc[0].utcoffset() / datetime.timedelta(minutes=15)) + 100 + # gdal stores timezones in 15 minute increments offset from 100=GMT + int(ser.iloc[0].utcoffset() / datetime.timedelta(minutes=15)) + + 100 ) # Convert to naive datetime, add offset back per above col = ser.dt.tz_localize(None).values diff --git a/pyogrio/tests/test_raw_io.py b/pyogrio/tests/test_raw_io.py index 2d57da90..19d9d049 100644 --- a/pyogrio/tests/test_raw_io.py +++ b/pyogrio/tests/test_raw_io.py @@ -260,7 +260,6 @@ def test_read_fids_unsupported_keywords(naturalearth_lowres): def test_write(tmpdir, naturalearth_lowres): meta, _, geometry, field_data = read(naturalearth_lowres) - meta.pop("dtypes") filename = os.path.join(str(tmpdir), "test.shp") write(filename, geometry, field_data, **meta) @@ -464,7 +463,6 @@ def assert_equal_result(result1, result2): def test_read_from_bytes(tmpdir, naturalearth_lowres, driver, ext): meta, index, geometry, field_data = read(naturalearth_lowres) meta.update({"geometry_type": "Unknown"}) - meta.pop("dtypes") filename = os.path.join(str(tmpdir), f"test.{ext}") write(filename, geometry, field_data, driver=driver, **meta) @@ -491,7 +489,6 @@ def test_read_from_bytes_zipped(tmpdir, naturalearth_lowres_vsi): def test_read_from_file_like(tmpdir, naturalearth_lowres, driver, ext): meta, index, geometry, field_data = read(naturalearth_lowres) meta.update({"geometry_type": "Unknown"}) - meta.pop("dtypes") filename = os.path.join(str(tmpdir), f"test.{ext}") write(filename, geometry, field_data, driver=driver, **meta) @@ -619,9 +616,8 @@ def test_read_datetime_millisecond(test_datetime): def test_read_datetime_tz(test_datetime_tz): field = read(test_datetime_tz)[3][0] assert field.dtype == "datetime64[ms]" - assert field[0] == np.datetime64( - "2020-01-01 09:00:00.123" - ) # timezone is ignored TODO should this be fixed as well? + # timezone is ignored in numpy layer + assert field[0] == np.datetime64("2020-01-01 09:00:00.123") assert field[1] == np.datetime64("2020-01-01 10:00:00.000") field = read(test_datetime_tz, datetime_as_string=True)[3][0] assert field.dtype == "object" From a78a76cecbbfea20785c3247fd118463f30038b5 Mon Sep 17 00:00:00 2001 From: Matt Richards Date: Sat, 3 Jun 2023 13:46:51 +1000 Subject: [PATCH 18/51] wip, use strings to make multi timezones round trip --- pyogrio/_io.pyx | 14 +++++++++----- pyogrio/geopandas.py | 29 +++++++++++++++------------- pyogrio/raw.py | 8 ++++---- pyogrio/tests/test_geopandas_io.py | 31 +++++++++++++++++++++++++----- 4 files changed, 55 insertions(+), 27 deletions(-) diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx index c59787c5..a3e2783a 100644 --- a/pyogrio/_io.pyx +++ b/pyogrio/_io.pyx @@ -1487,7 +1487,8 @@ cdef infer_field_types(list dtypes): def ogr_write( str path, str layer, str driver, geometry, fields, field_data, field_mask, str crs, str geometry_type, str encoding, object dataset_kwargs, - object layer_kwargs, tz_offsets, bint promote_to_multi=False, bint nan_as_null=True, + object layer_kwargs, #tz_offsets, + bint promote_to_multi=False, bint nan_as_null=True, bint append=False, dataset_metadata=None, layer_metadata=None ): cdef const char *path_c = NULL @@ -1795,8 +1796,9 @@ def ogr_write( OGR_F_SetFieldNull(ogr_feature, field_idx) else: # TODO check if 0=unknown or 1=localtime is most appropriate - tz = tz_offsets.get(fields[field_idx], 1) + # tz = tz_offsets.get(fields[field_idx], 1) datetime = field_value.item() + # print("writing date", tz, datetime) OGR_F_SetFieldDateTimeEx( ogr_feature, field_idx, @@ -1806,7 +1808,7 @@ def ogr_write( 0, 0, 0.0, - tz + 0 ) elif field_type == OFTDateTime: @@ -1814,8 +1816,10 @@ def ogr_write( OGR_F_SetFieldNull(ogr_feature, field_idx) else: # TODO check if 0=unknown or 1=localtime is most appropriate - tz = tz_offsets.get(fields[field_idx], 1) + # tz = tz_offsets.get(fields[field_idx], 1) datetime = field_value.astype("datetime64[ms]").item() + # print(f"writing date=({tz}), dt=({datetime})") + OGR_F_SetFieldDateTimeEx( ogr_feature, field_idx, @@ -1825,7 +1829,7 @@ def ogr_write( datetime.hour, datetime.minute, datetime.second + datetime.microsecond / 10**6, - tz + 0 ) else: diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py index 96fc54da..86eccba1 100644 --- a/pyogrio/geopandas.py +++ b/pyogrio/geopandas.py @@ -1,4 +1,3 @@ -import datetime import warnings import numpy as np from pyogrio.raw import DRIVERS_NO_MIXED_SINGLE_MULTI, DRIVERS_NO_MIXED_DIMENSIONS @@ -24,6 +23,8 @@ def _stringify_path(path): def _try_parse_datetime(ser): import pandas as pd # only called in a block where pandas is known to be installed + print("got series", ser, ser.to_dict()) + try: return pd.to_datetime(ser, format="ISO8601") except ValueError: @@ -34,7 +35,7 @@ def _try_parse_datetime(ser): pass # TODO surely a better way of doing this - for format in [ + for fmt in [ "%y/%m/%d %H:%M:%S", "%y/%m/%d %H:%M:%S.%f", "%y/%m/%d %H:%M:%S.%f%z", @@ -43,10 +44,11 @@ def _try_parse_datetime(ser): "%y/%m/%dT%H:%M:%S.%f%z", ]: try: - return pd.to_datetime(ser, format=format) + return pd.to_datetime(ser, format=fmt) except ValueError: pass warnings.warn(f"Datetime parsing failed for column {ser.name!r}") + print("---------------failed parsing") return ser @@ -363,21 +365,22 @@ def write_dataframe( # TODO: may need to fill in pd.NA, etc field_data = [] field_mask = [] - tz_offsets = {} # gdal format timezone offsets by column + # dtypes = df[fields].dtypes().to_dict() + # tz_offsets = {} # gdal format timezone offsets by column for name in fields: ser = df[name] col = ser.values if isinstance(ser.dtype, pd.DatetimeTZDtype): if len(ser) > 0: # Assume series has same offset for all rows (enforced by pandas) - tz_offsets[name] = ( - # gdal stores timezones in 15 minute increments offset from 100=GMT - int(ser.iloc[0].utcoffset() / datetime.timedelta(minutes=15)) - + 100 - ) - # Convert to naive datetime, add offset back per above - col = ser.dt.tz_localize(None).values - + # tz_offsets[name] = ( + # # gdal stores tz in 15 minute increments offset from 100=GMT + # int(ser.iloc[0].utcoffset() / datetime.timedelta(minutes=15)) + # + 100 + # ) + # Convert to naive datetime, offset passed through via dtypes + # col = ser.dt.tz_localize(None).values + col = ser.astype(str) # edge case of length zero ignored, don't have a reference date if isinstance(col, pd.api.extensions.ExtensionArray): @@ -479,6 +482,6 @@ def write_dataframe( metadata=metadata, dataset_options=dataset_options, layer_options=layer_options, - tz_offsets=tz_offsets, + # tz_offsets=tz_offsets, **kwargs, ) diff --git a/pyogrio/raw.py b/pyogrio/raw.py index 1228f672..0a9c746b 100644 --- a/pyogrio/raw.py +++ b/pyogrio/raw.py @@ -391,7 +391,7 @@ def write( metadata=None, dataset_options=None, layer_options=None, - tz_offsets=None, + # tz_offsets=None, **kwargs, ): kwargs.pop("dtypes", None) @@ -441,8 +441,8 @@ def write( "projection information defined and may not be usable in other " "systems." ) - if tz_offsets is None: - tz_offsets = {} + # if tz_offsets is None: + # tz_offsets = {} # preprocess kwargs and split in dataset and layer creation options dataset_kwargs = _preprocess_options_key_value(dataset_options or {}) @@ -481,5 +481,5 @@ def write( layer_metadata=layer_metadata, dataset_kwargs=dataset_kwargs, layer_kwargs=layer_kwargs, - tz_offsets=tz_offsets, + # tz_offsets=tz_offsets, ) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 397a85c6..fae710bc 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -22,6 +22,7 @@ assert_index_equal, assert_series_equal, ) + import pytz import geopandas as gp from geopandas.array import from_wkt @@ -163,16 +164,36 @@ def test_read_datetime_tz(test_datetime_tz, tmp_path): ) assert_series_equal(df.col, expected_dt_col) # test write and read round trips - fpath = ( - tmp_path / "test.geojson" - ) # TODO gpkg doesn't work here, at least for my local gdal, writes NaT + # TODO gpkg doesn't work here, at least for my local gdal, writes NaT + fpath = tmp_path / "test.geojson" write_dataframe(df, fpath) df_read = read_dataframe(fpath) - print("a", df_read.col) - print("b", expected_dt_col) assert_series_equal(df_read.col, expected_dt_col) +def test_write_datetime_mixed_offset(tmp_path): + # Summer Time (GMT+11), standard time (GMT+10) + dates = ["2023-01-01 11:00:01.111", "2023-06-01 10:00:01.111"] + tz = pytz.timezone("Australia/Sydney") + ser_naive = pd.Series(pd.to_datetime(dates), name="dates") + ser_localised = ser_naive.dt.tz_localize(tz) + df = gp.GeoDataFrame( + {"dates": ser_localised, "geometry": [Point(1, 1), Point(1, 1)]} + ) + write_dataframe(df, "foo.geojson") + df_no_tz = read_dataframe( + "foo.geojson", datetime_as_string=False + ) # TODO this shouldn't be called datetime as string in the pandas layer, + # should it even be accessible? + # datetime_as_string=False ignores tz info, returns datetime objects + expected = ser_naive.astype("datetime64[ms]") + assert_series_equal(expected, df_no_tz["dates"]) + # datetime_as_string=True keeps tz info, but pandas can't handle multiple offsets + # unless given a timezone to identify them with -> returned as strings + df_local = read_dataframe("foo.geojson", datetime_as_string=True) + assert_series_equal(ser_localised.astype("object"), df_local["dates"]) + + def test_read_null_values(test_fgdb_vsi): df = read_dataframe(test_fgdb_vsi, read_geometry=False) From b6816567f8edab6e63430151b2c586a7379622a8 Mon Sep 17 00:00:00 2001 From: Matt Richards Date: Sat, 3 Jun 2023 13:49:16 +1000 Subject: [PATCH 19/51] use tmp path fixture --- pyogrio/tests/test_geopandas_io.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index fae710bc..2996da5e 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -180,9 +180,10 @@ def test_write_datetime_mixed_offset(tmp_path): df = gp.GeoDataFrame( {"dates": ser_localised, "geometry": [Point(1, 1), Point(1, 1)]} ) - write_dataframe(df, "foo.geojson") + fpath = tmp_path / "test.geojson" + write_dataframe(df, fpath) df_no_tz = read_dataframe( - "foo.geojson", datetime_as_string=False + fpath, datetime_as_string=False ) # TODO this shouldn't be called datetime as string in the pandas layer, # should it even be accessible? # datetime_as_string=False ignores tz info, returns datetime objects @@ -190,7 +191,7 @@ def test_write_datetime_mixed_offset(tmp_path): assert_series_equal(expected, df_no_tz["dates"]) # datetime_as_string=True keeps tz info, but pandas can't handle multiple offsets # unless given a timezone to identify them with -> returned as strings - df_local = read_dataframe("foo.geojson", datetime_as_string=True) + df_local = read_dataframe(fpath, datetime_as_string=True) assert_series_equal(ser_localised.astype("object"), df_local["dates"]) From 3426fdc8b66a9c067b921595a4911d41482f447e Mon Sep 17 00:00:00 2001 From: Matt Richards Date: Sat, 3 Jun 2023 15:36:23 +1000 Subject: [PATCH 20/51] cleanups --- pyogrio/_io.pyx | 15 ++++----------- pyogrio/geopandas.py | 21 ++------------------- pyogrio/raw.py | 6 +----- pyogrio/tests/conftest.py | 1 - 4 files changed, 7 insertions(+), 36 deletions(-) diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx index a3e2783a..7d6c578c 100644 --- a/pyogrio/_io.pyx +++ b/pyogrio/_io.pyx @@ -823,8 +823,9 @@ cdef get_features_by_fid( field_ogr_types = fields[:,1] field_data = [ np.empty(shape=(count, ), - - dtype=("object" if datetime_as_string and fields[field_index,3].startswith("datetime") else fields[field_index,3])) for field_index in range(n_fields) + dtype=("object" if datetime_as_string and fields[field_index,3].startswith("datetime") + else fields[field_index,3])) + for field_index in range(n_fields) ] field_data_view = [field_data[field_index][:] for field_index in range(n_fields)] @@ -1487,8 +1488,7 @@ cdef infer_field_types(list dtypes): def ogr_write( str path, str layer, str driver, geometry, fields, field_data, field_mask, str crs, str geometry_type, str encoding, object dataset_kwargs, - object layer_kwargs, #tz_offsets, - bint promote_to_multi=False, bint nan_as_null=True, + object layer_kwargs, bint promote_to_multi=False, bint nan_as_null=True, bint append=False, dataset_metadata=None, layer_metadata=None ): cdef const char *path_c = NULL @@ -1795,10 +1795,7 @@ def ogr_write( if np.isnat(field_value): OGR_F_SetFieldNull(ogr_feature, field_idx) else: - # TODO check if 0=unknown or 1=localtime is most appropriate - # tz = tz_offsets.get(fields[field_idx], 1) datetime = field_value.item() - # print("writing date", tz, datetime) OGR_F_SetFieldDateTimeEx( ogr_feature, field_idx, @@ -1815,11 +1812,7 @@ def ogr_write( if np.isnat(field_value): OGR_F_SetFieldNull(ogr_feature, field_idx) else: - # TODO check if 0=unknown or 1=localtime is most appropriate - # tz = tz_offsets.get(fields[field_idx], 1) datetime = field_value.astype("datetime64[ms]").item() - # print(f"writing date=({tz}), dt=({datetime})") - OGR_F_SetFieldDateTimeEx( ogr_feature, field_idx, diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py index 86eccba1..7b8b3f5f 100644 --- a/pyogrio/geopandas.py +++ b/pyogrio/geopandas.py @@ -23,8 +23,6 @@ def _stringify_path(path): def _try_parse_datetime(ser): import pandas as pd # only called in a block where pandas is known to be installed - print("got series", ser, ser.to_dict()) - try: return pd.to_datetime(ser, format="ISO8601") except ValueError: @@ -48,7 +46,6 @@ def _try_parse_datetime(ser): except ValueError: pass warnings.warn(f"Datetime parsing failed for column {ser.name!r}") - print("---------------failed parsing") return ser @@ -181,7 +178,6 @@ def read_dataframe( read_func = read_arrow if use_arrow else read if not use_arrow and "datetime_as_string" not in kwargs: kwargs["datetime_as_string"] = True - # TODO work out what to do with the arrow api result = read_func( path_or_buffer, layer=layer, @@ -201,7 +197,6 @@ def read_dataframe( ) if use_arrow: - # TODO what happens to datetimes with arrow? meta, table = result df = table.to_pandas() geometry_name = meta["geometry_name"] or "wkb_geometry" @@ -365,23 +360,12 @@ def write_dataframe( # TODO: may need to fill in pd.NA, etc field_data = [] field_mask = [] - # dtypes = df[fields].dtypes().to_dict() - # tz_offsets = {} # gdal format timezone offsets by column for name in fields: ser = df[name] col = ser.values if isinstance(ser.dtype, pd.DatetimeTZDtype): - if len(ser) > 0: - # Assume series has same offset for all rows (enforced by pandas) - # tz_offsets[name] = ( - # # gdal stores tz in 15 minute increments offset from 100=GMT - # int(ser.iloc[0].utcoffset() / datetime.timedelta(minutes=15)) - # + 100 - # ) - # Convert to naive datetime, offset passed through via dtypes - # col = ser.dt.tz_localize(None).values - col = ser.astype(str) - # edge case of length zero ignored, don't have a reference date + # Deal with datetimes with timezones as strings + col = ser.astype(str) if isinstance(col, pd.api.extensions.ExtensionArray): from pandas.arrays import IntegerArray, FloatingArray, BooleanArray @@ -482,6 +466,5 @@ def write_dataframe( metadata=metadata, dataset_options=dataset_options, layer_options=layer_options, - # tz_offsets=tz_offsets, **kwargs, ) diff --git a/pyogrio/raw.py b/pyogrio/raw.py index 0a9c746b..1d9eade8 100644 --- a/pyogrio/raw.py +++ b/pyogrio/raw.py @@ -111,7 +111,7 @@ def read( If True, will return the FIDs of the feature that were read. datetime_as_string : bool, optional (default: False) If True, will return datetime dtypes as detected by GDAL as a string - array, instead of a datetime64 array. + array, instead of a datetime64 array (used to extract timezone info). **kwargs Additional driver-specific dataset open options passed to OGR. Invalid @@ -391,7 +391,6 @@ def write( metadata=None, dataset_options=None, layer_options=None, - # tz_offsets=None, **kwargs, ): kwargs.pop("dtypes", None) @@ -441,8 +440,6 @@ def write( "projection information defined and may not be usable in other " "systems." ) - # if tz_offsets is None: - # tz_offsets = {} # preprocess kwargs and split in dataset and layer creation options dataset_kwargs = _preprocess_options_key_value(dataset_options or {}) @@ -481,5 +478,4 @@ def write( layer_metadata=layer_metadata, dataset_kwargs=dataset_kwargs, layer_kwargs=layer_kwargs, - # tz_offsets=tz_offsets, ) diff --git a/pyogrio/tests/conftest.py b/pyogrio/tests/conftest.py index ef47fc96..8583e6d7 100644 --- a/pyogrio/tests/conftest.py +++ b/pyogrio/tests/conftest.py @@ -43,7 +43,6 @@ def prepare_testfile(testfile_path, dst_dir, ext): # For .gpkg, spatial_index=False to avoid the rows being reordered meta["spatial_index"] = False meta["geometry_type"] = "MultiPolygon" - meta.pop("dtypes") write(dst_path, geometry, field_data, **meta) return dst_path From bb6fd4ecb27de81a860d54b4ddc7ccb94991f4a7 Mon Sep 17 00:00:00 2001 From: Matt Richards Date: Sat, 3 Jun 2023 15:43:02 +1000 Subject: [PATCH 21/51] try cleanup datetime parsing --- pyogrio/geopandas.py | 26 +++----------------------- 1 file changed, 3 insertions(+), 23 deletions(-) diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py index 7b8b3f5f..e453d914 100644 --- a/pyogrio/geopandas.py +++ b/pyogrio/geopandas.py @@ -1,8 +1,8 @@ -import warnings import numpy as np from pyogrio.raw import DRIVERS_NO_MIXED_SINGLE_MULTI, DRIVERS_NO_MIXED_DIMENSIONS from pyogrio.raw import detect_driver, read, read_arrow, write from pyogrio.errors import DataSourceError +from packaging.version import Version def _stringify_path(path): @@ -23,30 +23,10 @@ def _stringify_path(path): def _try_parse_datetime(ser): import pandas as pd # only called in a block where pandas is known to be installed - try: + if Version(pd.__version__) >= Version("2.0.0"): return pd.to_datetime(ser, format="ISO8601") - except ValueError: - pass - try: + else: return pd.to_datetime(ser, yearfirst=True) - except ValueError: - pass - - # TODO surely a better way of doing this - for fmt in [ - "%y/%m/%d %H:%M:%S", - "%y/%m/%d %H:%M:%S.%f", - "%y/%m/%d %H:%M:%S.%f%z", - "%y/%m/%dT%H:%M:%S", - "%y/%m/%dT%H:%M:%S.%f", - "%y/%m/%dT%H:%M:%S.%f%z", - ]: - try: - return pd.to_datetime(ser, format=fmt) - except ValueError: - pass - warnings.warn(f"Datetime parsing failed for column {ser.name!r}") - return ser def read_dataframe( From 87419ac4ca01f4be24350d08709921b90ae0a116 Mon Sep 17 00:00:00 2001 From: Matt Richards Date: Sat, 3 Jun 2023 16:04:59 +1000 Subject: [PATCH 22/51] more cleanup, realise we can get dt resolution --- pyogrio/_io.pyx | 6 +++--- pyogrio/geopandas.py | 5 ++++- pyogrio/tests/test_geopandas_io.py | 18 +++++++----------- 3 files changed, 14 insertions(+), 15 deletions(-) diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx index 7d6c578c..91a518ef 100644 --- a/pyogrio/_io.pyx +++ b/pyogrio/_io.pyx @@ -658,10 +658,10 @@ cdef process_fields( data[i] = bin_value[:ret_length] elif field_type == OFTDateTime or field_type == OFTDate: - # defer datetime parsing to pandas layer + if datetime_as_string: - value = get_string(OGR_F_GetFieldAsString(ogr_feature, field_index), encoding=encoding) - data[i] = value + # defer datetime parsing to user/ pandas layer + data[i] = get_string(OGR_F_GetFieldAsString(ogr_feature, field_index), encoding=encoding) else: success = OGR_F_GetFieldAsDateTimeEx( ogr_feature, field_index, &year, &month, &day, &hour, &minute, &fsecond, &timezone) diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py index e453d914..38cb0ba9 100644 --- a/pyogrio/geopandas.py +++ b/pyogrio/geopandas.py @@ -24,7 +24,10 @@ def _try_parse_datetime(ser): import pandas as pd # only called in a block where pandas is known to be installed if Version(pd.__version__) >= Version("2.0.0"): - return pd.to_datetime(ser, format="ISO8601") + res = pd.to_datetime(ser, format="ISO8601") + if res.dtype != "object": + res = res.dt.as_unit("ms") + return res else: return pd.to_datetime(ser, yearfirst=True) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 2996da5e..3213bad1 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -136,21 +136,17 @@ def test_read_layer_invalid(naturalearth_lowres_all_ext): @pytest.mark.filterwarnings("ignore: Measured") def test_read_datetime(test_fgdb_vsi): df = read_dataframe(test_fgdb_vsi, layer="test_lines", max_features=1) - # if Version(pd.__version__) >= Version("2.0.0"): - # # starting with pandas 2.0, it preserves the passed datetime resolution - # assert df.SURVEY_DAT.dtype.name == "datetime64[ms]" - # else: - # assert df.SURVEY_DAT.dtype.name == "datetime64[ns]" - # String reading breaks this dtype preservation - assert df.SURVEY_DAT.dtype.name == "datetime64[ns]" + if Version(pd.__version__) >= Version("2.0.0"): + # starting with pandas 2.0, it preserves the passed datetime resolution + assert df.SURVEY_DAT.dtype.name == "datetime64[ms]" + else: + assert df.SURVEY_DAT.dtype.name == "datetime64[ns]" def test_read_datetime_tz(test_datetime_tz, tmp_path): df = read_dataframe(test_datetime_tz) - assert ( - df.col.dtype.unit == "ns" - ) # tz aware does not support ms resolution, even in pandas 2 + assert df.col.dtype.unit == "ms" if Version(pd.__version__) >= Version("2.0.0"): format_ = "ISO8601" else: @@ -161,7 +157,7 @@ def test_read_datetime_tz(test_datetime_tz, tmp_path): format=format_, ), name="col", - ) + ).dt.as_unit("ms") assert_series_equal(df.col, expected_dt_col) # test write and read round trips # TODO gpkg doesn't work here, at least for my local gdal, writes NaT From fc78bd93df6d72e8b63d10d64e7e73e2a7d03a31 Mon Sep 17 00:00:00 2001 From: Matt Richards Date: Sat, 3 Jun 2023 16:21:07 +1000 Subject: [PATCH 23/51] more careful pandas 1.5 compat --- pyogrio/tests/test_geopandas_io.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 3213bad1..1b9f467e 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -147,24 +147,22 @@ def test_read_datetime_tz(test_datetime_tz, tmp_path): df = read_dataframe(test_datetime_tz) assert df.col.dtype.unit == "ms" + raw_expected = ["2020-01-01T09:00:00.123-05:00", "2020-01-01T10:00:00-05:00"] + if Version(pd.__version__) >= Version("2.0.0"): - format_ = "ISO8601" + + expected = pd.to_datetime(raw_expected, format="ISO8601").as_unit("ms") else: - format_ = None - expected_dt_col = pd.Series( - pd.to_datetime( - ["2020-01-01T09:00:00.123-05:00", "2020-01-01T10:00:00-05:00"], - format=format_, - ), - name="col", - ).dt.as_unit("ms") - assert_series_equal(df.col, expected_dt_col) + expected = pd.to_datetime(raw_expected) + expected = pd.Series(expected, name="col") + + assert_series_equal(df.col, expected) # test write and read round trips # TODO gpkg doesn't work here, at least for my local gdal, writes NaT fpath = tmp_path / "test.geojson" write_dataframe(df, fpath) df_read = read_dataframe(fpath) - assert_series_equal(df_read.col, expected_dt_col) + assert_series_equal(df_read.col, expected) def test_write_datetime_mixed_offset(tmp_path): From 5fab3480c531fa771aae62843787f28f9fc2cb27 Mon Sep 17 00:00:00 2001 From: Matt Richards Date: Sat, 3 Jun 2023 16:25:21 +1000 Subject: [PATCH 24/51] delete line --- pyogrio/tests/test_geopandas_io.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 1b9f467e..936a583e 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -145,12 +145,9 @@ def test_read_datetime(test_fgdb_vsi): def test_read_datetime_tz(test_datetime_tz, tmp_path): df = read_dataframe(test_datetime_tz) - - assert df.col.dtype.unit == "ms" raw_expected = ["2020-01-01T09:00:00.123-05:00", "2020-01-01T10:00:00-05:00"] if Version(pd.__version__) >= Version("2.0.0"): - expected = pd.to_datetime(raw_expected, format="ISO8601").as_unit("ms") else: expected = pd.to_datetime(raw_expected) From 26c403adfb6cca70421a34e0d9e488407b4decef Mon Sep 17 00:00:00 2001 From: Matt Richards Date: Tue, 8 Aug 2023 20:20:15 +1000 Subject: [PATCH 25/51] replace write support with working datetime object solution --- pyogrio/_io.pyx | 22 ++++++++++++++++++++-- pyogrio/geopandas.py | 17 ++++++++++++++--- pyogrio/raw.py | 2 ++ pyogrio/tests/test_geopandas_io.py | 16 ++++------------ 4 files changed, 40 insertions(+), 17 deletions(-) diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx index 91a518ef..39b5a7a1 100644 --- a/pyogrio/_io.pyx +++ b/pyogrio/_io.pyx @@ -1484,12 +1484,22 @@ cdef infer_field_types(list dtypes): return field_types +FIFTEEN_MINUTE_DELTA = datetime.timedelta(minutes=15) + +cdef int timezone_to_gdal_offset(tz_as_datetime): + """Convert to GDAL timezone offset representation. + + https://gdal.org/development/rfc/rfc56_millisecond_precision.html#core-changes + """ + return tz_as_datetime.utcoffset() / FIFTEEN_MINUTE_DELTA + 100 + # TODO: set geometry and field data as memory views? def ogr_write( str path, str layer, str driver, geometry, fields, field_data, field_mask, str crs, str geometry_type, str encoding, object dataset_kwargs, object layer_kwargs, bint promote_to_multi=False, bint nan_as_null=True, - bint append=False, dataset_metadata=None, layer_metadata=None + bint append=False, dataset_metadata=None, layer_metadata=None, + timezone_cols_metadata=None ): cdef const char *path_c = NULL cdef const char *layer_c = NULL @@ -1542,6 +1552,9 @@ def ogr_write( if not layer: layer = os.path.splitext(os.path.split(path)[1])[0] + if timezone_cols_metadata is None: + timezone_cols_metadata = {} + # if shapefile, GeoJSON, or FlatGeobuf, always delete first # for other types, check if we can create layers @@ -1813,6 +1826,11 @@ def ogr_write( OGR_F_SetFieldNull(ogr_feature, field_idx) else: datetime = field_value.astype("datetime64[ms]").item() + tz_array = timezone_cols_metadata.get(fields[field_idx], None) + if tz_array is None: + gdal_tz = 0 + else: + gdal_tz = timezone_to_gdal_offset(tz_array[i]) OGR_F_SetFieldDateTimeEx( ogr_feature, field_idx, @@ -1822,7 +1840,7 @@ def ogr_write( datetime.hour, datetime.minute, datetime.second + datetime.microsecond / 10**6, - 0 + gdal_tz ) else: diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py index 38cb0ba9..fa4b9eb0 100644 --- a/pyogrio/geopandas.py +++ b/pyogrio/geopandas.py @@ -343,13 +343,24 @@ def write_dataframe( # TODO: may need to fill in pd.NA, etc field_data = [] field_mask = [] + timezone_cols_metadata = ( + {} + ) # dict[str,np.array(datetime.datetime)] special case for dt-tz fields for name in fields: ser = df[name] col = ser.values if isinstance(ser.dtype, pd.DatetimeTZDtype): - # Deal with datetimes with timezones as strings - col = ser.astype(str) - + # Deal with datetimes with timezones by passing down timezone separately + # pass down naive datetime + col = ser.dt.tz_localize(None).values + # pandas only supports a single offset per column + # access via array since we want a numpy array not a series + # (only care about the utc offset, not actually the date) + # but ser.array.timetz won't have valid utc offset for pytz time zones + # (per https://docs.python.org/3/library/datetime.html#datetime.time.utcoffset) # noqa + timezone_cols_metadata[name] = ser.array.to_pydatetime() + else: + col = ser.values if isinstance(col, pd.api.extensions.ExtensionArray): from pandas.arrays import IntegerArray, FloatingArray, BooleanArray diff --git a/pyogrio/raw.py b/pyogrio/raw.py index 1d9eade8..9128243e 100644 --- a/pyogrio/raw.py +++ b/pyogrio/raw.py @@ -391,6 +391,7 @@ def write( metadata=None, dataset_options=None, layer_options=None, + timezone_cols_metadata=None, **kwargs, ): kwargs.pop("dtypes", None) @@ -478,4 +479,5 @@ def write( layer_metadata=layer_metadata, dataset_kwargs=dataset_kwargs, layer_kwargs=layer_kwargs, + timezone_cols_metadata=timezone_cols_metadata, ) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 936a583e..46b61abd 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -155,8 +155,7 @@ def test_read_datetime_tz(test_datetime_tz, tmp_path): assert_series_equal(df.col, expected) # test write and read round trips - # TODO gpkg doesn't work here, at least for my local gdal, writes NaT - fpath = tmp_path / "test.geojson" + fpath = tmp_path / "test.gpkg" write_dataframe(df, fpath) df_read = read_dataframe(fpath) assert_series_equal(df_read.col, expected) @@ -171,18 +170,11 @@ def test_write_datetime_mixed_offset(tmp_path): df = gp.GeoDataFrame( {"dates": ser_localised, "geometry": [Point(1, 1), Point(1, 1)]} ) - fpath = tmp_path / "test.geojson" + fpath = tmp_path / "test.gpkg" write_dataframe(df, fpath) - df_no_tz = read_dataframe( - fpath, datetime_as_string=False - ) # TODO this shouldn't be called datetime as string in the pandas layer, - # should it even be accessible? - # datetime_as_string=False ignores tz info, returns datetime objects - expected = ser_naive.astype("datetime64[ms]") - assert_series_equal(expected, df_no_tz["dates"]) - # datetime_as_string=True keeps tz info, but pandas can't handle multiple offsets - # unless given a timezone to identify them with -> returned as strings df_local = read_dataframe(fpath, datetime_as_string=True) + # GDAL tz only encodes offsets, not timezones so expect object dtype + # as pandas only supports datetimes for fixed offset / identified timezones assert_series_equal(ser_localised.astype("object"), df_local["dates"]) From ebdb71b39fc0db7b5f3987847526be217b03dad9 Mon Sep 17 00:00:00 2001 From: Matt Richards Date: Tue, 8 Aug 2023 22:17:42 +1000 Subject: [PATCH 26/51] fixes --- pyogrio/geopandas.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py index fa4b9eb0..d6224534 100644 --- a/pyogrio/geopandas.py +++ b/pyogrio/geopandas.py @@ -343,9 +343,8 @@ def write_dataframe( # TODO: may need to fill in pd.NA, etc field_data = [] field_mask = [] - timezone_cols_metadata = ( - {} - ) # dict[str,np.array(datetime.datetime)] special case for dt-tz fields + # dict[str, np.array(datetime.datetime)] special case for dt-tz fields + timezone_cols_metadata = {} for name in fields: ser = df[name] col = ser.values @@ -460,5 +459,6 @@ def write_dataframe( metadata=metadata, dataset_options=dataset_options, layer_options=layer_options, + timezone_cols_metadata=timezone_cols_metadata, **kwargs, ) From f46e716cdee60f050de71c652e8b06a243a748c8 Mon Sep 17 00:00:00 2001 From: Matt Richards Date: Tue, 8 Aug 2023 22:22:16 +1000 Subject: [PATCH 27/51] rewrite datetime reading to handle mixed offset to utc --- pyogrio/geopandas.py | 22 ++++++++++++++++------ pyogrio/tests/test_geopandas_io.py | 10 +++++++--- 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py index d6224534..a4dd9550 100644 --- a/pyogrio/geopandas.py +++ b/pyogrio/geopandas.py @@ -23,13 +23,23 @@ def _stringify_path(path): def _try_parse_datetime(ser): import pandas as pd # only called in a block where pandas is known to be installed - if Version(pd.__version__) >= Version("2.0.0"): - res = pd.to_datetime(ser, format="ISO8601") - if res.dtype != "object": - res = res.dt.as_unit("ms") - return res + pandas_ge_20 = Version(pd.__version__) >= Version("2.0.0") + + if pandas_ge_20: + datetime_kwargs = dict(format="ISO8601", errors="ignore") else: - return pd.to_datetime(ser, yearfirst=True) + datetime_kwargs = dict(yearfirst=True) + res = pd.to_datetime(ser, **datetime_kwargs) + # if object dtype, try parse as utc instead + if res.dtype == "object": + res = pd.to_datetime(ser, utc=True, **datetime_kwargs) + + if res.dtype != "object": + if pandas_ge_20: + res = res.dt.as_unit("ms") + else: + res = ser.dt.round(freq="ms") + return res def read_dataframe( diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 46b61abd..1f8f7e6f 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -167,15 +167,19 @@ def test_write_datetime_mixed_offset(tmp_path): tz = pytz.timezone("Australia/Sydney") ser_naive = pd.Series(pd.to_datetime(dates), name="dates") ser_localised = ser_naive.dt.tz_localize(tz) + ser_utc = ser_localised.dt.tz_convert(pytz.UTC) + if Version(pd.__version__) >= Version("2.0.0"): + ser_utc = ser_utc.dt.as_unit("ms") + df = gp.GeoDataFrame( {"dates": ser_localised, "geometry": [Point(1, 1), Point(1, 1)]} ) fpath = tmp_path / "test.gpkg" write_dataframe(df, fpath) df_local = read_dataframe(fpath, datetime_as_string=True) - # GDAL tz only encodes offsets, not timezones so expect object dtype - # as pandas only supports datetimes for fixed offset / identified timezones - assert_series_equal(ser_localised.astype("object"), df_local["dates"]) + # GDAL tz only encodes offsets, not timezones, for multiple offsets + # read as utc datetime as otherwise would be read as string + assert_series_equal(ser_utc, df_local["dates"]) def test_read_null_values(test_fgdb_vsi): From 44686f94dbfd752adb9997d7c0791d0405f9b22c Mon Sep 17 00:00:00 2001 From: Matt Richards Date: Tue, 8 Aug 2023 22:50:14 +1000 Subject: [PATCH 28/51] fix nat handling for datetime as string --- pyogrio/_io.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx index 39b5a7a1..da86806b 100644 --- a/pyogrio/_io.pyx +++ b/pyogrio/_io.pyx @@ -632,7 +632,7 @@ cdef process_fields( else: data[i] = np.nan - elif field_type in ( OFTDate, OFTDateTime): + elif field_type in ( OFTDate, OFTDateTime) and not datetime_as_string: data[i] = np.datetime64('NaT') else: From 6b946f5bc38d3f495c33e4cdcf14f7258d9bace1 Mon Sep 17 00:00:00 2001 From: Matt Richards Date: Tue, 8 Aug 2023 22:52:50 +1000 Subject: [PATCH 29/51] don't expose datetime_as_string in pandas layer --- pyogrio/geopandas.py | 2 +- pyogrio/tests/test_geopandas_io.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py index a4dd9550..d6d234fe 100644 --- a/pyogrio/geopandas.py +++ b/pyogrio/geopandas.py @@ -169,7 +169,7 @@ def read_dataframe( path_or_buffer = _stringify_path(path_or_buffer) read_func = read_arrow if use_arrow else read - if not use_arrow and "datetime_as_string" not in kwargs: + if not use_arrow: kwargs["datetime_as_string"] = True result = read_func( path_or_buffer, diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 1f8f7e6f..3a702e14 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -176,7 +176,7 @@ def test_write_datetime_mixed_offset(tmp_path): ) fpath = tmp_path / "test.gpkg" write_dataframe(df, fpath) - df_local = read_dataframe(fpath, datetime_as_string=True) + df_local = read_dataframe(fpath) # GDAL tz only encodes offsets, not timezones, for multiple offsets # read as utc datetime as otherwise would be read as string assert_series_equal(ser_utc, df_local["dates"]) From ec16ed3cad0a21e9c71a37d9b16fb90014e0a178 Mon Sep 17 00:00:00 2001 From: Matt Richards Date: Tue, 8 Aug 2023 23:31:54 +1000 Subject: [PATCH 30/51] incorrect variable in 1.5.3 compat --- pyogrio/geopandas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py index d6d234fe..2e4133c7 100644 --- a/pyogrio/geopandas.py +++ b/pyogrio/geopandas.py @@ -38,7 +38,7 @@ def _try_parse_datetime(ser): if pandas_ge_20: res = res.dt.as_unit("ms") else: - res = ser.dt.round(freq="ms") + res = res.dt.round(freq="ms") return res From da0639a495175865b53e6ba524cb1094cb11cc3e Mon Sep 17 00:00:00 2001 From: Matt Richards Date: Wed, 9 Aug 2023 20:14:07 +1000 Subject: [PATCH 31/51] CLN: tidy up pandas 2.0 compat --- pyogrio/geopandas.py | 17 ++++++++++++----- pyogrio/tests/test_geopandas_io.py | 9 ++++----- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py index 2e4133c7..65ae8437 100644 --- a/pyogrio/geopandas.py +++ b/pyogrio/geopandas.py @@ -5,6 +5,15 @@ from packaging.version import Version +try: + import pandas + + PANDAS_GE_20 = Version(pandas.__version__) >= Version("2.0.0") + +except ImportError: + PANDAS_GE_20 = None + + def _stringify_path(path): """ Convert path-like to a string if possible, pass-through other objects @@ -21,11 +30,9 @@ def _stringify_path(path): def _try_parse_datetime(ser): - import pandas as pd # only called in a block where pandas is known to be installed - - pandas_ge_20 = Version(pd.__version__) >= Version("2.0.0") + import pandas as pd # only called when pandas is known to be installed - if pandas_ge_20: + if PANDAS_GE_20: datetime_kwargs = dict(format="ISO8601", errors="ignore") else: datetime_kwargs = dict(yearfirst=True) @@ -35,7 +42,7 @@ def _try_parse_datetime(ser): res = pd.to_datetime(ser, utc=True, **datetime_kwargs) if res.dtype != "object": - if pandas_ge_20: + if PANDAS_GE_20: res = res.dt.as_unit("ms") else: res = res.dt.round(freq="ms") diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 3a702e14..b25905ea 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -1,13 +1,12 @@ import contextlib from datetime import datetime import os -from packaging.version import Version import numpy as np import pytest from pyogrio import list_layers, read_info, __gdal_version__, __gdal_geos_version__ from pyogrio.errors import DataLayerError, DataSourceError, FeatureError, GeometryError -from pyogrio.geopandas import read_dataframe, write_dataframe +from pyogrio.geopandas import read_dataframe, write_dataframe, PANDAS_GE_20 from pyogrio.raw import ( DRIVERS, DRIVERS_NO_MIXED_DIMENSIONS, @@ -136,7 +135,7 @@ def test_read_layer_invalid(naturalearth_lowres_all_ext): @pytest.mark.filterwarnings("ignore: Measured") def test_read_datetime(test_fgdb_vsi): df = read_dataframe(test_fgdb_vsi, layer="test_lines", max_features=1) - if Version(pd.__version__) >= Version("2.0.0"): + if PANDAS_GE_20: # starting with pandas 2.0, it preserves the passed datetime resolution assert df.SURVEY_DAT.dtype.name == "datetime64[ms]" else: @@ -147,7 +146,7 @@ def test_read_datetime_tz(test_datetime_tz, tmp_path): df = read_dataframe(test_datetime_tz) raw_expected = ["2020-01-01T09:00:00.123-05:00", "2020-01-01T10:00:00-05:00"] - if Version(pd.__version__) >= Version("2.0.0"): + if PANDAS_GE_20: expected = pd.to_datetime(raw_expected, format="ISO8601").as_unit("ms") else: expected = pd.to_datetime(raw_expected) @@ -168,7 +167,7 @@ def test_write_datetime_mixed_offset(tmp_path): ser_naive = pd.Series(pd.to_datetime(dates), name="dates") ser_localised = ser_naive.dt.tz_localize(tz) ser_utc = ser_localised.dt.tz_convert(pytz.UTC) - if Version(pd.__version__) >= Version("2.0.0"): + if PANDAS_GE_20: ser_utc = ser_utc.dt.as_unit("ms") df = gp.GeoDataFrame( From 85a67c240a38dfb0f453cc87f852ee11f94fa8db Mon Sep 17 00:00:00 2001 From: Matt Richards Date: Sun, 24 Sep 2023 22:40:59 +1000 Subject: [PATCH 32/51] suggested alternative implementation --- pyogrio/_io.pyx | 11 +---------- pyogrio/geopandas.py | 15 ++++++++------- 2 files changed, 9 insertions(+), 17 deletions(-) diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx index da86806b..341767fd 100644 --- a/pyogrio/_io.pyx +++ b/pyogrio/_io.pyx @@ -1484,15 +1484,6 @@ cdef infer_field_types(list dtypes): return field_types -FIFTEEN_MINUTE_DELTA = datetime.timedelta(minutes=15) - -cdef int timezone_to_gdal_offset(tz_as_datetime): - """Convert to GDAL timezone offset representation. - - https://gdal.org/development/rfc/rfc56_millisecond_precision.html#core-changes - """ - return tz_as_datetime.utcoffset() / FIFTEEN_MINUTE_DELTA + 100 - # TODO: set geometry and field data as memory views? def ogr_write( str path, str layer, str driver, geometry, fields, field_data, field_mask, @@ -1830,7 +1821,7 @@ def ogr_write( if tz_array is None: gdal_tz = 0 else: - gdal_tz = timezone_to_gdal_offset(tz_array[i]) + gdal_tz = tz_array[i] OGR_F_SetFieldDateTimeEx( ogr_feature, field_idx, diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py index 65ae8437..94e47989 100644 --- a/pyogrio/geopandas.py +++ b/pyogrio/geopandas.py @@ -368,13 +368,14 @@ def write_dataframe( if isinstance(ser.dtype, pd.DatetimeTZDtype): # Deal with datetimes with timezones by passing down timezone separately # pass down naive datetime - col = ser.dt.tz_localize(None).values - # pandas only supports a single offset per column - # access via array since we want a numpy array not a series - # (only care about the utc offset, not actually the date) - # but ser.array.timetz won't have valid utc offset for pytz time zones - # (per https://docs.python.org/3/library/datetime.html#datetime.time.utcoffset) # noqa - timezone_cols_metadata[name] = ser.array.to_pydatetime() + naive = ser.dt.tz_localize(None) + col = naive.values + # compute offset relative to UTC explicitly + tz_offset = naive - ser.dt.tz_convert("UTC").dt.tz_localize(None) + # Convert to GDAL timezone offset representation. + # https://gdal.org/development/rfc/rfc56_millisecond_precision.html#core-changes + gdal_offset_representation = tz_offset // pd.Timedelta("15m") + 100 + timezone_cols_metadata[name] = gdal_offset_representation else: col = ser.values if isinstance(col, pd.api.extensions.ExtensionArray): From d96d67e02b80eae90fa48b37ec2d00988ce20488 Mon Sep 17 00:00:00 2001 From: Matt Richards Date: Sun, 24 Sep 2023 22:47:35 +1000 Subject: [PATCH 33/51] code review suggestion --- pyogrio/tests/test_geopandas_io.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index b25905ea..d6e8518f 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -21,7 +21,6 @@ assert_index_equal, assert_series_equal, ) - import pytz import geopandas as gp from geopandas.array import from_wkt @@ -154,7 +153,7 @@ def test_read_datetime_tz(test_datetime_tz, tmp_path): assert_series_equal(df.col, expected) # test write and read round trips - fpath = tmp_path / "test.gpkg" + fpath = tmp_path / "test.geojson" write_dataframe(df, fpath) df_read = read_dataframe(fpath) assert_series_equal(df_read.col, expected) @@ -163,17 +162,16 @@ def test_read_datetime_tz(test_datetime_tz, tmp_path): def test_write_datetime_mixed_offset(tmp_path): # Summer Time (GMT+11), standard time (GMT+10) dates = ["2023-01-01 11:00:01.111", "2023-06-01 10:00:01.111"] - tz = pytz.timezone("Australia/Sydney") ser_naive = pd.Series(pd.to_datetime(dates), name="dates") - ser_localised = ser_naive.dt.tz_localize(tz) - ser_utc = ser_localised.dt.tz_convert(pytz.UTC) + ser_localised = ser_naive.dt.tz_localize("Australia/Sydney") + ser_utc = ser_localised.dt.tz_convert("UTC") if PANDAS_GE_20: ser_utc = ser_utc.dt.as_unit("ms") df = gp.GeoDataFrame( {"dates": ser_localised, "geometry": [Point(1, 1), Point(1, 1)]} ) - fpath = tmp_path / "test.gpkg" + fpath = tmp_path / "test.geojson" write_dataframe(df, fpath) df_local = read_dataframe(fpath) # GDAL tz only encodes offsets, not timezones, for multiple offsets From 3eb70dc19c5d8813253cea288a2c3c1596052e5f Mon Sep 17 00:00:00 2001 From: Matt Richards <45483497+m-richards@users.noreply.github.com> Date: Sun, 24 Sep 2023 22:50:15 +1000 Subject: [PATCH 34/51] Update pyogrio/tests/test_geopandas_io.py Co-authored-by: Joris Van den Bossche --- pyogrio/tests/test_geopandas_io.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index d6e8518f..770e76dd 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -173,10 +173,10 @@ def test_write_datetime_mixed_offset(tmp_path): ) fpath = tmp_path / "test.geojson" write_dataframe(df, fpath) - df_local = read_dataframe(fpath) + result = read_dataframe(fpath) # GDAL tz only encodes offsets, not timezones, for multiple offsets # read as utc datetime as otherwise would be read as string - assert_series_equal(ser_utc, df_local["dates"]) + assert_series_equal(result["date"], ser_utc) def test_read_null_values(test_fgdb_vsi): From 3df12c0f6a77f42b345a8b55827f8c8f9f7e28d4 Mon Sep 17 00:00:00 2001 From: Matt Richards Date: Thu, 28 Sep 2023 21:43:31 +1000 Subject: [PATCH 35/51] time tests and suggestions --- pyogrio/_io.pyx | 3 ++- .../tests/fixtures/test_datetime_tz.geojson | 1 + pyogrio/tests/test_geopandas_io.py | 27 +++++++++++++++---- 3 files changed, 25 insertions(+), 6 deletions(-) diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx index 4ce44a46..5ffb1f49 100644 --- a/pyogrio/_io.pyx +++ b/pyogrio/_io.pyx @@ -761,7 +761,8 @@ cdef process_fields( elif field_type == OFTDateTime or field_type == OFTDate: if datetime_as_string: - # defer datetime parsing to user/ pandas layer + # defer datetime parsing to user/ pandas layer + # Update to OGR_F_GetFieldAsISO8601DateTime when GDAL 3.7+ only data[i] = get_string(OGR_F_GetFieldAsString(ogr_feature, field_index), encoding=encoding) else: success = OGR_F_GetFieldAsDateTimeEx( diff --git a/pyogrio/tests/fixtures/test_datetime_tz.geojson b/pyogrio/tests/fixtures/test_datetime_tz.geojson index 2fb72759..078a61cb 100644 --- a/pyogrio/tests/fixtures/test_datetime_tz.geojson +++ b/pyogrio/tests/fixtures/test_datetime_tz.geojson @@ -1,5 +1,6 @@ { "type": "FeatureCollection", +"crs": { "type": "name", "properties": { "name": "urn:ogc:def:crs:OGC:1.3:CRS84" } }, "features": [ { "type": "Feature", "properties": { "col": "2020-01-01T09:00:00.123-05:00" }, "geometry": { "type": "Point", "coordinates": [ 1.0, 1.0 ] } }, { "type": "Feature", "properties": { "col": "2020-01-01T10:00:00-05:00" }, "geometry": { "type": "Point", "coordinates": [ 2.0, 2.0 ] } } diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 9d4c5f60..ae4f0780 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -188,10 +188,9 @@ def test_read_datetime_tz(test_datetime_tz, tmp_path): else: expected = pd.to_datetime(raw_expected) expected = pd.Series(expected, name="col") - assert_series_equal(df.col, expected) # test write and read round trips - fpath = tmp_path / "test.geojson" + fpath = tmp_path / "test.gpkg" write_dataframe(df, fpath) df_read = read_dataframe(fpath) assert_series_equal(df_read.col, expected) @@ -207,14 +206,32 @@ def test_write_datetime_mixed_offset(tmp_path): ser_utc = ser_utc.dt.as_unit("ms") df = gp.GeoDataFrame( - {"dates": ser_localised, "geometry": [Point(1, 1), Point(1, 1)]} + {"dates": ser_localised, "geometry": [Point(1, 1), Point(1, 1)]}, + crs="EPSG:4326", ) - fpath = tmp_path / "test.geojson" + fpath = tmp_path / "test.gpkg" write_dataframe(df, fpath) result = read_dataframe(fpath) # GDAL tz only encodes offsets, not timezones, for multiple offsets # read as utc datetime as otherwise would be read as string - assert_series_equal(result["date"], ser_utc) + assert_series_equal(result["dates"], ser_utc) + + +def test_read_write_datetime_tz_with_nulls(tmp_path): + dates_raw = ["2020-01-01T09:00:00.123-05:00", "2020-01-01T10:00:00-05:00", pd.NaT] + if PANDAS_GE_20: + dates = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms") + else: + dates = pd.to_datetime(dates_raw) + df = gp.GeoDataFrame( + {"dates": dates, "geometry": [Point(1, 1), Point(1, 1), Point(1, 1)]}, + crs="EPSG:4326", + ) + fpath = tmp_path / "test.geojson" + write_dataframe(df, fpath) + breakpoint() + result = read_dataframe(fpath) + assert_geodataframe_equal(df, result) def test_read_null_values(test_fgdb_vsi): From 8fd30a5ca45ef02d335f4202b2fc5cb56531b1b2 Mon Sep 17 00:00:00 2001 From: Matt Richards Date: Thu, 28 Sep 2023 21:46:05 +1000 Subject: [PATCH 36/51] remove breakpoint --- pyogrio/tests/test_geopandas_io.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index ae4f0780..1832483a 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -229,7 +229,6 @@ def test_read_write_datetime_tz_with_nulls(tmp_path): ) fpath = tmp_path / "test.geojson" write_dataframe(df, fpath) - breakpoint() result = read_dataframe(fpath) assert_geodataframe_equal(df, result) From 55293c0765b9259b80a12ef8bdef4527f95bf1bd Mon Sep 17 00:00:00 2001 From: Matt Richards Date: Sat, 30 Sep 2023 12:42:53 +1000 Subject: [PATCH 37/51] catch warning --- pyogrio/geopandas.py | 9 ++++++++- pyogrio/tests/test_geopandas_io.py | 10 +++++----- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py index 6c9cd219..c024cb0e 100644 --- a/pyogrio/geopandas.py +++ b/pyogrio/geopandas.py @@ -11,6 +11,7 @@ ) from pyogrio.errors import DataSourceError from packaging.version import Version +import warnings try: @@ -44,7 +45,13 @@ def _try_parse_datetime(ser): datetime_kwargs = dict(format="ISO8601", errors="ignore") else: datetime_kwargs = dict(yearfirst=True) - res = pd.to_datetime(ser, **datetime_kwargs) + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + ".*parsing datetimes with mixed time zones will raise.*", + FutureWarning, + ) + res = pd.to_datetime(ser, **datetime_kwargs) # if object dtype, try parse as utc instead if res.dtype == "object": res = pd.to_datetime(ser, utc=True, **datetime_kwargs) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 1832483a..bf44ce38 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -190,7 +190,7 @@ def test_read_datetime_tz(test_datetime_tz, tmp_path): expected = pd.Series(expected, name="col") assert_series_equal(df.col, expected) # test write and read round trips - fpath = tmp_path / "test.gpkg" + fpath = tmp_path / "test.geojson" write_dataframe(df, fpath) df_read = read_dataframe(fpath) assert_series_equal(df_read.col, expected) @@ -209,7 +209,7 @@ def test_write_datetime_mixed_offset(tmp_path): {"dates": ser_localised, "geometry": [Point(1, 1), Point(1, 1)]}, crs="EPSG:4326", ) - fpath = tmp_path / "test.gpkg" + fpath = tmp_path / "test.geojson" write_dataframe(df, fpath) result = read_dataframe(fpath) # GDAL tz only encodes offsets, not timezones, for multiple offsets @@ -650,7 +650,7 @@ def test_write_read_empty_dataframe_unsupported(tmp_path, ext): def test_write_dataframe_gpkg_multiple_layers(tmp_path, naturalearth_lowres): input_gdf = read_dataframe(naturalearth_lowres) - output_path = tmp_path / "test.gpkg" + output_path = tmp_path / "test.geojson" write_dataframe(input_gdf, output_path, layer="first", promote_to_multi=True) @@ -1201,7 +1201,7 @@ def test_metadata_io(tmpdir, naturalearth_lowres, metadata_type): df = read_dataframe(naturalearth_lowres) - filename = os.path.join(str(tmpdir), "test.gpkg") + filename = os.path.join(str(tmpdir), "test.geojson") write_dataframe(df, filename, **{metadata_type: metadata}) metadata_key = "layer_metadata" if metadata_type == "metadata" else metadata_type @@ -1220,7 +1220,7 @@ def test_metadata_io(tmpdir, naturalearth_lowres, metadata_type): ) def test_invalid_metadata(tmpdir, naturalearth_lowres, metadata_type, metadata): with pytest.raises(ValueError, match="must be a string"): - filename = os.path.join(str(tmpdir), "test.gpkg") + filename = os.path.join(str(tmpdir), "test.geojson") write_dataframe( read_dataframe(naturalearth_lowres), filename, **{metadata_type: metadata} ) From 8040c21a836e08f00f8a30cfb3c26c276584ce86 Mon Sep 17 00:00:00 2001 From: Matt Richards Date: Sat, 30 Sep 2023 13:25:37 +1000 Subject: [PATCH 38/51] really need to fix my local gdal --- pyogrio/tests/test_geopandas_io.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index bf44ce38..dc48e535 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -190,7 +190,7 @@ def test_read_datetime_tz(test_datetime_tz, tmp_path): expected = pd.Series(expected, name="col") assert_series_equal(df.col, expected) # test write and read round trips - fpath = tmp_path / "test.geojson" + fpath = tmp_path / "test.gpkg" write_dataframe(df, fpath) df_read = read_dataframe(fpath) assert_series_equal(df_read.col, expected) @@ -209,7 +209,7 @@ def test_write_datetime_mixed_offset(tmp_path): {"dates": ser_localised, "geometry": [Point(1, 1), Point(1, 1)]}, crs="EPSG:4326", ) - fpath = tmp_path / "test.geojson" + fpath = tmp_path / "test.gpkg" write_dataframe(df, fpath) result = read_dataframe(fpath) # GDAL tz only encodes offsets, not timezones, for multiple offsets @@ -227,7 +227,7 @@ def test_read_write_datetime_tz_with_nulls(tmp_path): {"dates": dates, "geometry": [Point(1, 1), Point(1, 1), Point(1, 1)]}, crs="EPSG:4326", ) - fpath = tmp_path / "test.geojson" + fpath = tmp_path / "test.gpkg" write_dataframe(df, fpath) result = read_dataframe(fpath) assert_geodataframe_equal(df, result) @@ -650,7 +650,7 @@ def test_write_read_empty_dataframe_unsupported(tmp_path, ext): def test_write_dataframe_gpkg_multiple_layers(tmp_path, naturalearth_lowres): input_gdf = read_dataframe(naturalearth_lowres) - output_path = tmp_path / "test.geojson" + output_path = tmp_path / "test.gpkg" write_dataframe(input_gdf, output_path, layer="first", promote_to_multi=True) @@ -706,7 +706,7 @@ def test_write_dataframe_gdal_options_unknown(tmp_path, naturalearth_lowres): df = read_dataframe(naturalearth_lowres) # geojson has no spatial index, so passing keyword should raise - outfilename = tmp_path / "test.geojson" + outfilename = tmp_path / "test.gpkg" with pytest.raises(ValueError, match="unrecognized option 'SPATIAL_INDEX'"): write_dataframe(df, outfilename, spatial_index=True) @@ -864,7 +864,7 @@ def test_write_dataframe_promote_to_multi_layer_geom_type_invalid( def test_write_dataframe_layer_geom_type_invalid(tmp_path, naturalearth_lowres): df = read_dataframe(naturalearth_lowres) - filename = tmp_path / "test.geojson" + filename = tmp_path / "test.gpkg" with pytest.raises( GeometryError, match="Geometry type is not supported: NotSupported" ): @@ -1201,7 +1201,7 @@ def test_metadata_io(tmpdir, naturalearth_lowres, metadata_type): df = read_dataframe(naturalearth_lowres) - filename = os.path.join(str(tmpdir), "test.geojson") + filename = os.path.join(str(tmpdir), "test.gpkg") write_dataframe(df, filename, **{metadata_type: metadata}) metadata_key = "layer_metadata" if metadata_type == "metadata" else metadata_type @@ -1220,7 +1220,7 @@ def test_metadata_io(tmpdir, naturalearth_lowres, metadata_type): ) def test_invalid_metadata(tmpdir, naturalearth_lowres, metadata_type, metadata): with pytest.raises(ValueError, match="must be a string"): - filename = os.path.join(str(tmpdir), "test.geojson") + filename = os.path.join(str(tmpdir), "test.gpkg") write_dataframe( read_dataframe(naturalearth_lowres), filename, **{metadata_type: metadata} ) @@ -1230,7 +1230,7 @@ def test_invalid_metadata(tmpdir, naturalearth_lowres, metadata_type, metadata): def test_metadata_unsupported(tmpdir, naturalearth_lowres, metadata_type): """metadata is silently ignored""" - filename = os.path.join(str(tmpdir), "test.geojson") + filename = os.path.join(str(tmpdir), "test.gpkg") write_dataframe( read_dataframe(naturalearth_lowres), filename, From fccc8fb680ac4d7f3349bee30cf2c6a4ebe9a30c Mon Sep 17 00:00:00 2001 From: Matt Richards Date: Sat, 30 Sep 2023 18:19:18 +1000 Subject: [PATCH 39/51] fix fix --- pyogrio/tests/test_geopandas_io.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index dc48e535..aeca6563 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -706,7 +706,7 @@ def test_write_dataframe_gdal_options_unknown(tmp_path, naturalearth_lowres): df = read_dataframe(naturalearth_lowres) # geojson has no spatial index, so passing keyword should raise - outfilename = tmp_path / "test.gpkg" + outfilename = tmp_path / "test.geosjon" with pytest.raises(ValueError, match="unrecognized option 'SPATIAL_INDEX'"): write_dataframe(df, outfilename, spatial_index=True) @@ -864,7 +864,7 @@ def test_write_dataframe_promote_to_multi_layer_geom_type_invalid( def test_write_dataframe_layer_geom_type_invalid(tmp_path, naturalearth_lowres): df = read_dataframe(naturalearth_lowres) - filename = tmp_path / "test.gpkg" + filename = tmp_path / "test.geosjon" with pytest.raises( GeometryError, match="Geometry type is not supported: NotSupported" ): @@ -1230,7 +1230,7 @@ def test_invalid_metadata(tmpdir, naturalearth_lowres, metadata_type, metadata): def test_metadata_unsupported(tmpdir, naturalearth_lowres, metadata_type): """metadata is silently ignored""" - filename = os.path.join(str(tmpdir), "test.gpkg") + filename = os.path.join(str(tmpdir), "test.geojson") write_dataframe( read_dataframe(naturalearth_lowres), filename, From 200cc1d74f573b22dadb637cdd2496ef97f1c442 Mon Sep 17 00:00:00 2001 From: Matt Richards <45483497+m-richards@users.noreply.github.com> Date: Sat, 30 Sep 2023 19:48:41 +1000 Subject: [PATCH 40/51] Apply suggestions from code review Co-authored-by: Joris Van den Bossche --- pyogrio/tests/test_geopandas_io.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index aeca6563..96ec5354 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -706,7 +706,7 @@ def test_write_dataframe_gdal_options_unknown(tmp_path, naturalearth_lowres): df = read_dataframe(naturalearth_lowres) # geojson has no spatial index, so passing keyword should raise - outfilename = tmp_path / "test.geosjon" + outfilename = tmp_path / "test.geojson" with pytest.raises(ValueError, match="unrecognized option 'SPATIAL_INDEX'"): write_dataframe(df, outfilename, spatial_index=True) @@ -864,7 +864,7 @@ def test_write_dataframe_promote_to_multi_layer_geom_type_invalid( def test_write_dataframe_layer_geom_type_invalid(tmp_path, naturalearth_lowres): df = read_dataframe(naturalearth_lowres) - filename = tmp_path / "test.geosjon" + filename = tmp_path / "test.geojson" with pytest.raises( GeometryError, match="Geometry type is not supported: NotSupported" ): From ebfc01ce6fcf9475c418be35283bcaedbaaa17c6 Mon Sep 17 00:00:00 2001 From: Matt Richards Date: Sat, 30 Sep 2023 20:17:41 +1000 Subject: [PATCH 41/51] add suggested exception handling --- pyogrio/geopandas.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py index c024cb0e..20f09ff3 100644 --- a/pyogrio/geopandas.py +++ b/pyogrio/geopandas.py @@ -51,7 +51,12 @@ def _try_parse_datetime(ser): ".*parsing datetimes with mixed time zones will raise.*", FutureWarning, ) - res = pd.to_datetime(ser, **datetime_kwargs) + # pre-emptive try catch for when pandas will raise + # (can tighten the exception type in future when it does) + try: + res = pd.to_datetime(ser, **datetime_kwargs) + except Exception: + pass # if object dtype, try parse as utc instead if res.dtype == "object": res = pd.to_datetime(ser, utc=True, **datetime_kwargs) From c8c186a6e25a8856e531dd9311d3cc32fbdae430 Mon Sep 17 00:00:00 2001 From: Matt Richards Date: Sat, 7 Oct 2023 15:38:07 +1100 Subject: [PATCH 42/51] move pandas compat to _compat --- pyogrio/_compat.py | 7 +++++++ pyogrio/geopandas.py | 12 +----------- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/pyogrio/_compat.py b/pyogrio/_compat.py index 596abeb0..9e932f3f 100644 --- a/pyogrio/_compat.py +++ b/pyogrio/_compat.py @@ -18,11 +18,18 @@ except ImportError: geopandas = None +try: + import pandas +except ImportError: + pandas = None + HAS_ARROW_API = __gdal_version__ >= (3, 6, 0) and pyarrow is not None HAS_GEOPANDAS = geopandas is not None +PANDAS_GE_20 = pandas is not None and Version(pandas.__version__) >= Version("2.0.0") + HAS_GDAL_GEOS = __gdal_geos_version__ is not None HAS_SHAPELY = shapely is not None and Version(shapely.__version__) >= Version("2.0.0") diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py index 20f09ff3..2f9f24d4 100644 --- a/pyogrio/geopandas.py +++ b/pyogrio/geopandas.py @@ -1,6 +1,6 @@ import numpy as np -from pyogrio._compat import HAS_GEOPANDAS +from pyogrio._compat import HAS_GEOPANDAS, PANDAS_GE_20 from pyogrio.raw import ( DRIVERS_NO_MIXED_SINGLE_MULTI, DRIVERS_NO_MIXED_DIMENSIONS, @@ -10,19 +10,9 @@ write, ) from pyogrio.errors import DataSourceError -from packaging.version import Version import warnings -try: - import pandas - - PANDAS_GE_20 = Version(pandas.__version__) >= Version("2.0.0") - -except ImportError: - PANDAS_GE_20 = None - - def _stringify_path(path): """ Convert path-like to a string if possible, pass-through other objects From 95030c0aa927cd1bfac29176f6a2e9423048a9ed Mon Sep 17 00:00:00 2001 From: Matt Richards Date: Sat, 7 Oct 2023 16:17:27 +1100 Subject: [PATCH 43/51] address review comments --- pyogrio/geopandas.py | 43 +++++++++++-------- pyogrio/raw.py | 6 ++- .../tests/fixtures/test_datetime_tz.geojson | 4 +- pyogrio/tests/test_geopandas_io.py | 24 +++++------ pyogrio/tests/test_raw_io.py | 2 +- 5 files changed, 46 insertions(+), 33 deletions(-) diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py index 2f9f24d4..45ee5240 100644 --- a/pyogrio/geopandas.py +++ b/pyogrio/geopandas.py @@ -52,6 +52,9 @@ def _try_parse_datetime(ser): res = pd.to_datetime(ser, utc=True, **datetime_kwargs) if res.dtype != "object": + # GDAL only supports ms precision, convert outputs to match. + # Pandas 2.0 supports datetime[ms] directly, prior versions only support [ns], + # Instead, round the values to [ms] precision. if PANDAS_GE_20: res = res.dt.as_unit("ms") else: @@ -202,6 +205,9 @@ def read_dataframe( read_func = read_arrow if use_arrow else read if not use_arrow: + # For arrow, datetimes are read as is. + # For numpy IO, datetimes are read as string values to preserve timezone info + # as numpy does not directly support timezones. kwargs["datetime_as_string"] = True result = read_func( path_or_buffer, @@ -392,34 +398,37 @@ def write_dataframe( field_data = [] field_mask = [] # dict[str, np.array(datetime.datetime)] special case for dt-tz fields - timezone_cols_metadata = {} + gdal_tz_offsets = {} for name in fields: - ser = df[name] - col = ser.values - if isinstance(ser.dtype, pd.DatetimeTZDtype): + col = df[name] + values = col.values + if isinstance(col.dtype, pd.DatetimeTZDtype): # Deal with datetimes with timezones by passing down timezone separately # pass down naive datetime - naive = ser.dt.tz_localize(None) - col = naive.values + naive = col.dt.tz_localize(None) + values = naive.values # compute offset relative to UTC explicitly - tz_offset = naive - ser.dt.tz_convert("UTC").dt.tz_localize(None) + tz_offset = naive - col.dt.tz_convert("UTC").dt.tz_localize(None) # Convert to GDAL timezone offset representation. + # GMT is represented as 100 and offsets are represented by adding / + # subtracting 1 for every 15 minutes different from GMT. # https://gdal.org/development/rfc/rfc56_millisecond_precision.html#core-changes + # Convert each row offset to a signed multiple of 15m and add to GMT value gdal_offset_representation = tz_offset // pd.Timedelta("15m") + 100 - timezone_cols_metadata[name] = gdal_offset_representation + gdal_tz_offsets[name] = gdal_offset_representation else: - col = ser.values - if isinstance(col, pd.api.extensions.ExtensionArray): + values = col.values + if isinstance(values, pd.api.extensions.ExtensionArray): from pandas.arrays import IntegerArray, FloatingArray, BooleanArray - if isinstance(col, (IntegerArray, FloatingArray, BooleanArray)): - field_data.append(col._data) - field_mask.append(col._mask) + if isinstance(values, (IntegerArray, FloatingArray, BooleanArray)): + field_data.append(values._data) + field_mask.append(values._mask) else: - field_data.append(np.asarray(col)) - field_mask.append(np.asarray(col.isna())) + field_data.append(np.asarray(values)) + field_mask.append(np.asarray(values.isna())) else: - field_data.append(col) + field_data.append(values) field_mask.append(None) # Determine geometry_type and/or promote_to_multi @@ -514,6 +523,6 @@ def write_dataframe( metadata=metadata, dataset_options=dataset_options, layer_options=layer_options, - timezone_cols_metadata=timezone_cols_metadata, + timezone_cols_metadata=gdal_tz_offsets, **kwargs, ) diff --git a/pyogrio/raw.py b/pyogrio/raw.py index 7b0b0cb5..bad2d28e 100644 --- a/pyogrio/raw.py +++ b/pyogrio/raw.py @@ -129,7 +129,8 @@ def read( If True, will return the FIDs of the feature that were read. datetime_as_string : bool, optional (default: False) If True, will return datetime dtypes as detected by GDAL as a string - array, instead of a datetime64 array (used to extract timezone info). + array (which can be used to extract timezone info), instead of + a datetime64 array. **kwargs Additional driver-specific dataset open options passed to OGR. Invalid @@ -148,6 +149,7 @@ def read( Meta is: { "crs": "", "fields": , + "dtypes": "encoding": "", "geometry_type": "" } @@ -386,6 +388,8 @@ def write( timezone_cols_metadata=None, **kwargs, ): + # if dtypes is given, remove it from kwargs (dtypes is included in meta returned by + # read, and it is convenient to pass meta directly into write for round trip tests) kwargs.pop("dtypes", None) path = vsi_path(str(path)) diff --git a/pyogrio/tests/fixtures/test_datetime_tz.geojson b/pyogrio/tests/fixtures/test_datetime_tz.geojson index 078a61cb..e6b39206 100644 --- a/pyogrio/tests/fixtures/test_datetime_tz.geojson +++ b/pyogrio/tests/fixtures/test_datetime_tz.geojson @@ -2,7 +2,7 @@ "type": "FeatureCollection", "crs": { "type": "name", "properties": { "name": "urn:ogc:def:crs:OGC:1.3:CRS84" } }, "features": [ -{ "type": "Feature", "properties": { "col": "2020-01-01T09:00:00.123-05:00" }, "geometry": { "type": "Point", "coordinates": [ 1.0, 1.0 ] } }, -{ "type": "Feature", "properties": { "col": "2020-01-01T10:00:00-05:00" }, "geometry": { "type": "Point", "coordinates": [ 2.0, 2.0 ] } } +{ "type": "Feature", "properties": { "datetime_col": "2020-01-01T09:00:00.123-05:00" }, "geometry": { "type": "Point", "coordinates": [ 1.0, 1.0 ] } }, +{ "type": "Feature", "properties": { "datetime_col": "2020-01-01T10:00:00-05:00" }, "geometry": { "type": "Point", "coordinates": [ 2.0, 2.0 ] } } ] } diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 96ec5354..3f6cf718 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -187,34 +187,34 @@ def test_read_datetime_tz(test_datetime_tz, tmp_path): expected = pd.to_datetime(raw_expected, format="ISO8601").as_unit("ms") else: expected = pd.to_datetime(raw_expected) - expected = pd.Series(expected, name="col") - assert_series_equal(df.col, expected) + expected = pd.Series(expected, name="datetime_col") + assert_series_equal(df.datetime_col, expected) # test write and read round trips fpath = tmp_path / "test.gpkg" write_dataframe(df, fpath) df_read = read_dataframe(fpath) - assert_series_equal(df_read.col, expected) + assert_series_equal(df_read.datetime_col, expected) def test_write_datetime_mixed_offset(tmp_path): - # Summer Time (GMT+11), standard time (GMT+10) + # Australian Summer Time AEDT (GMT+11), Standard Time AEST (GMT+10) dates = ["2023-01-01 11:00:01.111", "2023-06-01 10:00:01.111"] - ser_naive = pd.Series(pd.to_datetime(dates), name="dates") - ser_localised = ser_naive.dt.tz_localize("Australia/Sydney") - ser_utc = ser_localised.dt.tz_convert("UTC") + naive_col = pd.Series(pd.to_datetime(dates), name="dates") + localised_col = naive_col.dt.tz_localize("Australia/Sydney") + utc_col = localised_col.dt.tz_convert("UTC") if PANDAS_GE_20: - ser_utc = ser_utc.dt.as_unit("ms") + utc_col = utc_col.dt.as_unit("ms") df = gp.GeoDataFrame( - {"dates": ser_localised, "geometry": [Point(1, 1), Point(1, 1)]}, + {"dates": localised_col, "geometry": [Point(1, 1), Point(1, 1)]}, crs="EPSG:4326", ) fpath = tmp_path / "test.gpkg" write_dataframe(df, fpath) result = read_dataframe(fpath) - # GDAL tz only encodes offsets, not timezones, for multiple offsets - # read as utc datetime as otherwise would be read as string - assert_series_equal(result["dates"], ser_utc) + # GDAL tz only encodes offsets, not timezones + # check multiple offsets are read as utc datetime instead of string values + assert_series_equal(result["dates"], utc_col) def test_read_write_datetime_tz_with_nulls(tmp_path): diff --git a/pyogrio/tests/test_raw_io.py b/pyogrio/tests/test_raw_io.py index a53d9d8c..acbeebcf 100644 --- a/pyogrio/tests/test_raw_io.py +++ b/pyogrio/tests/test_raw_io.py @@ -775,7 +775,7 @@ def test_read_datetime_millisecond(test_datetime): assert field[1] == np.datetime64("2020-01-01 10:00:00.000") -def test_read_datetime_tz(test_datetime_tz): +def test_read_datetime_as_string(test_datetime_tz): field = read(test_datetime_tz)[3][0] assert field.dtype == "datetime64[ms]" # timezone is ignored in numpy layer From 086e52e4bad829207a55ac1e3ee4547626122e9c Mon Sep 17 00:00:00 2001 From: Matt Richards Date: Sat, 7 Oct 2023 16:49:29 +1100 Subject: [PATCH 44/51] update known issues --- docs/source/known_issues.md | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/docs/source/known_issues.md b/docs/source/known_issues.md index 3d84215e..1b0b463c 100644 --- a/docs/source/known_issues.md +++ b/docs/source/known_issues.md @@ -55,16 +55,23 @@ with obscure error messages. ## Support for reading and writing DateTimes GDAL only supports datetimes at a millisecond resolution. Reading data will thus -give at most millisecond resolution (`datetime64[ms]` data type), even though -the data is cast `datetime64[ns]` data type when reading into a data frame -using `pyogrio.read_dataframe()`. When writing, only precision up to ms is retained. +give at most millisecond resolution (`datetime64[ms]` data type). With pandas 2.0 +`pyogrio.read_dataframe()` will return datetime data as `datetime64[ms]` +correspondingly. For previous versions of pandas, `datetime64[ns]` is used as +ms precision was not supported. When writing, only precision up to +ms is retained. Not all file formats have dedicated support to store datetime data, like ESRI Shapefile. For such formats, or if you require precision > ms, a workaround is to convert the datetimes to string. -Timezone information is ignored at the moment, both when reading and when writing -datetime columns. +Timezone information is preserved where possible, however GDAL only represents +time zones as UTC offsets, whilst pandas uses IANA time zones (via `pytz` or +`zoneinfo`). This means that dataframes with columns containing multiple offsets +e.g. when switching from standard time to summer time will be written correctly, +but when read via `pyogrio.read_dataframe()` will be returned in UTC time, as +there is no way to reconstruct the original timezone from the individual offsets +present. ## Support for OpenStreetMap (OSM) data From 2b2dd5fba7a996762b569891b6d28e059b75300f Mon Sep 17 00:00:00 2001 From: Matt Richards Date: Sat, 7 Oct 2023 16:50:52 +1100 Subject: [PATCH 45/51] reword --- docs/source/known_issues.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/source/known_issues.md b/docs/source/known_issues.md index 1b0b463c..8f351f4e 100644 --- a/docs/source/known_issues.md +++ b/docs/source/known_issues.md @@ -68,10 +68,10 @@ convert the datetimes to string. Timezone information is preserved where possible, however GDAL only represents time zones as UTC offsets, whilst pandas uses IANA time zones (via `pytz` or `zoneinfo`). This means that dataframes with columns containing multiple offsets -e.g. when switching from standard time to summer time will be written correctly, -but when read via `pyogrio.read_dataframe()` will be returned in UTC time, as -there is no way to reconstruct the original timezone from the individual offsets -present. +(e.g. when switching from standard time to summer time) will be written correctly, +but when read via `pyogrio.read_dataframe()` will be returned as a UTC datetime +column, as there is no way to reconstruct the original timezone from the individual +offsets present. ## Support for OpenStreetMap (OSM) data From 2167d0fc6ceadc5c4f9aaba06df19276c5445052 Mon Sep 17 00:00:00 2001 From: Matt Richards Date: Tue, 17 Oct 2023 21:45:45 +1100 Subject: [PATCH 46/51] move documentation --- docs/source/introduction.md | 21 +++++++++++++++++++++ docs/source/known_issues.md | 21 --------------------- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/docs/source/introduction.md b/docs/source/introduction.md index 3fd0c324..e7ac8cf1 100644 --- a/docs/source/introduction.md +++ b/docs/source/introduction.md @@ -464,6 +464,27 @@ You can also read from a URL with this syntax: >>> read_dataframe("zip+https://s3.amazonaws.com/bucket/shapefile.zip") ``` +## Reading and writing DateTimes + +GDAL only supports datetimes at a millisecond resolution. Reading data will thus +give at most millisecond resolution (`datetime64[ms]` data type). With pandas 2.0 +`pyogrio.read_dataframe()` will return datetime data as `datetime64[ms]` +correspondingly. For previous versions of pandas, `datetime64[ns]` is used as +ms precision was not supported. When writing, only precision up to +ms is retained. + +Not all file formats have dedicated support to store datetime data, like ESRI +Shapefile. For such formats, or if you require precision > ms, a workaround is to +convert the datetimes to string. + +Timezone information is preserved where possible, however GDAL only represents +time zones as UTC offsets, whilst pandas uses IANA time zones (via `pytz` or +`zoneinfo`). This means that dataframes with columns containing multiple offsets +(e.g. when switching from standard time to summer time) will be written correctly, +but when read via `pyogrio.read_dataframe()` will be returned as a UTC datetime +column, as there is no way to reconstruct the original timezone from the individual +offsets present. + ## Dataset and layer creation options It is possible to use dataset and layer creation options available for a given diff --git a/docs/source/known_issues.md b/docs/source/known_issues.md index 8f351f4e..29d390ee 100644 --- a/docs/source/known_issues.md +++ b/docs/source/known_issues.md @@ -52,27 +52,6 @@ Pyogrio does not currently validate attribute values or geometry types before attempting to write to the output file. Invalid types may crash during writing with obscure error messages. -## Support for reading and writing DateTimes - -GDAL only supports datetimes at a millisecond resolution. Reading data will thus -give at most millisecond resolution (`datetime64[ms]` data type). With pandas 2.0 -`pyogrio.read_dataframe()` will return datetime data as `datetime64[ms]` -correspondingly. For previous versions of pandas, `datetime64[ns]` is used as -ms precision was not supported. When writing, only precision up to -ms is retained. - -Not all file formats have dedicated support to store datetime data, like ESRI -Shapefile. For such formats, or if you require precision > ms, a workaround is to -convert the datetimes to string. - -Timezone information is preserved where possible, however GDAL only represents -time zones as UTC offsets, whilst pandas uses IANA time zones (via `pytz` or -`zoneinfo`). This means that dataframes with columns containing multiple offsets -(e.g. when switching from standard time to summer time) will be written correctly, -but when read via `pyogrio.read_dataframe()` will be returned as a UTC datetime -column, as there is no way to reconstruct the original timezone from the individual -offsets present. - ## Support for OpenStreetMap (OSM) data OpenStreetMap data do not natively support calculating the feature count by data From ab0fbf6ec3ba4bceb141a4732910a6f6f09e0c16 Mon Sep 17 00:00:00 2001 From: Matt Richards Date: Tue, 17 Oct 2023 21:46:50 +1100 Subject: [PATCH 47/51] rename field as suggested --- pyogrio/_io.pyx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx index 747873cc..8376bda7 100644 --- a/pyogrio/_io.pyx +++ b/pyogrio/_io.pyx @@ -1665,7 +1665,7 @@ def ogr_write( str crs, str geometry_type, str encoding, object dataset_kwargs, object layer_kwargs, bint promote_to_multi=False, bint nan_as_null=True, bint append=False, dataset_metadata=None, layer_metadata=None, - timezone_cols_metadata=None + gdal_tz_offsets=None ): cdef const char *path_c = NULL cdef const char *layer_c = NULL @@ -1736,8 +1736,8 @@ def ogr_write( if not layer: layer = os.path.splitext(os.path.split(path)[1])[0] - if timezone_cols_metadata is None: - timezone_cols_metadata = {} + if gdal_tz_offsets is None: + gdal_tz_offsets = {} # if shapefile, GeoJSON, or FlatGeobuf, always delete first @@ -2012,7 +2012,7 @@ def ogr_write( OGR_F_SetFieldNull(ogr_feature, field_idx) else: datetime = field_value.astype("datetime64[ms]").item() - tz_array = timezone_cols_metadata.get(fields[field_idx], None) + tz_array = gdal_tz_offsets.get(fields[field_idx], None) if tz_array is None: gdal_tz = 0 else: From 0f02115ce02b8eb46e2b04830029e3a9dc220813 Mon Sep 17 00:00:00 2001 From: Matt Richards Date: Wed, 18 Oct 2023 08:54:14 +1100 Subject: [PATCH 48/51] final missing gdal tz offset change --- pyogrio/geopandas.py | 2 +- pyogrio/raw.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py index 9c94d099..fcab92ac 100644 --- a/pyogrio/geopandas.py +++ b/pyogrio/geopandas.py @@ -561,6 +561,6 @@ def write_dataframe( metadata=metadata, dataset_options=dataset_options, layer_options=layer_options, - timezone_cols_metadata=gdal_tz_offsets, + gdal_tz_offsets=gdal_tz_offsets, **kwargs, ) diff --git a/pyogrio/raw.py b/pyogrio/raw.py index 3aed010d..4384ee9a 100644 --- a/pyogrio/raw.py +++ b/pyogrio/raw.py @@ -456,7 +456,7 @@ def write( metadata=None, dataset_options=None, layer_options=None, - timezone_cols_metadata=None, + gdal_tz_offsets=None, **kwargs, ): # if dtypes is given, remove it from kwargs (dtypes is included in meta returned by @@ -545,5 +545,5 @@ def write( layer_metadata=layer_metadata, dataset_kwargs=dataset_kwargs, layer_kwargs=layer_kwargs, - timezone_cols_metadata=timezone_cols_metadata, + gdal_tz_offsets=gdal_tz_offsets, ) From 52a922d5827996822d8d7583395f502317693ca4 Mon Sep 17 00:00:00 2001 From: Matt Richards <45483497+m-richards@users.noreply.github.com> Date: Wed, 18 Oct 2023 08:55:19 +1100 Subject: [PATCH 49/51] Update pyogrio/tests/test_geopandas_io.py Co-authored-by: Joris Van den Bossche --- pyogrio/tests/test_geopandas_io.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 96981999..1c7ddfc5 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -174,8 +174,10 @@ def test_read_layer_invalid(naturalearth_lowres_all_ext, use_arrow): @pytest.mark.filterwarnings("ignore: Measured") -def test_read_datetime(test_fgdb_vsi): - df = read_dataframe(test_fgdb_vsi, layer="test_lines", max_features=1) +def test_read_datetime(test_fgdb_vsi, use_arrow): + df = read_dataframe( + test_fgdb_vsi, layer="test_lines", use_arrow=use_arrow, max_features=1 + ) if PANDAS_GE_20: # starting with pandas 2.0, it preserves the passed datetime resolution assert df.SURVEY_DAT.dtype.name == "datetime64[ms]" From 7c99e51f397cd3e2e210908eb8f0eda8db1a4fd5 Mon Sep 17 00:00:00 2001 From: Matt Richards <45483497+m-richards@users.noreply.github.com> Date: Wed, 18 Oct 2023 08:59:17 +1100 Subject: [PATCH 50/51] Apply suggestions from code review Co-authored-by: Joris Van den Bossche --- pyogrio/geopandas.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py index fcab92ac..a0f30f3a 100644 --- a/pyogrio/geopandas.py +++ b/pyogrio/geopandas.py @@ -435,11 +435,10 @@ def write_dataframe( # TODO: may need to fill in pd.NA, etc field_data = [] field_mask = [] - # dict[str, np.array(datetime.datetime)] special case for dt-tz fields + # dict[str, np.array(int)] special case for dt-tz fields gdal_tz_offsets = {} for name in fields: col = df[name] - values = col.values if isinstance(col.dtype, pd.DatetimeTZDtype): # Deal with datetimes with timezones by passing down timezone separately # pass down naive datetime From a5f5f9db089f34b6bc3549fac48b1af4184eaf8e Mon Sep 17 00:00:00 2001 From: Brendan Ward Date: Thu, 19 Oct 2023 19:56:12 -0700 Subject: [PATCH 51/51] add changelog entry --- CHANGES.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGES.md b/CHANGES.md index 664172d8..5b5c5caf 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -4,6 +4,7 @@ ### Improvements +- Support reading and writing datetimes with timezones (#253). - Support writing dataframes without geometry column (#267). - Calculate feature count by iterating over features if GDAL returns an unknown count for a data layer (e.g., OSM driver); this may have signficant