From a43c8ed29f903d2ba7010c56daaa6c609019a772 Mon Sep 17 00:00:00 2001
From: Izer Onadim
Date: Fri, 8 Sep 2023 23:09:32 +0100
Subject: [PATCH 01/41] Avoid pandas 2.1.0 due to timestamp bug

Pandas 2.1.0 has a DataFrame constructor bug causing timestamps to have
inconsistent units (https://github.com/pandas-dev/pandas/issues/55014).
---
 environment.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/environment.yml b/environment.yml
index ad41e42d..beaef34a 100644
--- a/environment.yml
+++ b/environment.yml
@@ -9,7 +9,7 @@ dependencies:
   # Currently dask and numpy==1.16.0 clash
   # TODO: add support for numpy>=1.23
   - numpy!=1.15.0,!=1.16.0
-  - pandas>=0.23.0,!=1.0.0
+  - pandas>=0.23.0,!=1.0.0,!=2.1.0
   - pyarrow>=0.17.1,!=1.0.0
   - simplejson
   - minimalkv>=1.4.2
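A short background sketch (illustrative only, not part of the patch series):
pandas 2 keeps non-nanosecond datetime64 resolutions instead of silently
coercing everything to nanoseconds the way pandas 1.x did, which is what makes
the unit handling in the following patches necessary. Assuming pandas>=2.0 and
numpy are installed:

    import numpy as np
    import pandas as pd

    # pandas 1.x coerced this to datetime64[ns]; pandas 2 keeps the [s] unit.
    s = pd.Series(np.array(["2023-09-08"], dtype="datetime64[s]"))
    print(s.dtype)  # datetime64[s] under pandas 2

Code that assumes nanosecond timestamps while data is held at another
resolution is the source of the inconsistent-unit problems addressed below.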
""" - df = schema.internal().empty_table().to_pandas(date_as_object=date_as_object) + df = ( + schema.internal() + .empty_table() + .to_pandas( + date_as_object=date_as_object, + coerce_temporal_nanoseconds=coerce_temporal_nanoseconds, + ) + ) df.columns = df.columns.map(ensure_string_type) if columns is not None: diff --git a/plateau/core/index.py b/plateau/core/index.py index 6a8631fc..3cfaa759 100644 --- a/plateau/core/index.py +++ b/plateau/core/index.py @@ -136,11 +136,18 @@ def __repr__(self) -> str: class_=type(self).__name__, attrs=", ".join(repr_str) ) - def observed_values(self, date_as_object=True) -> np.ndarray: + def observed_values( + self, date_as_object=True, coerce_temporal_nanoseconds=True + ) -> np.ndarray: """Return an array of all observed values.""" keys = np.array(list(self.index_dct.keys())) labeled_array = pa.array(keys, type=self.dtype) - return np.array(labeled_array.to_pandas(date_as_object=date_as_object)) + return np.array( + labeled_array.to_pandas( + date_as_object=date_as_object, + coerce_temporal_nanoseconds=coerce_temporal_nanoseconds, + ) + ) @staticmethod def normalize_value(dtype: pa.DataType, value: Any) -> Any: @@ -476,7 +483,9 @@ def as_flat_series( table = _index_dct_to_table( self.index_dct, column=self.column, dtype=self.dtype ) - df = table.to_pandas(date_as_object=date_as_object) + df = table.to_pandas( + date_as_object=date_as_object, coerce_temporal_nanoseconds=True + ) if predicates is not None: # If there is a conjunction without any reference to the index @@ -862,7 +871,7 @@ def _parquet_bytes_to_dict(column: str, index_buffer: bytes): if column_type == pa.timestamp("us"): column_type = pa.timestamp("ns") - df = table.to_pandas() + df = table.to_pandas(coerce_temporal_nanoseconds=True) index_dct = dict( zip(df[column].values, (list(x) for x in df[_PARTITION_COLUMN_NAME].values)) diff --git a/plateau/serialization/_csv.py b/plateau/serialization/_csv.py index 5560d4b9..0cab490e 100644 --- a/plateau/serialization/_csv.py +++ b/plateau/serialization/_csv.py @@ -85,7 +85,7 @@ def restore_dataframe( def store(self, store, key_prefix, df): if isinstance(df, pa.Table): - df = df.to_pandas() + df = df.to_pandas(coerce_temporal_nanoseconds=True) key = f"{key_prefix}.csv" result_stream = BytesIO() iostream: BufferedIOBase diff --git a/plateau/serialization/_parquet.py b/plateau/serialization/_parquet.py index 953571e3..c557a381 100644 --- a/plateau/serialization/_parquet.py +++ b/plateau/serialization/_parquet.py @@ -259,7 +259,10 @@ def _restore_dataframe( df = ( parquet_file.schema.to_arrow_schema() .empty_table() - .to_pandas(date_as_object=date_as_object) + .to_pandas( + date_as_object=date_as_object, + coerce_temporal_nanoseconds=True, + ) ) index = pd.Index( pd.RangeIndex(start=0, stop=parquet_file.metadata.num_rows), @@ -284,7 +287,9 @@ def _restore_dataframe( table = _reset_dictionary_columns(table, exclude=categories) - df = table.to_pandas(date_as_object=date_as_object) + df = table.to_pandas( + date_as_object=date_as_object, coerce_temporal_nanoseconds=True + ) # XXX: Patch until Pyarrow bug is resolved: https://issues.apache.org/jira/browse/ARROW-18099?filter=-2 if categories: From fa15a47ed828304f66951a26e5db53d784aba37a Mon Sep 17 00:00:00 2001 From: Izer Onadim Date: Mon, 18 Sep 2023 13:10:21 +0100 Subject: [PATCH 03/41] Prevent dask from converting objects to strings --- plateau/io/dask/compression.py | 11 +- plateau/io/dask/dataframe.py | 149 +++++++++++--------- tests/io/dask/dataframe/test_compression.py | 11 +- 
From fa15a47ed828304f66951a26e5db53d784aba37a Mon Sep 17 00:00:00 2001
From: Izer Onadim
Date: Mon, 18 Sep 2023 13:10:21 +0100
Subject: [PATCH 03/41] Prevent dask from converting objects to strings

---
 plateau/io/dask/compression.py              |  11 +-
 plateau/io/dask/dataframe.py                | 149 +++++++++++---------
 tests/io/dask/dataframe/test_compression.py |  11 +-
 tests/io/dask/dataframe/test_update.py      |  15 +-
 4 files changed, 102 insertions(+), 84 deletions(-)

diff --git a/plateau/io/dask/compression.py b/plateau/io/dask/compression.py
index e179ea41..eab81b05 100644
--- a/plateau/io/dask/compression.py
+++ b/plateau/io/dask/compression.py
@@ -2,6 +2,7 @@
 from functools import partial
 from typing import List, Union
 
+import dask
 import dask.dataframe as dd
 import pandas as pd
 
@@ -109,7 +110,8 @@ def pack_payload(df: dd.DataFrame, group_key: Union[List[str], str]) -> dd.DataF
 
     _pack_payload = partial(pack_payload_pandas, group_key=group_key)
 
-    return df.map_partitions(_pack_payload, meta=packed_meta)
+    with dask.config.set({"dataframe.convert-string": False}):
+        return df.map_partitions(_pack_payload, meta=packed_meta)
 
 
 def unpack_payload_pandas(
@@ -154,6 +156,7 @@ def unpack_payload(df: dd.DataFrame, unpack_meta: pd.DataFrame) -> dd.DataFrame:
         )
         return df
 
-    return df.map_partitions(
-        unpack_payload_pandas, unpack_meta=unpack_meta, meta=unpack_meta
-    )
+    with dask.config.set({"dataframe.convert-string": False}):
+        return df.map_partitions(
+            unpack_payload_pandas, unpack_meta=unpack_meta, meta=unpack_meta
+        )
diff --git a/plateau/io/dask/dataframe.py b/plateau/io/dask/dataframe.py
index 36fda4be..e0aea8ef 100644
--- a/plateau/io/dask/dataframe.py
+++ b/plateau/io/dask/dataframe.py
@@ -150,18 +150,20 @@ def read_dataset_as_ddf(
         divisions.sort()
         divisions_lst = list(divisions)
         divisions_lst.append(divisions[-1])
-    ddf = from_map(
-        ReadPlateauPartition(columns=columns),
-        mps,
-        meta=meta,
-        label="read-plateau",
-        divisions=divisions_lst,
-        store=ds_factory.store_factory,
-        categoricals=categoricals,
-        predicate_pushdown_to_io=predicate_pushdown_to_io,
-        dates_as_object=dates_as_object,
-        predicates=predicates,
-    )
+
+    with dask.config.set({"dataframe.convert-string": False}):
+        ddf = from_map(
+            ReadPlateauPartition(columns=columns),
+            mps,
+            meta=meta,
+            label="read-plateau",
+            divisions=divisions_lst,
+            store=ds_factory.store_factory,
+            categoricals=categoricals,
+            predicate_pushdown_to_io=predicate_pushdown_to_io,
+            dates_as_object=dates_as_object,
+            predicates=predicates,
+        )
     if dask_index_on:
         return ddf.set_index(dask_index_on, divisions=divisions_lst, sorted=True)
     else:
@@ -329,21 +331,24 @@ def store_dataset_from_ddf(
     if not overwrite:
         raise_if_dataset_exists(dataset_uuid=dataset_uuid, store=store)
 
-    mp_ser = _write_dataframe_partitions(
-        ddf=ddf,
-        store=ds_factory.store_factory,
-        dataset_uuid=dataset_uuid,
-        table=table,
-        secondary_indices=secondary_indices,
-        shuffle=shuffle,
-        repartition_ratio=repartition_ratio,
-        num_buckets=num_buckets,
-        sort_partitions_by=sort_partitions_by,
-        df_serializer=df_serializer,
-        metadata_version=metadata_version,
-        partition_on=partition_on,
-        bucket_by=bucket_by,
-    )
+
+    with dask.config.set({"dataframe.convert-string": False}):
+        mp_ser = _write_dataframe_partitions(
+            ddf=ddf,
+            store=ds_factory.store_factory,
+            dataset_uuid=dataset_uuid,
+            table=table,
+            secondary_indices=secondary_indices,
+            shuffle=shuffle,
+            repartition_ratio=repartition_ratio,
+            num_buckets=num_buckets,
+            sort_partitions_by=sort_partitions_by,
+            df_serializer=df_serializer,
+            metadata_version=metadata_version,
+            partition_on=partition_on,
+            bucket_by=bucket_by,
+        )
+
     return mp_ser.reduction(
         chunk=_id,
         aggregate=_commit_store_from_reduction,
@@ -471,21 +476,22 @@ def update_dataset_from_ddf(
     inferred_indices = _ensure_compatible_indices(ds_factory, secondary_indices)
     del secondary_indices
 
-    mp_ser = _write_dataframe_partitions(
-        ddf=ddf,
-        store=ds_factory.store_factory if ds_factory else store,
-        dataset_uuid=dataset_uuid or ds_factory.dataset_uuid,
-        table=table,
-        secondary_indices=inferred_indices,
-        shuffle=shuffle,
-        repartition_ratio=repartition_ratio,
-        num_buckets=num_buckets,
-        sort_partitions_by=sort_partitions_by,
-        df_serializer=df_serializer,
-        metadata_version=metadata_version,
-        partition_on=cast(List[str], partition_on),
-        bucket_by=bucket_by,
-    )
+    with dask.config.set({"dataframe.convert-string": False}):
+        mp_ser = _write_dataframe_partitions(
+            ddf=ddf,
+            store=ds_factory.store_factory if ds_factory else store,
+            dataset_uuid=dataset_uuid or ds_factory.dataset_uuid,
+            table=table,
+            secondary_indices=inferred_indices,
+            shuffle=shuffle,
+            repartition_ratio=repartition_ratio,
+            num_buckets=num_buckets,
+            sort_partitions_by=sort_partitions_by,
+            df_serializer=df_serializer,
+            metadata_version=metadata_version,
+            partition_on=cast(List[str], partition_on),
+            bucket_by=bucket_by,
+        )
 
     return mp_ser.reduction(
         chunk=_id,
@@ -567,24 +573,26 @@ def collect_dataset_metadata(
     mps = list(
         dispatch_metapartitions_from_factory(dataset_factory, predicates=predicates)
     )
-    if mps:
-        random.shuffle(mps)
-        # ensure that even with sampling at least one metapartition is returned
-        cutoff_index = max(1, int(len(mps) * frac))
-        mps = mps[:cutoff_index]
-        ddf = dd.from_delayed(
-            [
-                dask.delayed(MetaPartition.get_parquet_metadata)(
-                    mp, store=dataset_factory.store_factory
-                )
-                for mp in mps
-            ],
-            meta=_METADATA_SCHEMA,
-        )
-    else:
-        df = pd.DataFrame(columns=_METADATA_SCHEMA.keys())
-        df = df.astype(_METADATA_SCHEMA)
-        ddf = dd.from_pandas(df, npartitions=1)
+
+    with dask.config.set({"dataframe.convert-string": False}):
+        if mps:
+            random.shuffle(mps)
+            # ensure that even with sampling at least one metapartition is returned
+            cutoff_index = max(1, int(len(mps) * frac))
+            mps = mps[:cutoff_index]
+            ddf = dd.from_delayed(
+                [
+                    dask.delayed(MetaPartition.get_parquet_metadata)(
+                        mp, store=dataset_factory.store_factory
+                    )
+                    for mp in mps
+                ],
+                meta=_METADATA_SCHEMA,
+            )
+        else:
+            df = pd.DataFrame(columns=_METADATA_SCHEMA.keys())
+            df = df.astype(_METADATA_SCHEMA)
+            ddf = dd.from_pandas(df, npartitions=1)
 
     return ddf
@@ -651,12 +659,15 @@ def hash_dataset(
         columns=columns,
         dates_as_object=True,
     )
-    if not group_key:
-        return ddf.map_partitions(_hash_partition, meta="uint64").astype("uint64")
-    else:
-        ddf2 = pack_payload(ddf, group_key=group_key)
-        return (
-            ddf2.groupby(group_key)
-            .apply(_unpack_hash, unpack_meta=ddf._meta, subset=subset, meta="uint64")
-            .astype("uint64")
-        )
+    with dask.config.set({"dataframe.convert-string": False}):
+        if not group_key:
+            return ddf.map_partitions(_hash_partition, meta="uint64").astype("uint64")
+        else:
+            ddf2 = pack_payload(ddf, group_key=group_key)
+            return (
+                ddf2.groupby(group_key)
+                .apply(
+                    _unpack_hash, unpack_meta=ddf._meta, subset=subset, meta="uint64"
+                )
+                .astype("uint64")
+            )
diff --git a/tests/io/dask/dataframe/test_compression.py b/tests/io/dask/dataframe/test_compression.py
index c18fcb98..2ca358cd 100644
--- a/tests/io/dask/dataframe/test_compression.py
+++ b/tests/io/dask/dataframe/test_compression.py
@@ -1,3 +1,4 @@
+import dask
 import dask.dataframe as dd
 import pandas as pd
 import pandas.testing as pdt
@@ -13,9 +14,10 @@ def test_pack_payload(df_all_types):
     # For a single row dataframe the packing actually has a few more bytes
-    df = dd.from_pandas(
-        pd.concat([df_all_types] * 10, ignore_index=True), npartitions=3
-    )
+    with dask.config.set({"dataframe.convert-string": False}):
+        df = dd.from_pandas(
+            pd.concat([df_all_types] * 10, ignore_index=True), npartitions=3
+        )
 
     size_before = df.memory_usage(deep=True).sum()
 
     packed_df = pack_payload(df, group_key=list(df.columns[-2:]))
@@ -66,7 +68,8 @@ def test_pack_payload_pandas_empty(df_all_types):
 @pytest.mark.parametrize("num_group_cols", [1, 4])
 def test_pack_payload_roundtrip(df_all_types, num_group_cols):
     group_key = list(df_all_types.columns[-num_group_cols:])
-    df_all_types = dd.from_pandas(df_all_types, npartitions=2)
+    with dask.config.set({"dataframe.convert-string": False}):
+        df_all_types = dd.from_pandas(df_all_types, npartitions=2)
     pdt.assert_frame_equal(
         df_all_types.compute(),
         unpack_payload(
diff --git a/tests/io/dask/dataframe/test_update.py b/tests/io/dask/dataframe/test_update.py
index 3bb7c827..fe0c190f 100644
--- a/tests/io/dask/dataframe/test_update.py
+++ b/tests/io/dask/dataframe/test_update.py
@@ -24,13 +24,14 @@ def _id(part):
 
 def _update_dataset(partitions, *args, **kwargs):
     # TODO: Simplify once parse_input_to_metapartition is removed / obsolete
-    if isinstance(partitions, pd.DataFrame):
-        partitions = dd.from_pandas(partitions, npartitions=1)
-    elif partitions is not None:
-        delayed_partitions = [dask.delayed(_id)(part) for part in partitions]
-        partitions = dd.from_delayed(delayed_partitions)
-    else:
-        partitions = None
+    with dask.config.set({"dataframe.convert-string": False}):
+        if isinstance(partitions, pd.DataFrame):
+            partitions = dd.from_pandas(partitions, npartitions=1)
+        elif partitions is not None:
+            delayed_partitions = [dask.delayed(_id)(part) for part in partitions]
+            partitions = dd.from_delayed(delayed_partitions)
+        else:
+            partitions = None
 
     # Replace `table_name` with `table` keyword argument to enable shared test code
     # via `bound_update_dataset` fixture
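A minimal sketch of the dask option the patch above pins (illustrative, not
part of the patch; assumes a dask version that knows the
"dataframe.convert-string" setting, roughly 2023.7 onwards):

    import dask
    import dask.dataframe as dd
    import pandas as pd

    pdf = pd.DataFrame({"a": ["x", "y"]})  # object-dtype strings

    # Without the override, newer dask releases may convert object columns
    # to Arrow-backed string[pyarrow], which breaks plateau's schema checks.
    with dask.config.set({"dataframe.convert-string": False}):
        ddf = dd.from_pandas(pdf, npartitions=1)

    print(ddf.dtypes)  # "a" stays object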
From c182ffbf6f2cb712bab5f5922f3921fb7ed7386d Mon Sep 17 00:00:00 2001
From: Izer Onadim
Date: Mon, 18 Sep 2023 13:16:40 +0100
Subject: [PATCH 04/41] Avoid dask 2023.9.2 due to failing tests

---
 environment.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/environment.yml b/environment.yml
index beaef34a..5ad2f3c8 100644
--- a/environment.yml
+++ b/environment.yml
@@ -3,7 +3,7 @@ channels:
   - conda-forge
   - nodefaults
 dependencies:
-  - dask!=2021.5.1,!=2021.6.0 # gh475 - 2021.5.1 and 2021.6.0 broke ci, omit those versions
+  - dask!=2021.5.1,!=2021.6.0, !=2023.9.2 # gh475 - 2021.5.1 and 2021.6.0 broke ci, omit those versions
   - decorator
   - msgpack-python>=0.5.2
   # Currently dask and numpy==1.16.0 clash
From 8b1e0af898a55bcf25fe5c8c494f85db56cac2c8 Mon Sep 17 00:00:00 2001
From: Izer Onadim
Date: Mon, 18 Sep 2023 18:41:29 +0100
Subject: [PATCH 05/41] Cast metadata bytes to object to get around pandas bug

---
 plateau/core/common_metadata.py   | 13 ++++++-------
 plateau/serialization/_parquet.py |  5 ++++-
 plateau/serialization/_util.py    |  8 ++++++++
 3 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/plateau/core/common_metadata.py b/plateau/core/common_metadata.py
index 9dcc3f54..9cc6224d 100644
--- a/plateau/core/common_metadata.py
+++ b/plateau/core/common_metadata.py
@@ -16,6 +16,7 @@
 from plateau.core.naming import SINGLE_TABLE
 from plateau.core.utils import ensure_string_type
 from plateau.serialization._parquet import PARQUET_VERSION
+from plateau.serialization._util import schema_metadata_bytes_to_object
 
 _logger = logging.getLogger()
 
@@ -758,14 +759,12 @@ def empty_dataframe_from_schema(
     DataFrame
         Empty DataFrame with requested columns and types.
     """
+    # HACK: Cast bytes to object in metadata until Pandas bug is fixed: https://github.com/pandas-dev/pandas/issues/50127
+    schema = schema_metadata_bytes_to_object(schema.internal())
 
-    df = (
-        schema.internal()
-        .empty_table()
-        .to_pandas(
-            date_as_object=date_as_object,
-            coerce_temporal_nanoseconds=coerce_temporal_nanoseconds,
-        )
+    df = schema.empty_table().to_pandas(
+        date_as_object=date_as_object,
+        coerce_temporal_nanoseconds=coerce_temporal_nanoseconds,
     )
     df.columns = df.columns.map(ensure_string_type)
 
     if columns is not None:
diff --git a/plateau/serialization/_parquet.py b/plateau/serialization/_parquet.py
index c557a381..553547dd 100644
--- a/plateau/serialization/_parquet.py
+++ b/plateau/serialization/_parquet.py
@@ -23,7 +23,7 @@
     filter_df_from_predicates,
 )
 from ._io_buffer import BlockBuffer
-from ._util import ensure_unicode_string_type
+from ._util import ensure_unicode_string_type, schema_metadata_bytes_to_object
 
 try:
     # Only check for BotoStore instance if boto is really installed
@@ -287,6 +287,9 @@ def _restore_dataframe(
 
         table = _reset_dictionary_columns(table, exclude=categories)
 
+        # HACK: Cast bytes to object in metadata until Pandas bug is fixed: https://github.com/pandas-dev/pandas/issues/50127
+        table = table.cast(schema_metadata_bytes_to_object(table.schema))
+
         df = table.to_pandas(
             date_as_object=date_as_object, coerce_temporal_nanoseconds=True
         )
diff --git a/plateau/serialization/_util.py b/plateau/serialization/_util.py
index f556c981..223eca6d 100644
--- a/plateau/serialization/_util.py
+++ b/plateau/serialization/_util.py
@@ -1,3 +1,6 @@
+from pyarrow import Schema
+
+
 def _check_contains_null(val):
     if isinstance(val, bytes):
         for byte in val:
@@ -16,3 +19,8 @@ def ensure_unicode_string_type(obj):
         return obj.decode("utf8")
     else:
         return str(obj)
+
+
+def schema_metadata_bytes_to_object(schema: Schema) -> Schema:
+    meta = schema.metadata[b"pandas"].decode().replace("bytes", "object").encode()
+    return schema.with_metadata({b"pandas": meta})

From d35b3e044b7dbbd14da0f78d513965b524195959 Mon Sep 17 00:00:00 2001
From: Izer Onadim
Date: Tue, 19 Sep 2023 09:44:30 +0100
Subject: [PATCH 06/41] Generate arrow-compat reference data for 13.0.0

---
 reference-data/arrow-compat/13.0.0.parquet | Bin 0 -> 18661 bytes
 tests/serialization/test_arrow_compat.py   |   1 +
 2 files changed, 1 insertion(+)
 create mode 100644 reference-data/arrow-compat/13.0.0.parquet

diff --git a/reference-data/arrow-compat/13.0.0.parquet b/reference-data/arrow-compat/13.0.0.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..2c97acf946bb59bb072778a371a1ee64131ff9ce
GIT binary patch
literal 18661
[... 18661 bytes of base85-encoded binary data omitted ...]

literal 0
HcmV?d00001

diff --git a/tests/serialization/test_arrow_compat.py b/tests/serialization/test_arrow_compat.py
index f5a99467..10e0dedc 100644
--- a/tests/serialization/test_arrow_compat.py
+++ b/tests/serialization/test_arrow_compat.py
@@ -27,6 +27,7 @@
     "9.0.0",
     "10.0.1",
     "11.0.0",
+    "13.0.0",
 ]
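A sketch of what the new helper in patch 05 rewrites (illustrative, assuming
pyarrow is installed; note the workaround is a plain string replacement, so it
relies on "bytes" appearing in the pandas metadata JSON only as a dtype name):

    import pyarrow as pa

    schema = pa.schema([pa.field("col", pa.binary())]).with_metadata(
        {b"pandas": b'{"columns": [{"name": "col", "pandas_type": "bytes"}]}'}
    )
    meta = schema.metadata[b"pandas"].decode().replace("bytes", "object").encode()
    patched = schema.with_metadata({b"pandas": meta})
    print(patched.metadata[b"pandas"])  # pandas_type is now "object"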
From 5715758e4dc0b51d604c96963eacf8dfa96a788c Mon Sep 17 00:00:00 2001
From: Izer Onadim
Date: Tue, 19 Sep 2023 10:57:47 +0100
Subject: [PATCH 07/41] Change package version in docs to match environment.yml

---
 docs/environment-docs.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/environment-docs.yml b/docs/environment-docs.yml
index c130e24e..6cdcc4d3 100644
--- a/docs/environment-docs.yml
+++ b/docs/environment-docs.yml
@@ -3,12 +3,12 @@ channels:
   - conda-forge
 dependencies:
   - python>=3.8
-  - dask[dataframe]
+  - dask[dataframe]!=2023.9.2
   - decorator
   - msgpack-python>=0.5.2
   # Currently dask and numpy==1.16.0 clash
   - numpy!=1.15.0,!=1.16.0
-  - pandas>=0.23.0, !=1.0.0
+  - pandas>=0.23.0, !=1.0.0,!=2.1.0
   - pyarrow>=0.17.1,!=1.0.0
   - simplejson
   - minimalkv
From 1dbe22951d8ee0eb2cc95d04882f6c04478d537a Mon Sep 17 00:00:00 2001
From: Izer Onadim
Date: Tue, 19 Sep 2023 17:08:15 +0100
Subject: [PATCH 08/41] Only use coerce timestamps arg on pyarrow>=13

---
 plateau/core/common_metadata.py   | 17 +++++++++++----
 plateau/core/index.py             | 28 ++++++++++++++++++++-----
 plateau/serialization/_csv.py     | 10 ++++++++-
 plateau/serialization/_parquet.py | 35 ++++++++++++++++++++++---------
 4 files changed, 70 insertions(+), 20 deletions(-)

diff --git a/plateau/core/common_metadata.py b/plateau/core/common_metadata.py
index 9cc6224d..c5433f6c 100644
--- a/plateau/core/common_metadata.py
+++ b/plateau/core/common_metadata.py
@@ -10,6 +10,7 @@
 import pyarrow.parquet as pq
 import simplejson
 from minimalkv import KeyValueStore
+from packaging import version
 
 from plateau.core import naming
 from plateau.core._compat import load_json
@@ -29,6 +30,8 @@
     "normalize_column_order",
 )
 
+PYARROW_LT_13 = version.parse(pa.__version__) < version.parse("13")
+
 
 class SchemaWrapper:
     """Wrapper object for pyarrow.Schema to handle forwards and backwards
@@ -753,6 +756,7 @@ def empty_dataframe_from_schema(
         Cast dates to objects.
     coerce_temporal_nanoseconds: bool
         Coerce date32, date64, duration and timestamp units to nanoseconds to retain behaviour of pandas 1.x.
+        Only applicable to pandas version >= 2.0 and PyArrow version >= 13.0.0.
 
     Returns
     -------
@@ -762,10 +766,15 @@ def empty_dataframe_from_schema(
     # HACK: Cast bytes to object in metadata until Pandas bug is fixed: https://github.com/pandas-dev/pandas/issues/50127
     schema = schema_metadata_bytes_to_object(schema.internal())
 
-    df = schema.empty_table().to_pandas(
-        date_as_object=date_as_object,
-        coerce_temporal_nanoseconds=coerce_temporal_nanoseconds,
-    )
+    if PYARROW_LT_13:
+        # Prior to pyarrow 13.0.0 coerce_temporal_nanoseconds didn't exist
+        # as it was introduced for backwards compatibility with pandas 1.x
+        df = schema.empty_table().to_pandas(date_as_object=date_as_object)
+    else:
+        df = schema.empty_table().to_pandas(
+            date_as_object=date_as_object,
+            coerce_temporal_nanoseconds=coerce_temporal_nanoseconds,
+        )
     df.columns = df.columns.map(ensure_string_type)
 
     if columns is not None:
diff --git a/plateau/core/index.py b/plateau/core/index.py
index 3cfaa759..d6a79aeb 100644
--- a/plateau/core/index.py
+++ b/plateau/core/index.py
@@ -6,6 +6,7 @@
 import pandas as pd
 import pyarrow as pa
 import pyarrow.parquet as pq
+from packaging import version
 from toolz.itertoolz import partition_all
 
 import plateau.core._time
@@ -37,6 +38,8 @@
     "PartitionIndex",
 )
 
+PYARROW_LT_13 = version.parse(pa.__version__) < version.parse("13")
+
 
 class IndexBase(CopyMixin):
     """Initialize an IndexBase.
@@ -142,8 +145,13 @@ def observed_values(
         """Return an array of all observed values."""
         keys = np.array(list(self.index_dct.keys()))
         labeled_array = pa.array(keys, type=self.dtype)
+
+        # Prior to pyarrow 13.0.0 coerce_temporal_nanoseconds didn't exist
+        # as it was introduced for backwards compatibility with pandas 1.x
         return np.array(
-            labeled_array.to_pandas(
+            labeled_array.to_pandas(date_as_object=date_as_object)
+            if PYARROW_LT_13
+            else labeled_array.to_pandas(
                 date_as_object=date_as_object,
                 coerce_temporal_nanoseconds=coerce_temporal_nanoseconds,
             )
         )
 
     @staticmethod
     def normalize_value(dtype: pa.DataType, value: Any) -> Any:
@@ -483,9 +491,14 @@ def as_flat_series(
             table = _index_dct_to_table(
                 self.index_dct, column=self.column, dtype=self.dtype
             )
-            df = table.to_pandas(
-                date_as_object=date_as_object, coerce_temporal_nanoseconds=True
-            )
+            if PYARROW_LT_13:
+                # Prior to pyarrow 13.0.0 coerce_temporal_nanoseconds didn't exist
+                # as it was introduced for backwards compatibility with pandas 1.x
+                df = table.to_pandas(date_as_object=date_as_object)
+            else:
+                df = table.to_pandas(
+                    date_as_object=date_as_object, coerce_temporal_nanoseconds=True
+                )
 
         if predicates is not None:
             # If there is a conjunction without any reference to the index
@@ -871,7 +884,12 @@ def _parquet_bytes_to_dict(column: str, index_buffer: bytes):
     if column_type == pa.timestamp("us"):
         column_type = pa.timestamp("ns")
 
-    df = table.to_pandas(coerce_temporal_nanoseconds=True)
+    if PYARROW_LT_13:
+        # Prior to pyarrow 13.0.0 coerce_temporal_nanoseconds didn't exist
+        # as it was introduced for backwards compatibility with pandas 1.x
+        df = table.to_pandas()
+    else:
+        df = table.to_pandas(coerce_temporal_nanoseconds=True)
 
     index_dct = dict(
         zip(df[column].values, (list(x) for x in df[_PARTITION_COLUMN_NAME].values))
     )
diff --git a/plateau/serialization/_csv.py b/plateau/serialization/_csv.py
index 0cab490e..f8be37ab 100644
--- a/plateau/serialization/_csv.py
+++ b/plateau/serialization/_csv.py
@@ -8,6 +8,7 @@
 import pandas as pd
 import pyarrow as pa
 from minimalkv import KeyValueStore
+from packaging import version
 from pandas.errors import EmptyDataError
 
 from ._generic import (
@@ -18,6 +19,8 @@
     filter_df_from_predicates,
 )
 
+PYARROW_LT_13 = version.parse(pa.__version__) < version.parse("13")
+
 
 class CsvSerializer(DataFrameSerializer):
     def __init__(self, compress=True):
@@ -85,7 +88,12 @@ def restore_dataframe(
 
     def store(self, store, key_prefix, df):
         if isinstance(df, pa.Table):
-            df = df.to_pandas(coerce_temporal_nanoseconds=True)
+            if PYARROW_LT_13:
+                # Prior to pyarrow 13.0.0 coerce_temporal_nanoseconds didn't exist
+                # as it was introduced for backwards compatibility with pandas 1.x
+                df = df.to_pandas()
+            else:
+                df = df.to_pandas(coerce_temporal_nanoseconds=True)
         key = f"{key_prefix}.csv"
         result_stream = BytesIO()
         iostream: BufferedIOBase
diff --git a/plateau/serialization/_parquet.py b/plateau/serialization/_parquet.py
index 553547dd..e70b9cee 100644
--- a/plateau/serialization/_parquet.py
+++ b/plateau/serialization/_parquet.py
@@ -41,6 +41,7 @@
 BACKOFF_TIME = 0.01  # 10 ms
 PYARROW_LT_6 = version.parse(pa.__version__) < version.parse("6")
 PYARROW_LT_8 = version.parse(pa.__version__) < version.parse("8")
+PYARROW_LT_13 = version.parse(pa.__version__) < version.parse("13")
 
 # Since pyarrow 6, the Parquet version/features can be selected more granular.
 # Version 2.0 is equal to 2.4 but 2.4 doesn't trigger deprecation warnings.
@@ -256,14 +257,23 @@ def _restore_dataframe(
         # ARROW-5139 Column projection with empty columns returns a table w/out index
         if columns == []:
             # Create an arrow table with expected index length.
-            df = (
-                parquet_file.schema.to_arrow_schema()
-                .empty_table()
-                .to_pandas(
-                    date_as_object=date_as_object,
-                    coerce_temporal_nanoseconds=True,
+            if PYARROW_LT_13:
+                # Prior to pyarrow 13.0.0 coerce_temporal_nanoseconds didn't exist
+                # as it was introduced for backwards compatibility with pandas 1.x
+                df = (
+                    parquet_file.schema.to_arrow_schema()
+                    .empty_table()
+                    .to_pandas(date_as_object=date_as_object)
+                )
+            else:
+                df = (
+                    parquet_file.schema.to_arrow_schema()
+                    .empty_table()
+                    .to_pandas(
+                        date_as_object=date_as_object,
+                        coerce_temporal_nanoseconds=True,
+                    )
                 )
-            )
             index = pd.Index(
                 pd.RangeIndex(start=0, stop=parquet_file.metadata.num_rows),
                 dtype="int64",
@@ -290,9 +300,14 @@ def _restore_dataframe(
         # HACK: Cast bytes to object in metadata until Pandas bug is fixed: https://github.com/pandas-dev/pandas/issues/50127
         table = table.cast(schema_metadata_bytes_to_object(table.schema))
 
-        df = table.to_pandas(
-            date_as_object=date_as_object, coerce_temporal_nanoseconds=True
-        )
+        if PYARROW_LT_13:
+            # Prior to pyarrow 13.0.0 coerce_temporal_nanoseconds didn't exist
+            # as it was introduced for backwards compatibility with pandas 1.x
+            df = table.to_pandas(date_as_object=date_as_object)
+        else:
+            df = table.to_pandas(
+                date_as_object=date_as_object, coerce_temporal_nanoseconds=True
+            )
 
         # XXX: Patch until Pyarrow bug is resolved: https://issues.apache.org/jira/browse/ARROW-18099?filter=-2
         if categories:
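The same version gate is now repeated in four modules. One possible
consolidation, shown here only as an editorial sketch and not as part of this
series, is a shared helper that drops the keyword on older pyarrow:

    import pyarrow as pa
    from packaging import version

    PYARROW_LT_13 = version.parse(pa.__version__) < version.parse("13")

    def to_pandas_compat(data, **kwargs):
        # coerce_temporal_nanoseconds only exists from pyarrow 13 onwards,
        # where it restores the pandas 1.x nanosecond behaviour.
        if PYARROW_LT_13:
            kwargs.pop("coerce_temporal_nanoseconds", None)
        return data.to_pandas(**kwargs)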
From 062f28375b79d1a5f623bc267b09438bebd2221b Mon Sep 17 00:00:00 2001
From: Izer Onadim
Date: Wed, 20 Sep 2023 09:43:25 +0100
Subject: [PATCH 09/41] Remove pyarrow<8 tests from ci.yml

---
 .github/workflows/ci.yml | 30 +++++++++++++-----------------
 1 file changed, 13 insertions(+), 17 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 52a302f4..124630d4 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -25,25 +25,13 @@ jobs:
       matrix:
         numfocus_nightly: [false]
         os: ["ubuntu-latest"]
-        pyarrow: ["3.0.0", "4.0.1", "nightly"]
+        pyarrow: ["nightly"]
         python: ["3.8"]
         include:
           - numfocus_nightly: true
             os: "ubuntu-latest"
-            pyarrow: "4.0.1"
-            python: "3.10"
-          - numfocus_nightly: false
-            os: "ubuntu-latest"
-            pyarrow: "5.0.0"
-            python: "3.9"
-          - numfocus_nightly: false
-            os: "ubuntu-latest"
-            pyarrow: "6.0.1"
-            python: "3.9"
-          - numfocus_nightly: false
-            os: "ubuntu-latest"
-            pyarrow: "7.0.0"
-            python: "3.10"
+            pyarrow: "13.0.0"
+            python: "3.11"
           - numfocus_nightly: false
             os: "ubuntu-latest"
             pyarrow: "8.0.1"
@@ -60,10 +48,18 @@ jobs:
             os: "ubuntu-latest"
             pyarrow: "11.0.0"
             python: "3.11"
+          - numfocus_nightly: false
+            os: "ubuntu-latest"
+            pyarrow: "12.0.0"
+            python: "3.11"
+          - numfocus_nightly: false
+            os: "ubuntu-latest"
+            pyarrow: "13.0.0"
+            python: "3.11"
           - numfocus_nightly: false
             os: "macos-latest"
-            pyarrow: "4.0.1"
-            python: "3.8"
+            pyarrow: "13.0.0"
+            python: "3.11"
 
     continue-on-error: ${{ matrix.numfocus_nightly || matrix.pyarrow == 'nightly' }}
     runs-on: ${{ matrix.os }}
From 3c56634e8ac13d626716a4f0aea1e7870c2b3fa7 Mon Sep 17 00:00:00 2001
From: Izer Onadim
Date: Wed, 20 Sep 2023 10:02:55 +0100
Subject: [PATCH 10/41] Generate arrow-compat reference data for 12.0.0

---
 reference-data/arrow-compat/12.0.0.parquet | Bin 0 -> 18562 bytes
 tests/serialization/test_arrow_compat.py   |   1 +
 2 files changed, 1 insertion(+)
 create mode 100644 reference-data/arrow-compat/12.0.0.parquet

diff --git a/reference-data/arrow-compat/12.0.0.parquet b/reference-data/arrow-compat/12.0.0.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..33c0b3b4de07c72b1e9e76e975b435dac1e59256
GIT binary patch
literal 18562
[... 18562 bytes of base85-encoded binary data omitted ...]

literal 0
HcmV?d00001

diff --git a/tests/serialization/test_arrow_compat.py b/tests/serialization/test_arrow_compat.py
index 10e0dedc..087d64ca 100644
--- a/tests/serialization/test_arrow_compat.py
+++ b/tests/serialization/test_arrow_compat.py
@@ -27,6 +27,7 @@
     "9.0.0",
     "10.0.1",
     "11.0.0",
+    "12.0.0",
     "13.0.0",
 ]
From f0f06622ef79cf696209abf14d474fff7cadc868 Mon Sep 17 00:00:00 2001
From: Izer Onadim
Date: Wed, 20 Sep 2023 10:09:46 +0100
Subject: [PATCH 11/41] Avoid pandas 2.1.0 in numfocus nightly ci test

---
 .github/workflows/ci.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 124630d4..657ca4b8 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -30,6 +30,7 @@ jobs:
         include:
           - numfocus_nightly: true
             os: "ubuntu-latest"
+            pandas: "2.0.3" # Avoid due to bug in Pandas 2.1.0 (#55014)
             pyarrow: "13.0.0"
             python: "3.11"
           - numfocus_nightly: false

From 3c56634e8ac13d626716a4f0aea1e7870c2b3fa7 Mon Sep 17 00:00:00 2001
From: Izer Onadim
Date: Wed, 20 Sep 2023 10:55:31 +0100
Subject: [PATCH 12/41] Avoid pandas 2.1.0.* in numfocus_nightly pip install

---
 .github/workflows/ci.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 657ca4b8..c5b4f7ae 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -30,7 +30,6 @@ jobs:
         include:
           - numfocus_nightly: true
             os: "ubuntu-latest"
-            pandas: "2.0.3" # Avoid due to bug in Pandas 2.1.0 (#55014)
             pyarrow: "13.0.0"
             python: "3.11"
           - numfocus_nightly: false
@@ -96,10 +95,11 @@ jobs:
       # nightlies and the latest release would otherwise work together.
       run: micromamba update -c arrow-nightlies -c conda-forge arrow-cpp pyarrow
       if: matrix.pyarrow == 'nightly'
-    - name: Pip Instal NumFOCUS nightly
+    - name: Pip Install NumFOCUS nightly
       # NumFOCUS nightly wheels, contains numpy and pandas
       # TODO(gh-45): Re-add numpy
-      run: python -m pip install --pre --upgrade --timeout=60 --extra-index-url https://pypi.anaconda.org/scipy-wheels-nightly/simple pandas
+      # TODO: Remove pandas version stipulation once https://github.com/pandas-dev/pandas/issues/55014 is fixed
+      run: python -m pip install --pre --upgrade --timeout=60 --extra-index-url https://pypi.anaconda.org/scipy-wheels-nightly/simple "pandas!=2.1.0.*"
       if: matrix.numfocus_nightly
     - name: Test import
       run: |

From 60079ee98dc63817d2e2dedd3e1590197be95491 Mon Sep 17 00:00:00 2001
From: Izer Onadim
Date: Wed, 20 Sep 2023 14:57:10 +0100
Subject: [PATCH 13/41] Add changelog entry and update setup.cfg

---
 CHANGES.rst | 6 ++++++
 setup.cfg   | 4 ++--
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/CHANGES.rst b/CHANGES.rst
index 82747f66..f781b7d3 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -2,6 +2,12 @@
 Changelog
 =========
 
+Plateau 4.2.0 (unreleased)
+==========================
+
+* Support pandas 2
+* No longer test for pyarrow < 8
+
 Plateau 4.1.5 (2023-03-14)
 ==========================
diff --git a/setup.cfg b/setup.cfg
index 8c54f0e8..d822c11f 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -13,12 +13,12 @@ classifiers =
 [options]
 include_package_data = true
 install_requires =
-    dask[dataframe]!=2021.5.1,!=2021.6.0 # gh475 - 2021.5.1 and 2021.6.0 broke ci, omit those versions
+    dask[dataframe]!=2021.5.1,!=2021.6.0,!=2023.9.2 # gh475 - 2021.5.1 and 2021.6.0 broke ci, omit those versions
     decorator
     msgpack>=0.5.2
     # Currently dask and numpy==1.16.0 clash
     numpy!=1.15.0,!=1.16.0
-    pandas>=0.23.0, !=1.0.0
+    pandas>=0.23.0,!=1.0.0,!=2.1.0
     pyarrow>=0.17.1,!=1.0.0
     simplejson
     minimalkv>=1.4.2
From 2ccb2feedab16aac2cbb3f681760f56a87e3ece7 Mon Sep 17 00:00:00 2001
From: Izer Onadim
Date: Wed, 20 Sep 2023 15:25:32 +0100
Subject: [PATCH 14/41] Shrink PR

---
 .github/workflows/ci.yml                   |  12 ++----------
 reference-data/arrow-compat/12.0.0.parquet | Bin 18562 -> 0 bytes
 tests/serialization/test_arrow_compat.py   |   1 -
 3 files changed, 2 insertions(+), 11 deletions(-)
 delete mode 100644 reference-data/arrow-compat/12.0.0.parquet

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index c5b4f7ae..351cbb35 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -30,7 +30,7 @@ jobs:
         include:
           - numfocus_nightly: true
             os: "ubuntu-latest"
-            pyarrow: "13.0.0"
+            pyarrow: "11.0.0"
             python: "3.11"
           - numfocus_nightly: false
             os: "ubuntu-latest"
@@ -48,17 +48,9 @@ jobs:
             os: "ubuntu-latest"
             pyarrow: "11.0.0"
             python: "3.11"
-          - numfocus_nightly: false
-            os: "ubuntu-latest"
-            pyarrow: "12.0.0"
-            python: "3.11"
-          - numfocus_nightly: false
-            os: "ubuntu-latest"
-            pyarrow: "13.0.0"
-            python: "3.11"
           - numfocus_nightly: false
             os: "macos-latest"
-            pyarrow: "13.0.0"
+            pyarrow: "11.0.0"
             python: "3.11"
 
     continue-on-error: ${{ matrix.numfocus_nightly || matrix.pyarrow == 'nightly' }}
diff --git a/reference-data/arrow-compat/12.0.0.parquet b/reference-data/arrow-compat/12.0.0.parquet
deleted file mode 100644
index 33c0b3b4de07c72b1e9e76e975b435dac1e59256..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 18562
[... 18562 bytes of base85-encoded binary data omitted ...]

diff --git a/tests/serialization/test_arrow_compat.py b/tests/serialization/test_arrow_compat.py
index 087d64ca..10e0dedc 100644
--- a/tests/serialization/test_arrow_compat.py
+++ b/tests/serialization/test_arrow_compat.py
@@ -27,7 +27,6 @@
     "9.0.0",
     "10.0.1",
     "11.0.0",
-    "12.0.0",
     "13.0.0",
 ]
From d52f76cfea307b7edb67eda1b561b605d3a34649 Mon Sep 17 00:00:00 2001
From: Izer Onadim
Date: Wed, 20 Sep 2023 16:32:03 +0100
Subject: [PATCH 15/41] Add dask tests for lines marked as uncovered by codecov

---
 tests/io/dask/dataframe/test_read.py | 40 ++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/tests/io/dask/dataframe/test_read.py b/tests/io/dask/dataframe/test_read.py
index 348e091f..e6480d50 100644
--- a/tests/io/dask/dataframe/test_read.py
+++ b/tests/io/dask/dataframe/test_read.py
@@ -277,3 +277,43 @@ def restore_dataframe(cls, store, key, filter_query, columns, *args, **kwargs):
     )["colA"]
     assert_dask_eq(ddf_auto, ddf_manual)
     assert fake_called
+
+
+def test_dask_index_on_non_string_raises(store_factory):
+    dataset_uuid = "dataset_uuid"
+    colA = 1
+    df1 = pd.DataFrame({colA: [1, 2]})
+    store_dataframes_as_dataset(
+        store=store_factory, dataset_uuid=dataset_uuid, dfs=[df1]
+    )
+    with pytest.raises(
+        TypeError,
+        match=f"The paramter `dask_index_on` must be a string but got {type(colA)}",
+    ):
+        read_dataset_as_ddf(
+            dataset_uuid=dataset_uuid,
+            store=store_factory,
+            table="table",
+            dask_index_on=colA,
+        )
+
+
+def test_dask_dispatch_by_raises_if_index_on_not_none(store_factory):
+    dataset_uuid = "dataset_uuid"
+    colA = "ColumnA"
+    df1 = pd.DataFrame({colA: [1, 2]})
+    store_dataframes_as_dataset(
+        store=store_factory, dataset_uuid=dataset_uuid, dfs=[df1]
+    )
+    with pytest.raises(
+        ValueError,
+        match="`read_dataset_as_ddf` got parameters `dask_index_on` and `dispatch_by`. "
+        "Note that `dispatch_by` can only be used if `dask_index_on` is None.",
+    ):
+        read_dataset_as_ddf(
+            dataset_uuid=dataset_uuid,
+            store=store_factory,
+            table="table",
+            dask_index_on=colA,
+            dispatch_by=[colA],
+        )

From fae951d1db1c36f0c64eafda57805fea48c8f1a7 Mon Sep 17 00:00:00 2001
From: Izer Onadim
Date: Thu, 21 Sep 2023 12:45:37 +0100
Subject: [PATCH 16/41] Check conda env before verbose import

---
 .github/workflows/ci.yml | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 351cbb35..8a7c3203 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -94,12 +94,15 @@ jobs:
      # TODO: Remove pandas version stipulation once https://github.com/pandas-dev/pandas/issues/55014 is fixed
       run: python -m pip install --pre --upgrade --timeout=60 --extra-index-url https://pypi.anaconda.org/scipy-wheels-nightly/simple "pandas!=2.1.0.*"
       if: matrix.numfocus_nightly
+    # TODO: Remove check conda env stage and -vvv from import
+    - name: Check conda env
+      run: mamba list
     - name: Test import
       run: |
         python -c "import plateau"
         python -c "import plateau.api"
         python -c "import plateau.api.dataset"
-        python -c "import plateau.api.serialization"
+        python -vvv -c "import plateau.api.serialization"
         python -c "import plateau.core"
         python -c "import plateau.io"
         python -c "import plateau.io_components"
From 91ecebfd05f7148a91faee1a816ef24c55c0dc61 Mon Sep 17 00:00:00 2001
From: Izer Onadim
Date: Thu, 21 Sep 2023 12:51:43 +0100
Subject: [PATCH 17/41] Use micromamba instead of mamba

---
 .github/workflows/ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 8a7c3203..401d63bb 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -96,7 +96,7 @@ jobs:
       if: matrix.numfocus_nightly
     # TODO: Remove check conda env stage and -vvv from import
     - name: Check conda env
-      run: mamba list
+      run: micromamba list
     - name: Test import
       run: |

From 62481925aacf614ebfd3a25530b3b082fcbb4524 Mon Sep 17 00:00:00 2001
From: Izer Onadim
Date: Thu, 21 Sep 2023 13:03:37 +0100
Subject: [PATCH 18/41] Pin pandas<2.1.0 due to bug in 2.1.0 and 2.1.1

---
 .github/workflows/ci.yml  | 2 +-
 docs/environment-docs.yml | 2 +-
 environment.yml           | 2 +-
 setup.cfg                 | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 401d63bb..bdf5a8f3 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -92,7 +92,7 @@ jobs:
       # NumFOCUS nightly wheels, contains numpy and pandas
       # TODO(gh-45): Re-add numpy
       # TODO: Remove pandas version stipulation once https://github.com/pandas-dev/pandas/issues/55014 is fixed
-      run: python -m pip install --pre --upgrade --timeout=60 --extra-index-url https://pypi.anaconda.org/scipy-wheels-nightly/simple "pandas!=2.1.0.*"
+      run: python -m pip install --pre --upgrade --timeout=60 --extra-index-url https://pypi.anaconda.org/scipy-wheels-nightly/simple "pandas<2.1.0"
       if: matrix.numfocus_nightly
diff --git a/docs/environment-docs.yml b/docs/environment-docs.yml
index 6cdcc4d3..5c1f8528 100644
--- a/docs/environment-docs.yml
+++ b/docs/environment-docs.yml
@@ -8,7 +8,7 @@ dependencies:
   - msgpack-python>=0.5.2
   # Currently dask and numpy==1.16.0 clash
   - numpy!=1.15.0,!=1.16.0
-  - pandas>=0.23.0, !=1.0.0,!=2.1.0
+  - pandas>=0.23.0, !=1.0.0, <2.1.0
   - pyarrow>=0.17.1,!=1.0.0
   - simplejson
   - minimalkv
diff --git a/environment.yml b/environment.yml
index 5ad2f3c8..6724228f 100644
--- a/environment.yml
+++ b/environment.yml
@@ -9,7 +9,7 @@ dependencies:
   # Currently dask and numpy==1.16.0 clash
   # TODO: add support for numpy>=1.23
   - numpy!=1.15.0,!=1.16.0
-  - pandas>=0.23.0,!=1.0.0,!=2.1.0
+  - pandas>=0.23.0,!=1.0.0,<2.1.0
   - pyarrow>=0.17.1,!=1.0.0
   - simplejson
   - minimalkv>=1.4.2
diff --git a/setup.cfg b/setup.cfg
index d822c11f..3bedce64 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -18,7 +18,7 @@ install_requires =
     msgpack>=0.5.2
     # Currently dask and numpy==1.16.0 clash
     numpy!=1.15.0,!=1.16.0
-    pandas>=0.23.0,!=1.0.0,!=2.1.0
+    pandas>=0.23.0,!=1.0.0,<2.1.0
     pyarrow>=0.17.1,!=1.0.0
     simplejson
     minimalkv>=1.4.2
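A quick check of the pin chosen above (illustrative; it uses the packaging
library that the version-gating code already depends on):

    from packaging.specifiers import SpecifierSet

    pins = SpecifierSet(">=0.23.0,!=1.0.0,<2.1.0")
    # "<2.1.0" excludes both broken releases, unlike "!=2.1.0",
    # which would still admit 2.1.1.
    print("2.1.0" in pins)  # False
    print("2.1.1" in pins)  # False
    print("2.0.3" in pins)  # True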
From 807650c684bb4ba250d7fee8001370b941f3195a Mon Sep 17 00:00:00 2001
From: Izer Onadim
Date: Mon, 25 Sep 2023 13:40:57 +0100
Subject: [PATCH 19/41] Check if adding pyarrow 13 tests improves coverage

---
 .github/workflows/ci.yml                   |  16 ++++++++++++++++
 reference-data/arrow-compat/12.0.0.parquet | Bin 0 -> 18562 bytes
 tests/serialization/test_arrow_compat.py   |   1 +
 3 files changed, 17 insertions(+)
 create mode 100644 reference-data/arrow-compat/12.0.0.parquet

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index bdf5a8f3..ed2f8874 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -32,6 +32,10 @@ jobs:
             os: "ubuntu-latest"
             pyarrow: "11.0.0"
             python: "3.11"
+          - numfocus_nightly: true
+            os: "ubuntu-latest"
+            pyarrow: "13.0.0"
+            python: "3.11"
           - numfocus_nightly: false
             os: "ubuntu-latest"
             pyarrow: "8.0.1"
@@ -48,10 +52,22 @@ jobs:
             os: "ubuntu-latest"
             pyarrow: "11.0.0"
             python: "3.11"
+          - numfocus_nightly: false
+            os: "ubuntu-latest"
+            pyarrow: "12.0.0"
+            python: "3.11"
+          - numfocus_nightly: false
+            os: "ubuntu-latest"
+            pyarrow: "13.0.0"
+            python: "3.11"
           - numfocus_nightly: false
             os: "macos-latest"
             pyarrow: "11.0.0"
             python: "3.11"
+          - numfocus_nightly: false
+            os: "macos-latest"
+            pyarrow: "13.0.0"
+            python: "3.11"
 
     continue-on-error: ${{ matrix.numfocus_nightly || matrix.pyarrow == 'nightly' }}
     runs-on: ${{ matrix.os }}
diff --git a/reference-data/arrow-compat/12.0.0.parquet b/reference-data/arrow-compat/12.0.0.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..33c0b3b4de07c72b1e9e76e975b435dac1e59256
GIT binary patch
literal 18562
[... 18562 bytes of base85-encoded binary data omitted ...]

literal 0
HcmV?d00001

diff --git a/tests/serialization/test_arrow_compat.py b/tests/serialization/test_arrow_compat.py
index 10e0dedc..087d64ca 100644
--- a/tests/serialization/test_arrow_compat.py
+++ b/tests/serialization/test_arrow_compat.py
@@ -27,6 +27,7 @@
     "9.0.0",
     "10.0.1",
     "11.0.0",
+    "12.0.0",
     "13.0.0",
 ]

From d45d1609d48399b47eb58ebeaee6439e46e95f19 Mon Sep 17 00:00:00 2001
From: Izer Onadim
Date: Mon, 25 Sep 2023 13:51:13 +0100
Subject: [PATCH 20/41] Fix yaml error

---
 .github/workflows/ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index ed2f8874..e330da0b 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -64,7 +64,7 @@ jobs:
             os: "macos-latest"
             pyarrow: "11.0.0"
             python: "3.11"
-           - numfocus_nightly: false
+          - numfocus_nightly: false
             os: "macos-latest"
             pyarrow: "13.0.0"
             python: "3.11"
@@ jobs: os: "ubuntu-latest" pyarrow: "13.0.0" python: "3.11" + - numfocus_nightly: false + os: "macos-latest" + pyarrow: "4.0.1" + python: "3.8" - numfocus_nightly: false os: "macos-latest" pyarrow: "11.0.0" @@ -96,7 +116,7 @@ jobs: - name: Install repository run: python -m pip install --no-build-isolation --no-deps --disable-pip-version-check -e . - name: Install Pyarrow (non-nightly) - run: micromamba install pyarrow==${{ matrix.pyarrow }} + run: micromamba install -y --no-py-pin pyarrow==${{ matrix.pyarrow }} "pandas<2.1.0" if: matrix.pyarrow != 'nightly' - name: Install Pyarrow (nightly) # Install both arrow-cpp and pyarrow to make sure that we have the @@ -110,15 +130,12 @@ jobs: # TODO: Remove pandas version stipulation once https://github.com/pandas-dev/pandas/issues/55014 is fixed run: python -m pip install --pre --upgrade --timeout=60 --extra-index-url https://pypi.anaconda.org/scipy-wheels-nightly/simple "pandas<2.1.0" if: matrix.numfocus_nightly - # TODO: Remove check conda env stage and -vvv from import - - name: Check conda env - run: micromamba list - name: Test import run: | python -c "import plateau" python -c "import plateau.api" python -c "import plateau.api.dataset" - python -vvv -c "import plateau.api.serialization" + python -c "import plateau.api.serialization" python -c "import plateau.core" python -c "import plateau.io" python -c "import plateau.io_components" From 0414b7854e66bc3fe6c75b89934df4b9bd6069e7 Mon Sep 17 00:00:00 2001 From: Izer Onadim Date: Mon, 25 Sep 2023 18:19:45 +0100 Subject: [PATCH 22/41] Experiment with tests for backwards compatibility --- .github/workflows/ci.yml | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a8bc60a8..75708758 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -24,67 +24,93 @@ jobs: fail-fast: false matrix: numfocus_nightly: [false] + backwards_compat: [false] os: ["ubuntu-latest"] pyarrow: ["nightly"] python: ["3.8"] include: - numfocus_nightly: true + backwards_compat: false os: "ubuntu-latest" pyarrow: "4.0.1" python: "3.10" - numfocus_nightly: true + backwards_compat: false os: "ubuntu-latest" pyarrow: "11.0.0" python: "3.11" - numfocus_nightly: true + backwards_compat: false os: "ubuntu-latest" pyarrow: "13.0.0" python: "3.11" - numfocus_nightly: false + backwards_compat: true + os: "ubuntu-latest" + pyarrow: "11.0.0" + python: "3.11" + - numfocus_nightly: false + backwards_compat: true + os: "ubuntu-latest" + pyarrow: "13.0.0" + python: "3.11" + - numfocus_nightly: false + backwards_compat: false os: "ubuntu-latest" pyarrow: "5.0.0" python: "3.9" - numfocus_nightly: false + backwards_compat: false os: "ubuntu-latest" pyarrow: "6.0.1" python: "3.9" - numfocus_nightly: false + backwards_compat: false os: "ubuntu-latest" pyarrow: "7.0.0" python: "3.10" - numfocus_nightly: false + backwards_compat: false os: "ubuntu-latest" pyarrow: "8.0.1" python: "3.10" - numfocus_nightly: false + backwards_compat: false os: "ubuntu-latest" pyarrow: "9.0.0" python: "3.10" - numfocus_nightly: false + backwards_compat: false os: "ubuntu-latest" pyarrow: "10.0.1" python: "3.11" - numfocus_nightly: false + backwards_compat: false os: "ubuntu-latest" pyarrow: "11.0.0" python: "3.11" - numfocus_nightly: false + backwards_compat: false os: "ubuntu-latest" pyarrow: "12.0.0" python: "3.11" - numfocus_nightly: false + backwards_compat: false os: "ubuntu-latest" pyarrow: "13.0.0" python: "3.11" - numfocus_nightly: 
false + backwards_compat: false os: "macos-latest" pyarrow: "4.0.1" python: "3.8" - numfocus_nightly: false + backwards_compat: false os: "macos-latest" pyarrow: "11.0.0" python: "3.11" - numfocus_nightly: false + backwards_compat: false os: "macos-latest" pyarrow: "13.0.0" python: "3.11" @@ -117,7 +143,10 @@ jobs: run: python -m pip install --no-build-isolation --no-deps --disable-pip-version-check -e . - name: Install Pyarrow (non-nightly) run: micromamba install -y --no-py-pin pyarrow==${{ matrix.pyarrow }} "pandas<2.1.0" - if: matrix.pyarrow != 'nightly' + if: matrix.pyarrow != 'nightly' && !matrix.backwards_compat + - name: Downgrade pandas<2 to test backwards compatibility + run: micromamba install -y "pandas<2" + if: matrix.backwards_compat - name: Install Pyarrow (nightly) # Install both arrow-cpp and pyarrow to make sure that we have the # latest nightly of both packages. It is sadly not guaranteed that the From 1c2352ddc11e6fceb7a16db75d9c3864d28568e6 Mon Sep 17 00:00:00 2001 From: Izer Onadim Date: Tue, 26 Sep 2023 10:59:49 +0100 Subject: [PATCH 23/41] Allow install of specific pandas version --- .github/workflows/ci.yml | 59 ++++++++++++++-------------------------- 1 file changed, 20 insertions(+), 39 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 75708758..30d00d7d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -24,94 +24,74 @@ jobs: fail-fast: false matrix: numfocus_nightly: [false] - backwards_compat: [false] os: ["ubuntu-latest"] + pandas: [""] pyarrow: ["nightly"] python: ["3.8"] include: - numfocus_nightly: true - backwards_compat: false - os: "ubuntu-latest" - pyarrow: "4.0.1" - python: "3.10" - - numfocus_nightly: true - backwards_compat: false - os: "ubuntu-latest" - pyarrow: "11.0.0" - python: "3.11" - - numfocus_nightly: true - backwards_compat: false os: "ubuntu-latest" + pandas: "" pyarrow: "13.0.0" python: "3.11" - numfocus_nightly: false - backwards_compat: true os: "ubuntu-latest" + pandas: "1.5.3" pyarrow: "11.0.0" python: "3.11" - numfocus_nightly: false - backwards_compat: true os: "ubuntu-latest" + pandas: "1.5.3" pyarrow: "13.0.0" python: "3.11" - numfocus_nightly: false - backwards_compat: false os: "ubuntu-latest" + pandas: "" pyarrow: "5.0.0" python: "3.9" - numfocus_nightly: false - backwards_compat: false os: "ubuntu-latest" + pandas: "" pyarrow: "6.0.1" python: "3.9" - numfocus_nightly: false - backwards_compat: false os: "ubuntu-latest" + pandas: "" pyarrow: "7.0.0" python: "3.10" - numfocus_nightly: false - backwards_compat: false os: "ubuntu-latest" + pandas: "" pyarrow: "8.0.1" python: "3.10" - numfocus_nightly: false - backwards_compat: false os: "ubuntu-latest" + pandas: "" pyarrow: "9.0.0" python: "3.10" - numfocus_nightly: false - backwards_compat: false os: "ubuntu-latest" + pandas: "" pyarrow: "10.0.1" python: "3.11" - numfocus_nightly: false - backwards_compat: false os: "ubuntu-latest" + pandas: "" pyarrow: "11.0.0" python: "3.11" - numfocus_nightly: false - backwards_compat: false os: "ubuntu-latest" + pandas: "" pyarrow: "12.0.0" python: "3.11" - numfocus_nightly: false - backwards_compat: false os: "ubuntu-latest" + pandas: "" pyarrow: "13.0.0" python: "3.11" - numfocus_nightly: false - backwards_compat: false - os: "macos-latest" - pyarrow: "4.0.1" - python: "3.8" - - numfocus_nightly: false - backwards_compat: false - os: "macos-latest" - pyarrow: "11.0.0" - python: "3.11" - - numfocus_nightly: false - backwards_compat: false os: "macos-latest" + pandas: "" pyarrow: 
"13.0.0" python: "3.11" continue-on-error: ${{ matrix.numfocus_nightly || matrix.pyarrow == 'nightly' }} @@ -142,21 +122,22 @@ jobs: - name: Install repository run: python -m pip install --no-build-isolation --no-deps --disable-pip-version-check -e . - name: Install Pyarrow (non-nightly) + # Don't pin python as older versions of pyarrow require older versions of python run: micromamba install -y --no-py-pin pyarrow==${{ matrix.pyarrow }} "pandas<2.1.0" - if: matrix.pyarrow != 'nightly' && !matrix.backwards_compat - - name: Downgrade pandas<2 to test backwards compatibility - run: micromamba install -y "pandas<2" - if: matrix.backwards_compat + if: matrix.pyarrow != 'nightly' && matrix.pandas = '' - name: Install Pyarrow (nightly) # Install both arrow-cpp and pyarrow to make sure that we have the # latest nightly of both packages. It is sadly not guaranteed that the # nightlies and the latest release would otherwise work together. run: micromamba update -c arrow-nightlies -c conda-forge arrow-cpp pyarrow if: matrix.pyarrow == 'nightly' + - name: Install Pyarrow (downgrade pandas) + run: micromamba install -y pyarrow==${{ matrix.pyarrow }} pandas==${{ matrix.pandas }} + if: matrix.pyarrow != 'nightly' && matrix.pandas != '' - name: Pip Install NumFOCUS nightly # NumFOCUS nightly wheels, contains numpy and pandas # TODO(gh-45): Re-add numpy - # TODO: Remove pandas version stipulation once https://github.com/pandas-dev/pandas/issues/55014 is fixed + # TODO: Remove pandas version pin once https://github.com/pandas-dev/pandas/issues/55014 is fixed run: python -m pip install --pre --upgrade --timeout=60 --extra-index-url https://pypi.anaconda.org/scipy-wheels-nightly/simple "pandas<2.1.0" if: matrix.numfocus_nightly - name: Test import From 84cdbdb136629e5f360ab9304152e8b3a830b3e3 Mon Sep 17 00:00:00 2001 From: Izer Onadim Date: Tue, 26 Sep 2023 11:09:47 +0100 Subject: [PATCH 24/41] Fix yaml error --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 30d00d7d..4735cf91 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -124,7 +124,7 @@ jobs: - name: Install Pyarrow (non-nightly) # Don't pin python as older versions of pyarrow require older versions of python run: micromamba install -y --no-py-pin pyarrow==${{ matrix.pyarrow }} "pandas<2.1.0" - if: matrix.pyarrow != 'nightly' && matrix.pandas = '' + if: matrix.pyarrow != 'nightly' && matrix.pandas == '' - name: Install Pyarrow (nightly) # Install both arrow-cpp and pyarrow to make sure that we have the # latest nightly of both packages. 
It is sadly not guaranteed that the From c14edda8c6f9afb6efec2ce59ae304e7039fc37c Mon Sep 17 00:00:00 2001 From: Izer Onadim Date: Tue, 26 Sep 2023 11:32:02 +0100 Subject: [PATCH 25/41] Update changelog and re-add pyarrow 4.0.1 to ci.yml --- .github/workflows/ci.yml | 12 ++++++------ CHANGES.rst | 3 ++- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4735cf91..2358f186 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -26,18 +26,18 @@ jobs: numfocus_nightly: [false] os: ["ubuntu-latest"] pandas: [""] - pyarrow: ["nightly"] + pyarrow: ["3.0.0", "4.0.1", "nightly"] python: ["3.8"] include: - numfocus_nightly: true os: "ubuntu-latest" pandas: "" - pyarrow: "13.0.0" - python: "3.11" + pyarrow: "4.0.1" + python: "3.10" - numfocus_nightly: false os: "ubuntu-latest" pandas: "1.5.3" - pyarrow: "11.0.0" + pyarrow: "4.0.1" python: "3.11" - numfocus_nightly: false os: "ubuntu-latest" @@ -92,8 +92,8 @@ jobs: - numfocus_nightly: false os: "macos-latest" pandas: "" - pyarrow: "13.0.0" - python: "3.11" + pyarrow: "4.0.1" + python: "3.8" continue-on-error: ${{ matrix.numfocus_nightly || matrix.pyarrow == 'nightly' }} runs-on: ${{ matrix.os }} diff --git a/CHANGES.rst b/CHANGES.rst index f781b7d3..45820f55 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -6,7 +6,8 @@ Plateau 4.2.0 (unreleased) ========================== * Support pandas 2 -* No longer test for pyarrow < 8 +* Test pyarrow 12 and 13 +* Prevent dask from casting all object dtypes to strings Plateau 4.1.5 (2023-03-14) ========================== From 2c80c10b09a158e588d44ed30d5677a2b903ccc1 Mon Sep 17 00:00:00 2001 From: Izer Onadim Date: Tue, 26 Sep 2023 11:54:39 +0100 Subject: [PATCH 26/41] Remove test for pyarrow==3.0.0 as incompatible with pandas 2 --- .github/workflows/ci.yml | 2 +- CHANGES.rst | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2358f186..ab59f764 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -26,7 +26,7 @@ jobs: numfocus_nightly: [false] os: ["ubuntu-latest"] pandas: [""] - pyarrow: ["3.0.0", "4.0.1", "nightly"] + pyarrow: ["4.0.1", "nightly"] python: ["3.8"] include: - numfocus_nightly: true diff --git a/CHANGES.rst b/CHANGES.rst index 45820f55..952a02d7 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -8,6 +8,7 @@ Plateau 4.2.0 (unreleased) * Support pandas 2 * Test pyarrow 12 and 13 * Prevent dask from casting all object dtypes to strings +* Remove tests for pyarrow<=3 as they fail with pandas>=2 Plateau 4.1.5 (2023-03-14) ========================== From 657704a390df6ebb74f4faf94160e3b751df4a76 Mon Sep 17 00:00:00 2001 From: Izer Onadim Date: Tue, 26 Sep 2023 11:55:06 +0100 Subject: [PATCH 27/41] asv no longer supports dev, use run instead --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ab59f764..329cfdc2 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -160,7 +160,7 @@ jobs: - name: Running benchmarks run: | asv --config ./asv_bench/asv.conf.json machine --machine github --os unknown --arch unknown --cpu unknown --ram unknown - asv --config ./asv_bench/asv.conf.json dev | sed "/failed$/ s/^/##[error]/" | tee benchmarks.log + asv --config ./asv_bench/asv.conf.json run | sed "/failed$/ s/^/##[error]/" | tee benchmarks.log if grep "failed" benchmarks.log > /dev/null ; then exit 1 fi From 
1f9720716a3a7d11d53510dbdffc058b62d4dfdb Mon Sep 17 00:00:00 2001 From: Izer Onadim Date: Tue, 26 Sep 2023 12:27:56 +0100 Subject: [PATCH 28/41] Add environment arg to asv run --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 329cfdc2..abe63090 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -160,7 +160,7 @@ jobs: - name: Running benchmarks run: | asv --config ./asv_bench/asv.conf.json machine --machine github --os unknown --arch unknown --cpu unknown --ram unknown - asv --config ./asv_bench/asv.conf.json run | sed "/failed$/ s/^/##[error]/" | tee benchmarks.log + asv --config ./asv_bench/asv.conf.json run -E existing:same | sed "/failed$/ s/^/##[error]/" | tee benchmarks.log if grep "failed" benchmarks.log > /dev/null ; then exit 1 fi From c0218c339c8e18b96a003c76893c5ac86ab75ae8 Mon Sep 17 00:00:00 2001 From: Izer Onadim Date: Tue, 26 Sep 2023 14:16:59 +0100 Subject: [PATCH 29/41] Use astype when setting Series type due to change in pandas behaviour --- plateau/serialization/testing.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/plateau/serialization/testing.py b/plateau/serialization/testing.py index b3cd33f2..ad1bf995 100644 --- a/plateau/serialization/testing.py +++ b/plateau/serialization/testing.py @@ -36,16 +36,16 @@ def get_dataframe_not_nested(n): "bool": pd.Series( [1] * int(np.floor(n / 2)) + [0] * int(np.ceil(n / 2)), dtype=np.bool_ ), - "int8": pd.Series(range(n), dtype=np.int8), - "int16": pd.Series(range(n), dtype=np.int16), - "int32": pd.Series(range(n), dtype=np.int32), - "int64": pd.Series(range(n), dtype=np.int64), - "uint8": pd.Series(range(n), dtype=np.uint8), - "uint16": pd.Series(range(n), dtype=np.uint16), - "uint32": pd.Series(range(n), dtype=np.uint32), - "uint64": pd.Series(range(n), dtype=np.uint64), - "float32": pd.Series([float(x) for x in range(n)], dtype=np.float32), - "float64": pd.Series([float(x) for x in range(n)], dtype=np.float64), + "int8": pd.Series(range(n)).astype(np.int8), + "int16": pd.Series(range(n)).astype(np.int16), + "int32": pd.Series(range(n)).astype(np.int32), + "int64": pd.Series(range(n)).astype(np.int64), + "uint8": pd.Series(range(n)).astype(np.uint8), + "uint16": pd.Series(range(n)).astype(np.uint16), + "uint32": pd.Series(range(n)).astype(np.uint32), + "uint64": pd.Series(range(n)).astype(np.uint64), + "float32": pd.Series([float(x) for x in range(n)]).astype(np.float32), + "float64": pd.Series([float(x) for x in range(n)]).astype(np.float64), "date": pd.Series( [date(2018, 1, x % 31 + 1) for x in range(1, n + 1)], dtype=object ), From efb827fb880a712b4d5e3305757f04767e59d778 Mon Sep 17 00:00:00 2001 From: Izer Onadim Date: Tue, 26 Sep 2023 15:03:19 +0100 Subject: [PATCH 30/41] Pin dask<2023.9.2 --- docs/environment-docs.yml | 4 ++-- environment.yml | 3 ++- setup.cfg | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/docs/environment-docs.yml b/docs/environment-docs.yml index 5c1f8528..21ad1776 100644 --- a/docs/environment-docs.yml +++ b/docs/environment-docs.yml @@ -3,12 +3,12 @@ channels: - conda-forge dependencies: - python>=3.8 - - dask[dataframe]!=2023.9.2 + - dask[dataframe]<2023.9.2 - decorator - msgpack-python>=0.5.2 # Currently dask and numpy==1.16.0 clash - numpy!=1.15.0,!=1.16.0 - - pandas>=0.23.0, !=1.0.0, <2.1.0 + - pandas>=0.23.0,!=1.0.0,<2.1.0 - pyarrow>=0.17.1,!=1.0.0 - simplejson - minimalkv diff --git a/environment.yml
b/environment.yml index 6724228f..5b88d959 100644 --- a/environment.yml +++ b/environment.yml @@ -3,7 +3,8 @@ channels: - conda-forge - nodefaults dependencies: - - dask!=2021.5.1,!=2021.6.0, !=2023.9.2 # gh475 - 2021.5.1 and 2021.6.0 broke ci, omit those versions + # TODO: Investigate issue with dask 2023.9.2 + - dask!=2021.5.1,!=2021.6.0,<2023.9.2 # gh475 - 2021.5.1 and 2021.6.0 broke ci, omit those versions - decorator - msgpack-python>=0.5.2 # Currently dask and numpy==1.16.0 clash diff --git a/setup.cfg b/setup.cfg index 3bedce64..00579a8b 100644 --- a/setup.cfg +++ b/setup.cfg @@ -13,7 +13,7 @@ classifiers = [options] include_package_data = true install_requires = - dask[dataframe]!=2021.5.1,!=2021.6.0,!=2023.9.2 # gh475 - 2021.5.1 and 2021.6.0 broke ci, omit those versions + dask[dataframe]!=2021.5.1,!=2021.6.0,<2023.9.2 # gh475 - 2021.5.1 and 2021.6.0 broke ci, omit those versions decorator msgpack>=0.5.2 # Currently dask and numpy==1.16.0 clash From 61feb7490e5b0d8fe5b020fe25c506220a7bb27e Mon Sep 17 00:00:00 2001 From: Izer Onadim Date: Tue, 26 Sep 2023 15:35:01 +0100 Subject: [PATCH 31/41] Add no-py-pin to pandas downgrade step --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index abe63090..6fdfa8d3 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -132,7 +132,7 @@ jobs: run: micromamba update -c arrow-nightlies -c conda-forge arrow-cpp pyarrow if: matrix.pyarrow == 'nightly' - name: Install Pyarrow (downgrade pandas) - run: micromamba install -y pyarrow==${{ matrix.pyarrow }} pandas==${{ matrix.pandas }} + run: micromamba install -y --no-py-pin pyarrow==${{ matrix.pyarrow }} pandas==${{ matrix.pandas }} if: matrix.pyarrow != 'nightly' && matrix.pandas != '' - name: Pip Install NumFOCUS nightly # NumFOCUS nightly wheels, contains numpy and pandas From ff4793f54e0630311bc90463ee59aba602abac9b Mon Sep 17 00:00:00 2001 From: Izer Onadim Date: Tue, 26 Sep 2023 16:26:39 +0100 Subject: [PATCH 32/41] Return to !=2023.9.2 due to broken CI --- docs/environment-docs.yml | 2 +- environment.yml | 2 +- setup.cfg | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/environment-docs.yml b/docs/environment-docs.yml index 21ad1776..91aa41f6 100644 --- a/docs/environment-docs.yml +++ b/docs/environment-docs.yml @@ -3,7 +3,7 @@ channels: - conda-forge dependencies: - python>=3.8 - - dask[dataframe]<2023.9.2 + - dask[dataframe]!=2023.9.2 - decorator - msgpack-python>=0.5.2 # Currently dask and numpy==1.16.0 clash diff --git a/environment.yml b/environment.yml index 5b88d959..fd9f04dc 100644 --- a/environment.yml +++ b/environment.yml @@ -4,7 +4,7 @@ channels: - nodefaults dependencies: # TODO: Investigate issue with dask 2023.9.2 - - dask!=2021.5.1,!=2021.6.0,<2023.9.2 # gh475 - 2021.5.1 and 2021.6.0 broke ci, omit those versions + - dask!=2021.5.1,!=2021.6.0,!=2023.9.2 # gh475 - 2021.5.1 and 2021.6.0 broke ci, omit those versions - decorator - msgpack-python>=0.5.2 # Currently dask and numpy==1.16.0 clash diff --git a/setup.cfg b/setup.cfg index 00579a8b..3bedce64 100644 --- a/setup.cfg +++ b/setup.cfg @@ -13,7 +13,7 @@ classifiers = [options] include_package_data = true install_requires = - dask[dataframe]!=2021.5.1,!=2021.6.0,<2023.9.2 # gh475 - 2021.5.1 and 2021.6.0 broke ci, omit those versions + dask[dataframe]!=2021.5.1,!=2021.6.0,!=2023.9.2 # gh475 - 2021.5.1 and 2021.6.0 broke ci, omit those versions decorator msgpack>=0.5.2 # Currently dask and 
numpy==1.16.0 clash From f2d4a644b26df5681fdf474236ea286716f02850 Mon Sep 17 00:00:00 2001 From: Izer Onadim Date: Tue, 26 Sep 2023 16:33:44 +0100 Subject: [PATCH 33/41] Switch CI operation order --- .github/workflows/ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 6fdfa8d3..ec436927 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -119,8 +119,6 @@ jobs: cache-env: true extra-specs: | python=${{ matrix.PYTHON_VERSION }} - - name: Install repository - run: python -m pip install --no-build-isolation --no-deps --disable-pip-version-check -e . - name: Install Pyarrow (non-nightly) # Don't pin python as older versions of pyarrow require older versions of python run: micromamba install -y --no-py-pin pyarrow==${{ matrix.pyarrow }} "pandas<2.1.0" @@ -140,6 +138,8 @@ jobs: # TODO: Remove pandas version pin once https://github.com/pandas-dev/pandas/issues/55014 is fixed run: python -m pip install --pre --upgrade --timeout=60 --extra-index-url https://pypi.anaconda.org/scipy-wheels-nightly/simple "pandas<2.1.0" if: matrix.numfocus_nightly + - name: Install repository + run: python -m pip install --no-build-isolation --no-deps --disable-pip-version-check -e . - name: Test import run: | python -c "import plateau" From d088f3adafab3eeb1cf3df53274a88ad6148886d Mon Sep 17 00:00:00 2001 From: Izer Onadim Date: Tue, 26 Sep 2023 16:46:18 +0100 Subject: [PATCH 34/41] Test whether <2023.9.2 breaks CI --- docs/environment-docs.yml | 2 +- environment.yml | 2 +- setup.cfg | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/environment-docs.yml b/docs/environment-docs.yml index 91aa41f6..21ad1776 100644 --- a/docs/environment-docs.yml +++ b/docs/environment-docs.yml @@ -3,7 +3,7 @@ channels: - conda-forge dependencies: - python>=3.8 - - dask[dataframe]!=2023.9.2 + - dask[dataframe]<2023.9.2 - decorator - msgpack-python>=0.5.2 # Currently dask and numpy==1.16.0 clash diff --git a/environment.yml b/environment.yml index fd9f04dc..5b88d959 100644 --- a/environment.yml +++ b/environment.yml @@ -4,7 +4,7 @@ channels: - nodefaults dependencies: # TODO: Investigate issue with dask 2023.9.2 - - dask!=2021.5.1,!=2021.6.0,!=2023.9.2 # gh475 - 2021.5.1 and 2021.6.0 broke ci, omit those versions + - dask!=2021.5.1,!=2021.6.0,<2023.9.2 # gh475 - 2021.5.1 and 2021.6.0 broke ci, omit those versions - decorator - msgpack-python>=0.5.2 # Currently dask and numpy==1.16.0 clash diff --git a/setup.cfg b/setup.cfg index 3bedce64..00579a8b 100644 --- a/setup.cfg +++ b/setup.cfg @@ -13,7 +13,7 @@ classifiers = [options] include_package_data = true install_requires = - dask[dataframe]!=2021.5.1,!=2021.6.0,!=2023.9.2 # gh475 - 2021.5.1 and 2021.6.0 broke ci, omit those versions + dask[dataframe]!=2021.5.1,!=2021.6.0,<2023.9.2 # gh475 - 2021.5.1 and 2021.6.0 broke ci, omit those versions decorator msgpack>=0.5.2 # Currently dask and numpy==1.16.0 clash From db6509212df832e6584534e398570f47ff03ab82 Mon Sep 17 00:00:00 2001 From: Izer Onadim Date: Wed, 27 Sep 2023 09:08:36 +0100 Subject: [PATCH 35/41] Pin asv<0.6 due to API change --- .github/workflows/ci.yml | 2 +- environment.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ec436927..a063842c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -160,7 +160,7 @@ jobs: - name: Running benchmarks run: | asv --config ./asv_bench/asv.conf.json machine 
--machine github --os unknown --arch unknown --cpu unknown --ram unknown - asv --config ./asv_bench/asv.conf.json run -E existing:same | sed "/failed$/ s/^/##[error]/" | tee benchmarks.log + asv --config ./asv_bench/asv.conf.json dev | sed "/failed$/ s/^/##[error]/" | tee benchmarks.log if grep "failed" benchmarks.log > /dev/null ; then exit 1 fi diff --git a/environment.yml b/environment.yml index 5b88d959..2773e990 100644 --- a/environment.yml +++ b/environment.yml @@ -37,6 +37,6 @@ dependencies: # CLI - ipython # ASV // Benchmark - - asv + - asv<0.6 # Packaging infrastructure - python-build From 7a7d1b3daa7782ae0c1e229ee044bf9a95f274d2 Mon Sep 17 00:00:00 2001 From: Izer Onadim Date: Wed, 27 Sep 2023 09:12:29 +0100 Subject: [PATCH 36/41] Pin pyarrow>=4 due to pandas 2 incompatibility --- docs/environment-docs.yml | 2 +- environment.yml | 2 +- setup.cfg | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/environment-docs.yml b/docs/environment-docs.yml index 21ad1776..5fe39c58 100644 --- a/docs/environment-docs.yml +++ b/docs/environment-docs.yml @@ -9,7 +9,7 @@ dependencies: # Currently dask and numpy==1.16.0 clash - numpy!=1.15.0,!=1.16.0 - pandas>=0.23.0,!=1.0.0,<2.1.0 - - pyarrow>=0.17.1,!=1.0.0 + - pyarrow>=4 - simplejson - minimalkv - toolz diff --git a/environment.yml b/environment.yml index 2773e990..99a00260 100644 --- a/environment.yml +++ b/environment.yml @@ -11,7 +11,7 @@ dependencies: # TODO: add support for numpy>=1.23 - numpy!=1.15.0,!=1.16.0 - pandas>=0.23.0,!=1.0.0,<2.1.0 - - pyarrow>=0.17.1,!=1.0.0 + - pyarrow>=4 - simplejson - minimalkv>=1.4.2 - toolz diff --git a/setup.cfg b/setup.cfg index 00579a8b..c3581723 100644 --- a/setup.cfg +++ b/setup.cfg @@ -19,7 +19,7 @@ install_requires = # Currently dask and numpy==1.16.0 clash numpy!=1.15.0,!=1.16.0 pandas>=0.23.0,!=1.0.0,<2.1.0 - pyarrow>=0.17.1,!=1.0.0 + pyarrow>=4 simplejson minimalkv>=1.4.2 toolz From 9b627d2b7663d9aeae9f64c84a5a1b5f96601e61 Mon Sep 17 00:00:00 2001 From: Izer Onadim Date: Wed, 27 Sep 2023 09:33:57 +0100 Subject: [PATCH 37/41] Pin asv during micromamba install --- .github/workflows/ci.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a063842c..fbbe51bd 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -121,7 +121,8 @@ jobs: python=${{ matrix.PYTHON_VERSION }} - name: Install Pyarrow (non-nightly) # Don't pin python as older versions of pyarrow require older versions of python - run: micromamba install -y --no-py-pin pyarrow==${{ matrix.pyarrow }} "pandas<2.1.0" + # Pin asv so it doesn't get updated before the benchmarks are run + run: micromamba install -y --no-py-pin pyarrow==${{ matrix.pyarrow }} "pandas<2.1.0" "asv<0.6" if: matrix.pyarrow != 'nightly' && matrix.pandas == '' - name: Install Pyarrow (nightly) # Install both arrow-cpp and pyarrow to make sure that we have the From 5c1b49feb70c4b2b2f3279fe7a10c6c0eaeae324 Mon Sep 17 00:00:00 2001 From: Izer Onadim <143251429+IzerOnadimQC@users.noreply.github.com> Date: Thu, 28 Sep 2023 10:03:07 +0100 Subject: [PATCH 38/41] Remove square bracket notation from environment-docs.yml Co-authored-by: Jan Tilly --- docs/environment-docs.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/environment-docs.yml b/docs/environment-docs.yml index 5fe39c58..db25109f 100644 --- a/docs/environment-docs.yml +++ b/docs/environment-docs.yml @@ -3,7 +3,7 @@ channels: - conda-forge dependencies: - python>=3.8 - - 
dask[dataframe]<2023.9.2 + - dask<2023.9.2 - decorator - msgpack-python>=0.5.2 # Currently dask and numpy==1.16.0 clash From 66bae069e8bdb7e20ae9b20ee1a5d0205e62d45e Mon Sep 17 00:00:00 2001 From: Izer Onadim Date: Thu, 28 Sep 2023 10:04:22 +0100 Subject: [PATCH 39/41] Remove square bracket notation for dask in setup.cfg --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index c3581723..42f82803 100644 --- a/setup.cfg +++ b/setup.cfg @@ -13,7 +13,7 @@ classifiers = [options] include_package_data = true install_requires = - dask[dataframe]!=2021.5.1,!=2021.6.0,<2023.9.2 # gh475 - 2021.5.1 and 2021.6.0 broke ci, omit those versions + dask!=2021.5.1,!=2021.6.0,<2023.9.2 # gh475 - 2021.5.1 and 2021.6.0 broke ci, omit those versions decorator msgpack>=0.5.2 # Currently dask and numpy==1.16.0 clash From d27900d9b35ecbf7350eb0a545ed81290ce5c900 Mon Sep 17 00:00:00 2001 From: Izer Onadim Date: Thu, 28 Sep 2023 11:12:35 +0100 Subject: [PATCH 40/41] Refactor PYARROW_LT_13 condition to remove repeated code --- plateau/core/common_metadata.py | 15 +++++------- plateau/core/index.py | 32 ++++++++++--------------- plateau/serialization/_csv.py | 11 ++++----- plateau/serialization/_parquet.py | 39 +++++++++++-------------------- 4 files changed, 37 insertions(+), 60 deletions(-) diff --git a/plateau/core/common_metadata.py b/plateau/core/common_metadata.py index c5433f6c..efc72849 100644 --- a/plateau/core/common_metadata.py +++ b/plateau/core/common_metadata.py @@ -766,15 +766,12 @@ def empty_dataframe_from_schema( # HACK: Cast bytes to object in metadata until Pandas bug is fixed: https://github.com/pandas-dev/pandas/issues/50127 schema = schema_metadata_bytes_to_object(schema.internal()) - if PYARROW_LT_13: - # Prior to pyarrow 13.0.0 coerce_temporal_nanoseconds didn't exist - # as it was introduced for backwards compatibility with pandas 1.x - df = schema.empty_table().to_pandas(date_as_object=date_as_object) - else: - df = schema.empty_table().to_pandas( - date_as_object=date_as_object, - coerce_temporal_nanoseconds=coerce_temporal_nanoseconds, - ) + # Prior to pyarrow 13.0.0 coerce_temporal_nanoseconds didn't exist + # as it was introduced for backwards compatibility with pandas 1.x + _coerce = {} + if not PYARROW_LT_13: + _coerce["coerce_temporal_nanoseconds"] = coerce_temporal_nanoseconds + df = schema.empty_table().to_pandas(date_as_object=date_as_object, **_coerce) df.columns = df.columns.map(ensure_string_type) if columns is not None: diff --git a/plateau/core/index.py b/plateau/core/index.py index d6a79aeb..53aa904a 100644 --- a/plateau/core/index.py +++ b/plateau/core/index.py @@ -148,13 +148,11 @@ def observed_values( # Prior to pyarrow 13.0.0 coerce_temporal_nanoseconds didn't exist # as it was introduced for backwards compatibility with pandas 1.x + _coerce = {} + if not PYARROW_LT_13: + _coerce["coerce_temporal_nanoseconds"] = coerce_temporal_nanoseconds return np.array( - labeled_array.to_pandas(date_as_object=date_as_object) - if PYARROW_LT_13 - else labeled_array.to_pandas( - date_as_object=date_as_object, - coerce_temporal_nanoseconds=coerce_temporal_nanoseconds, - ) + labeled_array.to_pandas(date_as_object=date_as_object, **_coerce) ) @staticmethod @@ -491,14 +489,10 @@ def as_flat_series( table = _index_dct_to_table( self.index_dct, column=self.column, dtype=self.dtype ) - if PYARROW_LT_13: - # Prior to pyarrow 13.0.0 coerce_temporal_nanoseconds didn't exist - # as it was introduced for backwards compatibility with pandas 
1.x - df = table.to_pandas(date_as_object=date_as_object) - else: - df = table.to_pandas( - date_as_object=date_as_object, coerce_temporal_nanoseconds=True - ) + # Prior to pyarrow 13.0.0 coerce_temporal_nanoseconds didn't exist + # as it was introduced for backwards compatibility with pandas 1.x + _coerce = {} if PYARROW_LT_13 else {"coerce_temporal_nanoseconds": True} + df = table.to_pandas(date_as_object=date_as_object, **_coerce) if predicates is not None: # If there is a conjunction without any reference to the index @@ -884,12 +878,10 @@ def _parquet_bytes_to_dict(column: str, index_buffer: bytes): if column_type == pa.timestamp("us"): column_type = pa.timestamp("ns") - if PYARROW_LT_13: - # Prior to pyarrow 13.0.0 coerce_temporal_nanoseconds didn't exist - # as it was introduced for backwards compatibility with pandas 1.x - df = table.to_pandas() - else: - df = table.to_pandas(coerce_temporal_nanoseconds=True) + # Prior to pyarrow 13.0.0 coerce_temporal_nanoseconds didn't exist + # as it was introduced for backwards compatibility with pandas 1.x + _coerce = {} if PYARROW_LT_13 else {"coerce_temporal_nanoseconds": True} + df = table.to_pandas(**_coerce) index_dct = dict( zip(df[column].values, (list(x) for x in df[_PARTITION_COLUMN_NAME].values)) diff --git a/plateau/serialization/_csv.py b/plateau/serialization/_csv.py index f8be37ab..ac4926cc 100644 --- a/plateau/serialization/_csv.py +++ b/plateau/serialization/_csv.py @@ -88,12 +88,11 @@ def restore_dataframe( def store(self, store, key_prefix, df): if isinstance(df, pa.Table): - if PYARROW_LT_13: - # Prior to pyarrow 13.0.0 coerce_temporal_nanoseconds didn't exist - # as it was introduced for backwards compatibility with pandas 1.x - df = df.to_pandas() - else: - df = df.to_pandas(coerce_temporal_nanoseconds=True) + # Prior to pyarrow 13.0.0 coerce_temporal_nanoseconds didn't exist + # as it was introduced for backwards compatibility with pandas 1.x + _coerce = {} if PYARROW_LT_13 else {"coerce_temporal_nanoseconds": True} + df = df.to_pandas(**_coerce) + key = f"{key_prefix}.csv" result_stream = BytesIO() iostream: BufferedIOBase diff --git a/plateau/serialization/_parquet.py b/plateau/serialization/_parquet.py index e70b9cee..b7391d27 100644 --- a/plateau/serialization/_parquet.py +++ b/plateau/serialization/_parquet.py @@ -256,24 +256,17 @@ def _restore_dataframe( else: # ARROW-5139 Column projection with empty columns returns a table w/out index if columns == []: + # Prior to pyarrow 13.0.0 coerce_temporal_nanoseconds didn't exist + # as it was introduced for backwards compatibility with pandas 1.x + _coerce = {} + if not PYARROW_LT_13: + _coerce["coerce_temporal_nanoseconds"] = True # Create an arrow table with expected index length. 
- if PYARROW_LT_13: - # Prior to pyarrow 13.0.0 coerce_temporal_nanoseconds didn't exist - # as it was introduced for backwards compatibility with pandas 1.x - df = ( - parquet_file.schema.to_arrow_schema() - .empty_table() - .to_pandas(date_as_object=date_as_object) - ) - else: - df = ( - parquet_file.schema.to_arrow_schema() - .empty_table() - .to_pandas( - date_as_object=date_as_object, - coerce_temporal_nanoseconds=True, - ) - ) + df = ( + parquet_file.schema.to_arrow_schema() + .empty_table() + .to_pandas(date_as_object=date_as_object, **_coerce) + ) index = pd.Index( pd.RangeIndex(start=0, stop=parquet_file.metadata.num_rows), dtype="int64", @@ -300,14 +293,10 @@ def _restore_dataframe( # HACK: Cast bytes to object in metadata until Pandas bug is fixed: https://github.com/pandas-dev/pandas/issues/50127 table = table.cast(schema_metadata_bytes_to_object(table.schema)) - if PYARROW_LT_13: - # Prior to pyarrow 13.0.0 coerce_temporal_nanoseconds didn't exist - # as it was introduced for backwards compatibility with pandas 1.x - df = table.to_pandas(date_as_object=date_as_object) - else: - df = table.to_pandas( - date_as_object=date_as_object, coerce_temporal_nanoseconds=True - ) + # Prior to pyarrow 13.0.0 coerce_temporal_nanoseconds didn't exist + # as it was introduced for backwards compatibility with pandas 1.x + _coerce = {} if PYARROW_LT_13 else {"coerce_temporal_nanoseconds": True} + df = table.to_pandas(date_as_object=date_as_object, **_coerce) # XXX: Patch until Pyarrow bug is resolved: https://issues.apache.org/jira/browse/ARROW-18099?filter=-2 if categories: From 41509b2008afb5d4bc4b98ec4c77914534ad7aa6 Mon Sep 17 00:00:00 2001 From: Izer Onadim Date: Thu, 28 Sep 2023 11:58:33 +0100 Subject: [PATCH 41/41] Add square brackets back to setup.cfg --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 42f82803..c3581723 100644 --- a/setup.cfg +++ b/setup.cfg @@ -13,7 +13,7 @@ classifiers = [options] include_package_data = true install_requires = - dask!=2021.5.1,!=2021.6.0,<2023.9.2 # gh475 - 2021.5.1 and 2021.6.0 broke ci, omit those versions + dask[dataframe]!=2021.5.1,!=2021.6.0,<2023.9.2 # gh475 - 2021.5.1 and 2021.6.0 broke ci, omit those versions decorator msgpack>=0.5.2 # Currently dask and numpy==1.16.0 clash
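
For reference, the version-gated keyword pattern that [PATCH 40/41] settles on can be exercised on its own. The sketch below is not part of the patch series: it assumes the packaging library is available, recomputes a local PYARROW_LT_13 flag instead of importing plateau's internal constant, and the helper name table_to_pandas_compat is purely illustrative.

import pandas as pd
import pyarrow as pa
from packaging.version import parse as parse_version

# pyarrow 13 stopped coercing temporal columns to nanoseconds by default; the
# coerce_temporal_nanoseconds flag restores the pre-13 (pandas 1.x) behaviour.
PYARROW_LT_13 = parse_version(pa.__version__) < parse_version("13.0.0")


def table_to_pandas_compat(table: pa.Table, date_as_object: bool = False) -> pd.DataFrame:
    # Build the extra kwargs once: empty on pyarrow < 13 (where the keyword
    # does not exist and coercion happens anyway), explicit on pyarrow >= 13.
    _coerce = {} if PYARROW_LT_13 else {"coerce_temporal_nanoseconds": True}
    return table.to_pandas(date_as_object=date_as_object, **_coerce)


if __name__ == "__main__":
    # Microsecond timestamps should come back as datetime64[ns] on every
    # supported pyarrow version when run through the shim.
    t = pa.table({"ts": pa.array([1, 2, 3], type=pa.timestamp("us"))})
    print(table_to_pandas_compat(t).dtypes)

On pyarrow >= 13 the explicit flag reproduces the nanosecond units that older releases produced implicitly, which is why the refactor can collapse the duplicated if/else branches into a single **kwargs splat at each to_pandas call site.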