From dd53c60a1d316ad3ec6c2f62699dc8aa70f1bdf1 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 22 Mar 2024 12:35:25 +0000 Subject: [PATCH] Generate correct metadata in to_arrow with preserve_index=True When preserving the index and we have a RangeIndex, we must materialize it, and write that information in the metadata correctly. - Closes #14159 --- python/cudf/cudf/core/dataframe.py | 43 +++++++++++++++++------------- 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index da0a969b70c..2a4f93c1716 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -5485,14 +5485,18 @@ def from_arrow(cls, table): return out @_cudf_nvtx_annotate - def to_arrow(self, preserve_index=True): + def to_arrow(self, preserve_index=None): """ Convert to a PyArrow Table. Parameters ---------- - preserve_index : bool, default True - whether index column and its meta data needs to be saved or not + preserve_index : bool, optional + whether index column and its meta data needs to be saved + or not. The default of None will store the index as a + column, except for a RangeIndex which is stored as + metadata only. Setting preserve_index to True will force + a RangeIndex to be materialized. Returns ------- @@ -5523,34 +5527,35 @@ def to_arrow(self, preserve_index=True): data = self.copy(deep=False) index_descr = [] - if preserve_index: - if isinstance(self.index, cudf.RangeIndex): + write_index = preserve_index is not False + keep_range_index = write_index and preserve_index is None + index = self.index + if write_index: + if isinstance(index, cudf.RangeIndex) and keep_range_index: descr = { "kind": "range", - "name": self.index.name, - "start": self.index._start, - "stop": self.index._stop, + "name": index.name, + "start": index._start, + "stop": index._stop, "step": 1, } else: - if isinstance(self.index, MultiIndex): + if isinstance(index, cudf.RangeIndex): + index = index._as_int_index() + index.name = "__index_level_0__" + if isinstance(index, MultiIndex): gen_names = tuple( - f"level_{i}" - for i, _ in enumerate(self.index._data.names) + f"level_{i}" for i, _ in enumerate(index._data.names) ) else: gen_names = ( - self.index.names - if self.index.name is not None - else ("index",) + index.names if index.name is not None else ("index",) ) - for gen_name, col_name in zip( - gen_names, self.index._data.names - ): + for gen_name, col_name in zip(gen_names, index._data.names): data._insert( data.shape[1], gen_name, - self.index._data[col_name], + index._data[col_name], ) descr = gen_names[0] index_descr.append(descr) @@ -5560,7 +5565,7 @@ def to_arrow(self, preserve_index=True): columns_to_convert=[self[col] for col in self._data.names], df=self, column_names=out.schema.names, - index_levels=[self.index], + index_levels=[index], index_descriptors=index_descr, preserve_index=preserve_index, types=out.schema.types,