diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx
index 0158df46cc4..d8b4fbbbe4b 100644
--- a/python/cudf/cudf/_lib/parquet.pyx
+++ b/python/cudf/cudf/_lib/parquet.pyx
@@ -294,7 +294,9 @@ cpdef write_parquet(
     cdef unique_ptr[cudf_io_types.data_sink] _data_sink
     cdef cudf_io_types.sink_info sink = make_sink_info(path, _data_sink)
 
-    if index is not False and not isinstance(table._index, cudf.RangeIndex):
+    if index is True or (
+        index is None and not isinstance(table._index, cudf.RangeIndex)
+    ):
         tv = table.view()
         tbl_meta = make_unique[table_input_metadata](tv)
         for level, idx_name in enumerate(table._index.names):
diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx
index 4fe795e57a9..13eedb34c18 100644
--- a/python/cudf/cudf/_lib/utils.pyx
+++ b/python/cudf/cudf/_lib/utils.pyx
@@ -99,15 +99,31 @@ cpdef generate_pandas_metadata(Table table, index):
                 idx = table.index
 
             if isinstance(idx, cudf.core.index.RangeIndex):
-                descr = {
-                    "kind": "range",
-                    "name": table.index.name,
-                    "start": table.index.start,
-                    "stop": table.index.stop,
-                    "step": table.index.step,
-                }
+                if index is None:
+                    descr = {
+                        "kind": "range",
+                        "name": table.index.name,
+                        "start": table.index.start,
+                        "stop": table.index.stop,
+                        "step": table.index.step,
+                    }
+                else:
+                    # When `index=True`, RangeIndex needs to be materialized.
+                    materialized_idx = cudf.Index(idx._values, name=idx.name)
+                    descr = \
+                        _index_level_name(
+                            index_name=materialized_idx.name,
+                            level=level,
+                            column_names=col_names
+                        )
+                    index_levels.append(materialized_idx)
             else:
-                descr = _index_level_name(idx.name, level, col_names)
+                descr = \
+                    _index_level_name(
+                        index_name=idx.name,
+                        level=level,
+                        column_names=col_names
+                    )
             if is_categorical_dtype(idx):
                 raise ValueError(
                     "'category' column dtypes are currently not "
diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index a7a11c95e30..fe418d1ade1 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -1,4 +1,5 @@
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
+
 import datetime
 import math
 import os
@@ -1718,24 +1719,24 @@ def test_parquet_nullable_boolean(tmpdir, engine):
     ],
 )
 @pytest.mark.parametrize("index", [None, True, False])
-def test_parquet_index(tmpdir, pdf, index):
-    pandas_path = tmpdir.join("pandas_index.parquet")
-    cudf_path = tmpdir.join("pandas_index.parquet")
+def test_parquet_index(pdf, index):
+    pandas_buffer = BytesIO()
+    cudf_buffer = BytesIO()
 
     gdf = cudf.from_pandas(pdf)
 
-    pdf.to_parquet(pandas_path, index=index)
-    gdf.to_parquet(cudf_path, index=index)
+    pdf.to_parquet(pandas_buffer, index=index)
+    gdf.to_parquet(cudf_buffer, index=index)
 
-    expected = pd.read_parquet(cudf_path)
-    actual = cudf.read_parquet(cudf_path)
+    expected = pd.read_parquet(cudf_buffer)
+    actual = cudf.read_parquet(pandas_buffer)
 
-    assert_eq(expected, actual)
+    assert_eq(expected, actual, check_index_type=True)
 
-    expected = pd.read_parquet(pandas_path)
-    actual = cudf.read_parquet(pandas_path)
+    expected = pd.read_parquet(pandas_buffer)
+    actual = cudf.read_parquet(cudf_buffer)
 
-    assert_eq(expected, actual)
+    assert_eq(expected, actual, check_index_type=True)
 
 
 @pytest.mark.parametrize("engine", ["cudf", "pyarrow"])
diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py
index 5d52d6c7da4..16511627aa2 100644
--- a/python/cudf/cudf/utils/ioutils.py
+++ b/python/cudf/cudf/utils/ioutils.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
 
 import datetime
 import os
@@ -193,7 +193,10 @@
 index : bool, default None
     If ``True``, include the dataframe's index(es) in the file output.
     If ``False``, they will not be written to the file. If ``None``, the
-    engine's default behavior will be used.
+    engine's default behavior will be used. However, instead of being saved
+    as values, the ``RangeIndex`` will be stored as a range in the metadata
+    so it doesn't require much space and is faster. Other indexes will
+    be included as columns in the file output.
 partition_cols : list, optional, default None
     Column names by which to partition the dataset
     Columns are partitioned in the order they are given
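Below is a minimal usage sketch of the behavior this change targets, assuming a cudf build that includes it; the DataFrame, buffers, and expected column names are illustrative only and are not part of the diff.

# Minimal sketch of the index handling described above (assumes a cudf build
# containing this change; the data and printed schema names are illustrative).
from io import BytesIO

import cudf
import pyarrow.parquet as pq

df = cudf.DataFrame({"a": [1, 2, 3]})  # carries a default RangeIndex

buffers = {}
for index in (None, True, False):
    buf = BytesIO()
    # index=None: the RangeIndex is stored as range metadata, not as a column.
    # index=True: the RangeIndex is materialized and written as an index column.
    # index=False: no index information is written at all.
    df.to_parquet(buf, index=index)
    buf.seek(0)
    buffers[index] = buf

for index, buf in buffers.items():
    # Only index=True should add a physical index column (e.g.
    # "__index_level_0__") to the Parquet schema.
    print(index, pq.read_schema(buf).names)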