Materialize RangeIndex when index=True in parquet writer (#7711)

Resolves: #6873. This PR adds support for materializing a `RangeIndex` when `index=True`. No new tests were added because `test_parquet_index` already covers this case; however, that test was incorrect due to a typo that wrote both the pandas and cudf dataframes to the same file. The test is fixed in this PR. Authors: - GALI PREM SAGAR (@galipremsagar) Approvers: - Keith Kraus (@kkraus14) URL: #7711
rapidsai · Mar 24, 2021 · 1417297 · 1417297
1 parent df3c0f0
commit 1417297
Show file tree

Hide file tree

Showing 4 changed files with 45 additions and 23 deletions.
diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx
@@ -294,7 +294,9 @@ cpdef write_parquet(
     cdef unique_ptr[cudf_io_types.data_sink] _data_sink
     cdef cudf_io_types.sink_info sink = make_sink_info(path, _data_sink)
 
-    if index is not False and not isinstance(table._index, cudf.RangeIndex):
+    if index is True or (
+        index is None and not isinstance(table._index, cudf.RangeIndex)
+    ):
         tv = table.view()
         tbl_meta = make_unique[table_input_metadata](tv)
         for level, idx_name in enumerate(table._index.names):

diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx
@@ -99,15 +99,31 @@ cpdef generate_pandas_metadata(Table table, index):
                 idx = table.index
 
             if isinstance(idx, cudf.core.index.RangeIndex):
-                descr = {
-                    "kind": "range",
-                    "name": table.index.name,
-                    "start": table.index.start,
-                    "stop": table.index.stop,
-                    "step": table.index.step,
-                }
+                if index is None:
+                    descr = {
+                        "kind": "range",
+                        "name": table.index.name,
+                        "start": table.index.start,
+                        "stop": table.index.stop,
+                        "step": table.index.step,
+                    }
+                else:
+                    # When `index=True`, RangeIndex needs to be materialized.
+                    materialized_idx = cudf.Index(idx._values, name=idx.name)
+                    descr = \
+                        _index_level_name(
+                            index_name=materialized_idx.name,
+                            level=level,
+                            column_names=col_names
+                        )
+                    index_levels.append(materialized_idx)
             else:
-                descr = _index_level_name(idx.name, level, col_names)
+                descr = \
+                    _index_level_name(
+                        index_name=idx.name,
+                        level=level,
+                        column_names=col_names
+                    )
                 if is_categorical_dtype(idx):
                     raise ValueError(
                         "'category' column dtypes are currently not "

diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
@@ -1,4 +1,5 @@
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
+
 import datetime
 import math
 import os
@@ -1718,24 +1719,24 @@ def test_parquet_nullable_boolean(tmpdir, engine):
     ],
 )
 @pytest.mark.parametrize("index", [None, True, False])
-def test_parquet_index(tmpdir, pdf, index):
-    pandas_path = tmpdir.join("pandas_index.parquet")
-    cudf_path = tmpdir.join("pandas_index.parquet")
+def test_parquet_index(pdf, index):
+    pandas_buffer = BytesIO()
+    cudf_buffer = BytesIO()
 
     gdf = cudf.from_pandas(pdf)
 
-    pdf.to_parquet(pandas_path, index=index)
-    gdf.to_parquet(cudf_path, index=index)
+    pdf.to_parquet(pandas_buffer, index=index)
+    gdf.to_parquet(cudf_buffer, index=index)
 
-    expected = pd.read_parquet(cudf_path)
-    actual = cudf.read_parquet(cudf_path)
+    expected = pd.read_parquet(cudf_buffer)
+    actual = cudf.read_parquet(pandas_buffer)
 
-    assert_eq(expected, actual)
+    assert_eq(expected, actual, check_index_type=True)
 
-    expected = pd.read_parquet(pandas_path)
-    actual = cudf.read_parquet(pandas_path)
+    expected = pd.read_parquet(pandas_buffer)
+    actual = cudf.read_parquet(cudf_buffer)
 
-    assert_eq(expected, actual)
+    assert_eq(expected, actual, check_index_type=True)
 
 
 @pytest.mark.parametrize("engine", ["cudf", "pyarrow"])

diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
 
 import datetime
 import os
@@ -193,7 +193,10 @@
 index : bool, default None
     If ``True``, include the dataframe's index(es) in the file output. If
     ``False``, they will not be written to the file. If ``None``, the
-    engine's default behavior will be used.
+    engine's default behavior will be used: a ``RangeIndex`` is not saved
+    as values but stored as a range in the metadata, which requires less
+    space and is faster, while other indexes are included as columns in
+    the file output.
 partition_cols : list, optional, default None
     Column names by which to partition the dataset
     Columns are partitioned in the order they are given