Skip to content

Commit

Permalink
fix index materialization issue in parquet writer
Browse files Browse the repository at this point in the history
  • Loading branch information
galipremsagar committed Mar 24, 2021
1 parent 267d29b commit ce784bb
Show file tree
Hide file tree
Showing 4 changed files with 45 additions and 23 deletions.
4 changes: 3 additions & 1 deletion python/cudf/cudf/_lib/parquet.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -294,7 +294,9 @@ cpdef write_parquet(
cdef unique_ptr[cudf_io_types.data_sink] _data_sink
cdef cudf_io_types.sink_info sink = make_sink_info(path, _data_sink)

if index is not False and not isinstance(table._index, cudf.RangeIndex):
if index is True or (
index is None and not isinstance(table._index, cudf.RangeIndex)
):
tv = table.view()
tbl_meta = make_unique[table_input_metadata](tv)
for level, idx_name in enumerate(table._index.names):
Expand Down
32 changes: 24 additions & 8 deletions python/cudf/cudf/_lib/utils.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -99,15 +99,31 @@ cpdef generate_pandas_metadata(Table table, index):
idx = table.index

if isinstance(idx, cudf.core.index.RangeIndex):
descr = {
"kind": "range",
"name": table.index.name,
"start": table.index.start,
"stop": table.index.stop,
"step": table.index.step,
}
if index is None:
descr = {
"kind": "range",
"name": table.index.name,
"start": table.index.start,
"stop": table.index.stop,
"step": table.index.step,
}
else:
# When `index=True`, RangeIndex needs to be materialized.
materialized_idx = cudf.Index(idx._values, name=idx.name)
descr = \
_index_level_name(
index_name=materialized_idx.name,
level=level,
column_names=col_names
)
index_levels.append(materialized_idx)
else:
descr = _index_level_name(idx.name, level, col_names)
descr = \
_index_level_name(
index_name=idx.name,
level=level,
column_names=col_names
)
if is_categorical_dtype(idx):
raise ValueError(
"'category' column dtypes are currently not "
Expand Down
25 changes: 13 additions & 12 deletions python/cudf/cudf/tests/test_parquet.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# Copyright (c) 2019-2020, NVIDIA CORPORATION.
# Copyright (c) 2019-2021, NVIDIA CORPORATION.

import datetime
import math
import os
Expand Down Expand Up @@ -1718,24 +1719,24 @@ def test_parquet_nullable_boolean(tmpdir, engine):
],
)
@pytest.mark.parametrize("index", [None, True, False])
def test_parquet_index(tmpdir, pdf, index):
pandas_path = tmpdir.join("pandas_index.parquet")
cudf_path = tmpdir.join("pandas_index.parquet")
def test_parquet_index(pdf, index):
pandas_buffer = BytesIO()
cudf_buffer = BytesIO()

gdf = cudf.from_pandas(pdf)

pdf.to_parquet(pandas_path, index=index)
gdf.to_parquet(cudf_path, index=index)
pdf.to_parquet(pandas_buffer, index=index)
gdf.to_parquet(cudf_buffer, index=index)

expected = pd.read_parquet(cudf_path)
actual = cudf.read_parquet(cudf_path)
expected = pd.read_parquet(cudf_buffer)
actual = cudf.read_parquet(pandas_buffer)

assert_eq(expected, actual)
assert_eq(expected, actual, check_index_type=True)

expected = pd.read_parquet(pandas_path)
actual = cudf.read_parquet(pandas_path)
expected = pd.read_parquet(pandas_buffer)
actual = cudf.read_parquet(cudf_buffer)

assert_eq(expected, actual)
assert_eq(expected, actual, check_index_type=True)


@pytest.mark.parametrize("engine", ["cudf", "pyarrow"])
Expand Down
7 changes: 5 additions & 2 deletions python/cudf/cudf/utils/ioutils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2019-2020, NVIDIA CORPORATION.
# Copyright (c) 2019-2021, NVIDIA CORPORATION.

import datetime
import os
Expand Down Expand Up @@ -193,7 +193,10 @@
index : bool, default None
If ``True``, include the dataframe's index(es) in the file output. If
``False``, they will not be written to the file. If ``None``, the
engine's default behavior will be used.
engine's default behavior will be used. However, instead of being saved
as values, a ``RangeIndex`` will be stored as a range (start, stop, step)
in the file metadata, which requires very little space and is faster to
read back. Other index types will be included as columns in the file
output.
partition_cols : list, optional, default None
Column names by which to partition the dataset
Columns are partitioned in the order they are given
Expand Down

0 comments on commit ce784bb

Please sign in to comment.