Skip to content

Commit

Permalink
Add control of Parquet column index creation to python (#11453)
Browse files Browse the repository at this point in the history
#11302 added `STATISTICS_COLUMN` to the `statistics_freq` enum in libcudf. This change exposes the same option in Python.

Authors:
  - Ed Seidl (https://github.com/etseidl)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Ashwin Srinath (https://github.com/shwina)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: #11453
  • Loading branch information
etseidl authored Sep 1, 2022
1 parent 7857a30 commit c273da4
Show file tree
Hide file tree
Showing 5 changed files with 27 additions and 3 deletions.
1 change: 1 addition & 0 deletions python/cudf/cudf/_lib/cpp/io/types.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ cdef extern from "cudf/io/types.hpp" \
STATISTICS_NONE = 0,
STATISTICS_ROWGROUP = 1,
STATISTICS_PAGE = 2,
STATISTICS_COLUMN = 3,

cdef cppclass column_name_info:
string name
Expand Down
4 changes: 3 additions & 1 deletion python/cudf/cudf/_lib/parquet.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -454,7 +454,7 @@ cdef class ParquetWriter:
index(es) other than RangeIndex will be saved as columns.
compression : {'snappy', None}, default 'snappy'
Name of the compression to use. Use ``None`` for no compression.
statistics : {'ROWGROUP', 'PAGE', 'NONE'}, default 'ROWGROUP'
statistics : {'ROWGROUP', 'PAGE', 'COLUMN', 'NONE'}, default 'ROWGROUP'
Level at which column statistics should be included in file.
row_group_size_bytes: int, default 134217728
Maximum size of each row group of the output.
Expand Down Expand Up @@ -659,6 +659,8 @@ cdef cudf_io_types.statistics_freq _get_stat_freq(object statistics):
return cudf_io_types.statistics_freq.STATISTICS_ROWGROUP
elif statistics == "PAGE":
return cudf_io_types.statistics_freq.STATISTICS_PAGE
elif statistics == "COLUMN":
return cudf_io_types.statistics_freq.STATISTICS_COLUMN
else:
raise ValueError("Unsupported `statistics_freq` type")

Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/io/parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -861,7 +861,7 @@ class ParquetDatasetWriter:
index(es) other than RangeIndex will be saved as columns.
compression : {'snappy', None}, default 'snappy'
Name of the compression to use. Use ``None`` for no compression.
statistics : {'ROWGROUP', 'PAGE', 'NONE'}, default 'ROWGROUP'
statistics : {'ROWGROUP', 'PAGE', 'COLUMN', 'NONE'}, default 'ROWGROUP'
Level at which column statistics should be included in file.
max_file_size : int or str, default None
A file size that cannot be exceeded by the writer.
Expand Down
21 changes: 21 additions & 0 deletions python/cudf/cudf/tests/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -1564,6 +1564,27 @@ def test_parquet_writer_row_group_size(tmpdir, row_group_size_kwargs):
assert_eq(cudf.read_parquet(fname), gdf)


def test_parquet_writer_column_index(tmpdir):
    """Smoke-test that requesting column indexes changes the written file.

    The same frame is written twice: once with ``statistics="ROWGROUP"``
    and once with ``statistics="COLUMN"``. The second file is expected to
    be strictly larger on disk. Only the presence of the extra data is
    checked here; the validity of the indexes is checked in the libcudf
    tests.
    """
    num_rows = 20000
    gdf = cudf.DataFrame({"a": range(num_rows), "b": [1] * num_rows})

    def _written_size(basename, statistics):
        # Write ``gdf`` to ``basename`` under tmpdir and return its size in
        # bytes; the writer is closed before the size is measured.
        path = tmpdir.join(basename)
        with ParquetWriter(path, statistics=statistics) as writer:
            writer.write_table(gdf)
        return os.path.getsize(path)

    size_without_index = _written_size("gdf.parquet", "ROWGROUP")
    size_with_index = _written_size("gdfi.parquet", "COLUMN")
    assert size_with_index > size_without_index


@pytest.mark.parametrize(
"max_page_size_kwargs",
[
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/utils/ioutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,7 @@
partition_offsets : list, optional, default None
Offsets to partition the dataframe by. Should be used when path is list
of str. Should be a list of integers of size ``len(path) + 1``
statistics : {'ROWGROUP', 'PAGE', 'NONE'}, default 'ROWGROUP'
statistics : {'ROWGROUP', 'PAGE', 'COLUMN', 'NONE'}, default 'ROWGROUP'
Level at which column statistics should be included in file.
metadata_file_path : str, optional, default None
If specified, this function will return a binary blob containing the footer
Expand Down

0 comments on commit c273da4

Please sign in to comment.