Add control of Parquet column index creation to python (#11453)

#11302 added `STATISTICS_COLUMN` to the `statistics_freq` enum in libcudf. This PR exposes the same value in the Python bindings, so that `statistics="COLUMN"` can be passed to the Parquet writers. Authors: - Ed Seidl (https://github.com/etseidl) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Ashwin Srinath (https://github.com/shwina) - Vyas Ramasubramani (https://github.com/vyasr) URL: #11453
rapidsai · Sep 1, 2022 · c273da4 · c273da4
1 parent 7857a30
commit c273da4
Show file tree

Hide file tree

Showing 5 changed files with 27 additions and 3 deletions.
diff --git a/python/cudf/cudf/_lib/cpp/io/types.pxd b/python/cudf/cudf/_lib/cpp/io/types.pxd
@@ -48,6 +48,7 @@ cdef extern from "cudf/io/types.hpp" \
         STATISTICS_NONE = 0,
         STATISTICS_ROWGROUP = 1,
         STATISTICS_PAGE = 2,
+        STATISTICS_COLUMN = 3,
 
     cdef cppclass column_name_info:
         string name

diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx
@@ -454,7 +454,7 @@ cdef class ParquetWriter:
         index(es) other than RangeIndex will be saved as columns.
     compression : {'snappy', None}, default 'snappy'
         Name of the compression to use. Use ``None`` for no compression.
-    statistics : {'ROWGROUP', 'PAGE', 'NONE'}, default 'ROWGROUP'
+    statistics : {'ROWGROUP', 'PAGE', 'COLUMN', 'NONE'}, default 'ROWGROUP'
         Level at which column statistics should be included in file.
     row_group_size_bytes: int, default 134217728
         Maximum size of each stripe of the output.
@@ -659,6 +659,8 @@ cdef cudf_io_types.statistics_freq _get_stat_freq(object statistics):
         return cudf_io_types.statistics_freq.STATISTICS_ROWGROUP
     elif statistics == "PAGE":
         return cudf_io_types.statistics_freq.STATISTICS_PAGE
+    elif statistics == "COLUMN":
+        return cudf_io_types.statistics_freq.STATISTICS_COLUMN
     else:
         raise ValueError("Unsupported `statistics_freq` type")
 

diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py
@@ -861,7 +861,7 @@ class ParquetDatasetWriter:
         index(es) other than RangeIndex will be saved as columns.
     compression : {'snappy', None}, default 'snappy'
         Name of the compression to use. Use ``None`` for no compression.
-    statistics : {'ROWGROUP', 'PAGE', 'NONE'}, default 'ROWGROUP'
+    statistics : {'ROWGROUP', 'PAGE', 'COLUMN', 'NONE'}, default 'ROWGROUP'
         Level at which column statistics should be included in file.
     max_file_size : int or str, default None
         A file size that cannot be exceeded by the writer.

diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
@@ -1564,6 +1564,27 @@ def test_parquet_writer_row_group_size(tmpdir, row_group_size_kwargs):
     assert_eq(cudf.read_parquet(fname), gdf)
 
 
+def test_parquet_writer_column_index(tmpdir):
+    # Smoke test for the presence of the column index only; its validity
+    # is checked in the libcudf tests.
+    # Write the same frame twice, once with statistics="ROWGROUP" and once
+    # with statistics="COLUMN", and check that the latter file is larger.
+
+    size = 20000
+    gdf = cudf.DataFrame({"a": range(size), "b": [1] * size})
+
+    fname = tmpdir.join("gdf.parquet")
+    with ParquetWriter(fname, statistics="ROWGROUP") as writer:
+        writer.write_table(gdf)
+    s1 = os.path.getsize(fname)
+
+    fname = tmpdir.join("gdfi.parquet")
+    with ParquetWriter(fname, statistics="COLUMN") as writer:
+        writer.write_table(gdf)
+    s2 = os.path.getsize(fname)
+    assert s2 > s1
+
+
 @pytest.mark.parametrize(
     "max_page_size_kwargs",
     [

diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py
@@ -228,7 +228,7 @@
 partition_offsets : list, optional, default None
     Offsets to partition the dataframe by. Should be used when path is list
     of str. Should be a list of integers of size ``len(path) + 1``
-statistics : {'ROWGROUP', 'PAGE', 'NONE'}, default 'ROWGROUP'
+statistics : {'ROWGROUP', 'PAGE', 'COLUMN', 'NONE'}, default 'ROWGROUP'
     Level at which column statistics should be included in file.
 metadata_file_path : str, optional, default None
     If specified, this function will return a binary blob containing the footer