Control Parquet page size through Python API #11454

Merged · 20 commits · Aug 16, 2022
Changes from all commits

Commits (20):
4efe9c8 add page size controls to python (etseidl, Jun 20, 2022)
ff964d9 Merge branch 'rapidsai:branch-22.08' into feature/python_page_size (etseidl, Jun 21, 2022)
25b3c0f Merge branch 'rapidsai:branch-22.08' into feature/python_page_size (etseidl, Jun 21, 2022)
d351186 Merge branch 'rapidsai:branch-22.08' into feature/python_page_size (etseidl, Jun 22, 2022)
f6c2401 Merge branch 'rapidsai:branch-22.08' into feature/python_page_size (etseidl, Jun 22, 2022)
725bda6 Merge branch 'rapidsai:branch-22.08' into feature/python_page_size (etseidl, Jun 22, 2022)
40344c4 add test of max_page_size params (etseidl, Jun 22, 2022)
f2ca83a Merge branch 'rapidsai:branch-22.08' into feature/python_page_size (etseidl, Jun 22, 2022)
086c9e9 Merge branch 'rapidsai:branch-22.08' into feature/python_page_size (etseidl, Jun 23, 2022)
6082987 Merge branch 'rapidsai:branch-22.08' into feature/python_page_size (etseidl, Jun 23, 2022)
3462a92 Merge branch 'rapidsai:branch-22.08' into feature/python_page_size (etseidl, Jun 24, 2022)
811fd42 Merge branch 'rapidsai:branch-22.08' into feature/python_page_size (etseidl, Jun 25, 2022)
19e94d2 Merge branch 'rapidsai:branch-22.08' into feature/python_page_size (etseidl, Jun 27, 2022)
5a03dd1 Merge branch 'rapidsai:branch-22.08' into feature/python_page_size (etseidl, Jul 13, 2022)
c5e8ab0 Merge branch 'rapidsai:branch-22.08' into feature/python_page_size (etseidl, Jul 13, 2022)
12984ab Merge branch 'rapidsai:branch-22.08' into feature/python_page_size (etseidl, Jul 22, 2022)
6a63b32 Merge branch 'rapidsai:branch-22.08' into feature/python_page_size (etseidl, Jul 29, 2022)
7510c5c Merge branch 'rapidsai:branch-22.08' into feature/python_page_size (etseidl, Aug 1, 2022)
a41bee5 Merge branch 'rapidsai:branch-22.10' into feature/python_page_size (etseidl, Aug 3, 2022)
dbbc157 Merge branch 'rapidsai:branch-22.10' into feature/python_page_size (etseidl, Aug 3, 2022)
20 changes: 20 additions & 0 deletions python/cudf/cudf/_lib/cpp/io/parquet.pxd
@@ -75,6 +75,8 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
string get_column_chunks_file_paths() except+
size_t get_row_group_size_bytes() except+
size_type get_row_group_size_rows() except+
size_t get_max_page_size_bytes() except+
size_type get_max_page_size_rows() except+

void set_partitions(
vector[cudf_io_types.partition_info] partitions
@@ -96,6 +98,8 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
) except +
void set_row_group_size_bytes(size_t val) except+
void set_row_group_size_rows(size_type val) except+
void set_max_page_size_bytes(size_t val) except+
void set_max_page_size_rows(size_type val) except+

@staticmethod
parquet_writer_options_builder builder(
@@ -137,6 +141,12 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
parquet_writer_options_builder& row_group_size_rows(
size_type val
) except+
parquet_writer_options_builder& max_page_size_bytes(
size_t val
) except+
parquet_writer_options_builder& max_page_size_rows(
size_type val
) except+

parquet_writer_options build() except +

@@ -153,6 +163,8 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
) except+
size_t get_row_group_size_bytes() except+
size_type get_row_group_size_rows() except+
size_t get_max_page_size_bytes() except+
size_type get_max_page_size_rows() except+

void set_metadata(
cudf_io_types.table_input_metadata *m
@@ -168,6 +180,8 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
) except +
void set_row_group_size_bytes(size_t val) except+
void set_row_group_size_rows(size_type val) except+
void set_max_page_size_bytes(size_t val) except+
void set_max_page_size_rows(size_type val) except+

@staticmethod
chunked_parquet_writer_options_builder builder(
@@ -197,6 +211,12 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
chunked_parquet_writer_options_builder& row_group_size_rows(
size_type val
) except+
chunked_parquet_writer_options_builder& max_page_size_bytes(
size_t val
) except+
chunked_parquet_writer_options_builder& max_page_size_rows(
size_type val
) except+

chunked_parquet_writer_options build() except +

22 changes: 21 additions & 1 deletion python/cudf/cudf/_lib/parquet.pyx
@@ -328,6 +328,8 @@ cpdef write_parquet(
object int96_timestamps=False,
object row_group_size_bytes=None,
object row_group_size_rows=None,
object max_page_size_bytes=None,
object max_page_size_rows=None,
object partitions_info=None):
"""
Cython function to call into libcudf API, see `write_parquet`.
@@ -426,6 +428,10 @@
args.set_row_group_size_bytes(row_group_size_bytes)
if row_group_size_rows is not None:
args.set_row_group_size_rows(row_group_size_rows)
if max_page_size_bytes is not None:
args.set_max_page_size_bytes(max_page_size_bytes)
if max_page_size_rows is not None:
args.set_max_page_size_rows(max_page_size_rows)

with nogil:
out_metadata_c = move(parquet_writer(args))
@@ -463,6 +469,12 @@ cdef class ParquetWriter:
row_group_size_rows: int, default 1000000
Maximum number of rows of each row group of the output.
By default, 1000000 (10^6 rows) will be used.
max_page_size_bytes: int, default 524288
Maximum uncompressed size of each page of the output.
By default, 524288 (512KB) will be used.
max_page_size_rows: int, default 20000
Maximum number of rows of each page of the output.
By default, 20000 will be used.

See Also
--------
@@ -478,11 +490,15 @@
cdef object index
cdef size_t row_group_size_bytes
cdef size_type row_group_size_rows
cdef size_t max_page_size_bytes
cdef size_type max_page_size_rows

def __cinit__(self, object filepath_or_buffer, object index=None,
object compression=None, str statistics="ROWGROUP",
int row_group_size_bytes=134217728,
int row_group_size_rows=1000000):
int row_group_size_rows=1000000,
int max_page_size_bytes=524288,
int max_page_size_rows=20000):
filepaths_or_buffers = (
list(filepath_or_buffer)
if is_list_like(filepath_or_buffer)
@@ -495,6 +511,8 @@
self.initialized = False
self.row_group_size_bytes = row_group_size_bytes
self.row_group_size_rows = row_group_size_rows
self.max_page_size_bytes = max_page_size_bytes
self.max_page_size_rows = max_page_size_rows

def write_table(self, table, object partitions_info=None):
""" Writes a single table to the file """
@@ -609,6 +627,8 @@ cdef class ParquetWriter:
.stats_level(self.stat_freq)
.row_group_size_bytes(self.row_group_size_bytes)
.row_group_size_rows(self.row_group_size_rows)
.max_page_size_bytes(self.max_page_size_bytes)
.max_page_size_rows(self.max_page_size_rows)
.build()
)
self.writer.reset(new cpp_parquet_chunked_writer(args))
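For context, a minimal sketch (not part of this diff) of driving the chunked writer with the new page-size knobs. It assumes ParquetWriter is importable from cudf.io.parquet, which is an assumption here; the test added below uses the class under the bare name ParquetWriter.

import cudf
from cudf.io.parquet import ParquetWriter  # import path assumed, see test module below

df = cudf.DataFrame({"a": range(10000)})

# Cap each data page at 64 KiB uncompressed and 5000 rows,
# whichever limit is reached first; context-manager use matches the test below.
with ParquetWriter(
    "chunked.parquet",
    max_page_size_bytes=64 * 1024,
    max_page_size_rows=5000,
) as writer:
    writer.write_table(df)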
10 changes: 10 additions & 0 deletions python/cudf/cudf/io/parquet.py
@@ -56,6 +58,8 @@ def _write_parquet(
int96_timestamps=False,
row_group_size_bytes=None,
row_group_size_rows=None,
max_page_size_bytes=None,
max_page_size_rows=None,
partitions_info=None,
**kwargs,
):
@@ -82,6 +84,8 @@
"int96_timestamps": int96_timestamps,
"row_group_size_bytes": row_group_size_bytes,
"row_group_size_rows": row_group_size_rows,
"max_page_size_bytes": max_page_size_bytes,
"max_page_size_rows": max_page_size_rows,
"partitions_info": partitions_info,
}
if all(ioutils.is_fsspec_open_file(buf) for buf in paths_or_bufs):
@@ -618,6 +622,8 @@ def to_parquet(
int96_timestamps=False,
row_group_size_bytes=None,
row_group_size_rows=None,
max_page_size_bytes=None,
max_page_size_rows=None,
*args,
**kwargs,
):
Expand Down Expand Up @@ -647,6 +653,8 @@ def to_parquet(
"int96_timestamps": int96_timestamps,
"row_group_size_bytes": row_group_size_bytes,
"row_group_size_rows": row_group_size_rows,
"max_page_size_bytes": max_page_size_bytes,
"max_page_size_rows": max_page_size_rows,
}
)
return write_to_dataset(
@@ -676,6 +684,8 @@
int96_timestamps=int96_timestamps,
row_group_size_bytes=row_group_size_bytes,
row_group_size_rows=row_group_size_rows,
max_page_size_bytes=max_page_size_bytes,
max_page_size_rows=max_page_size_rows,
**kwargs,
)

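A quick sketch (not part of this diff) of the one-shot path these changes enable. It assumes DataFrame.to_parquet forwards these keyword arguments to cudf.io.parquet.to_parquet as wired above; the output file name is hypothetical.

import cudf

df = cudf.DataFrame({"a": range(10000), "b": [1.0] * 10000})

# The page-size keywords flow through _write_parquet into the
# libcudf writer options set in the Cython layer above.
df.to_parquet(
    "one_shot.parquet",
    max_page_size_bytes=128 * 1024,  # at most 128 KiB uncompressed per page
    max_page_size_rows=2000,         # and at most 2000 rows per page
)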
31 changes: 31 additions & 0 deletions python/cudf/cudf/tests/test_parquet.py
@@ -1673,6 +1673,37 @@ def test_parquet_writer_row_group_size(tmpdir, row_group_size_kwargs):
assert_eq(cudf.read_parquet(fname), gdf)


@pytest.mark.parametrize(
"max_page_size_kwargs",
[
{"max_page_size_bytes": 4 * 1024},
{"max_page_size_rows": 5000},
],
)
def test_parquet_writer_max_page_size(tmpdir, max_page_size_kwargs):
# Check that max_page_size options are exposed in Python
# Since we don't have access to page metadata, instead check that a
# file written with more pages is slightly larger

size = 20000
gdf = cudf.DataFrame({"a": range(size), "b": [1] * size})

fname = tmpdir.join("gdf.parquet")
with ParquetWriter(fname, **max_page_size_kwargs) as writer:
writer.write_table(gdf)
s1 = os.path.getsize(fname)

assert_eq(cudf.read_parquet(fname), gdf)

fname = tmpdir.join("gdf0.parquet")
with ParquetWriter(fname) as writer:
writer.write_table(gdf)
s2 = os.path.getsize(fname)

assert_eq(cudf.read_parquet(fname), gdf)
assert s1 > s2


@pytest.mark.parametrize("filename", ["myfile.parquet", None])
@pytest.mark.parametrize("cols", [["b"], ["c", "b"]])
def test_parquet_partitioned(tmpdir_factory, cols, filename):
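The same size-based sanity check the test relies on can be reproduced through the public API. A sketch, assuming the documented defaults of 524288 bytes and 20000 rows per page; file names are hypothetical.

import os

import cudf

df = cudf.DataFrame({"a": range(20000), "b": [1] * 20000})

# Smaller page limits mean more pages, hence more page headers
# and a slightly larger file than the defaults produce.
df.to_parquet("small_pages.parquet", max_page_size_rows=5000)
df.to_parquet("default_pages.parquet")

assert os.path.getsize("small_pages.parquet") > os.path.getsize("default_pages.parquet")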
6 changes: 6 additions & 0 deletions python/cudf/cudf/utils/ioutils.py
@@ -251,6 +251,12 @@
row_group_size_rows: integer or None, default None
Maximum number of rows of each row group of the output.
If None, 1000000 will be used.
max_page_size_bytes: integer or None, default None
Maximum uncompressed size of each page of the output.
If None, 524288 (512KB) will be used.
max_page_size_rows: integer or None, default None
Maximum number of rows of each page of the output.
If None, 20000 will be used.
**kwargs
To request metadata binary blob when using with ``partition_cols``, Pass
``return_metadata=True`` instead of specifying ``metadata_file_path``