Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Change the default dictionary policy in Parquet writer from ALWAYS to ADAPTIVE #15570

Merged
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions cpp/include/cudf/io/parquet.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -564,7 +564,7 @@ class parquet_writer_options {
// Maximum size of min or max values in column index
int32_t _column_index_truncate_length = default_column_index_truncate_length;
// When to use dictionary encoding for data
dictionary_policy _dictionary_policy = dictionary_policy::ALWAYS;
mhaseeb123 marked this conversation as resolved.
Show resolved Hide resolved
dictionary_policy _dictionary_policy = dictionary_policy::ADAPTIVE;
mhaseeb123 marked this conversation as resolved.
Show resolved Hide resolved
// Maximum size of column chunk dictionary (in bytes)
size_t _max_dictionary_size = default_max_dictionary_size;
// Maximum number of rows in a page fragment
Expand Down Expand Up @@ -1095,7 +1095,7 @@ class parquet_writer_options_builder {
* dictionary_policy::ALWAYS will allow the use of dictionary encoding even if it will result in
* the disabling of compression for columns that would otherwise be compressed.
*
* The default value is dictionary_policy::ALWAYS.
* The default value is dictionary_policy::ADAPTIVE.
*
* @param val policy for dictionary use
* @return this for chaining
Expand Down
5 changes: 5 additions & 0 deletions python/cudf/cudf/_lib/cpp/io/parquet.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
size_type get_row_group_size_rows() except +
size_t get_max_page_size_bytes() except +
size_type get_max_page_size_rows() except +
size_t get_max_dictionary_size() except+

void set_partitions(
vector[cudf_io_types.partition_info] partitions
Expand Down Expand Up @@ -103,6 +104,7 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
void set_row_group_size_rows(size_type val) except +
void set_max_page_size_bytes(size_t val) except +
void set_max_page_size_rows(size_type val) except +
void set_max_dictionary_size(size_t val) except +
void enable_write_v2_headers(bool val) except +
void set_dictionary_policy(cudf_io_types.dictionary_policy policy)except +

Expand Down Expand Up @@ -155,6 +157,9 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
parquet_writer_options_builder& max_page_size_rows(
size_type val
) except +
parquet_writer_options_builder& max_dictionary_page_size(
size_t val
) except +
parquet_writer_options_builder& write_v2_headers(
bool val
) except +
Expand Down
5 changes: 4 additions & 1 deletion python/cudf/cudf/_lib/parquet.pyx
vuule marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -399,6 +399,7 @@ def write_parquet(
object row_group_size_rows=None,
object max_page_size_bytes=None,
object max_page_size_rows=None,
object max_dictionary_size=None,
object partitions_info=None,
object force_nullable_schema=False,
header_version="1.0",
Expand Down Expand Up @@ -478,7 +479,7 @@ def write_parquet(
)

dict_policy = (
cudf_io_types.dictionary_policy.ALWAYS
cudf_io_types.dictionary_policy.ADAPTIVE
if use_dictionary
vuule marked this conversation as resolved.
Show resolved Hide resolved
else cudf_io_types.dictionary_policy.NEVER
)
Expand Down Expand Up @@ -528,6 +529,8 @@ def write_parquet(
args.set_max_page_size_bytes(max_page_size_bytes)
if max_page_size_rows is not None:
args.set_max_page_size_rows(max_page_size_rows)
if max_dictionary_size is not None:
args.set_max_dictionary_size(max_dictionary_size)

with nogil:
out_metadata_c = move(parquet_writer(args))
Expand Down
4 changes: 4 additions & 0 deletions python/cudf/cudf/io/parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ def _write_parquet(
row_group_size_rows=None,
max_page_size_bytes=None,
max_page_size_rows=None,
max_dictionary_size=None,
partitions_info=None,
storage_options=None,
force_nullable_schema=False,
Expand Down Expand Up @@ -96,6 +97,7 @@ def _write_parquet(
"row_group_size_rows": row_group_size_rows,
"max_page_size_bytes": max_page_size_bytes,
"max_page_size_rows": max_page_size_rows,
"max_dictionary_size": max_dictionary_size,
"partitions_info": partitions_info,
"force_nullable_schema": force_nullable_schema,
"header_version": header_version,
Expand Down Expand Up @@ -898,6 +900,7 @@ def to_parquet(
row_group_size_rows=None,
max_page_size_bytes=None,
max_page_size_rows=None,
max_dictionary_size=None,
storage_options=None,
return_metadata=False,
force_nullable_schema=False,
Expand Down Expand Up @@ -974,6 +977,7 @@ def to_parquet(
row_group_size_rows=row_group_size_rows,
max_page_size_bytes=max_page_size_bytes,
max_page_size_rows=max_page_size_rows,
max_dictionary_size=max_dictionary_size,
partitions_info=partition_info,
storage_options=storage_options,
force_nullable_schema=force_nullable_schema,
Expand Down
8 changes: 6 additions & 2 deletions python/cudf/cudf/utils/ioutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,10 @@
max_page_size_rows: integer or None, default None
Maximum number of rows of each page of the output.
If None, 20000 will be used.
max_dictionary_size: integer or None, default None
vuule marked this conversation as resolved.
Show resolved Hide resolved
Maximum size of the dictionary page for each output column chunk. Dictionary
mhaseeb123 marked this conversation as resolved.
Show resolved Hide resolved
encoding for column chunks that exceed this limit will be disabled.
If None, 1048576 (1MB) will be used.
storage_options : dict, optional, default None
Extra options that make sense for a particular storage connection,
e.g. host, port, username, password, etc. For HTTP(S) URLs the key-value
Expand All @@ -291,8 +295,8 @@
``return_metadata=True`` instead of specifying ``metadata_file_path``
use_dictionary : bool, default True
When ``False``, prevents the use of dictionary encoding for Parquet page
data. When ``True``, dictionary encoding is preferred when not disabled due
to dictionary size constraints.
data. When ``True``, dictionary encoding is preferred, subject to
mhaseeb123 marked this conversation as resolved.
Show resolved Hide resolved
``max_dictionary_size`` constraints.
header_version : {{'1.0', '2.0'}}, default "1.0"
Controls whether to use version 1.0 or version 2.0 page headers when
encoding. Version 1.0 is more portable, but version 2.0 enables the
Expand Down
Loading