From 7ef83a6ac1614a2527c5c2490f2c3e8dfd4224f1 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Fri, 19 Apr 2024 00:08:12 +0000 Subject: [PATCH 01/14] Change the default dict policy from ALWAYS to ADAPTIVE --- cpp/include/cudf/io/parquet.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index f58bc48a37d..60ad86dcc54 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -555,7 +555,7 @@ class parquet_writer_options { // Maximum size of min or max values in column index int32_t _column_index_truncate_length = default_column_index_truncate_length; // When to use dictionary encoding for data - dictionary_policy _dictionary_policy = dictionary_policy::ALWAYS; + dictionary_policy _dictionary_policy = dictionary_policy::ADAPTIVE; // Maximum size of column chunk dictionary (in bytes) size_t _max_dictionary_size = default_max_dictionary_size; // Maximum number of rows in a page fragment From d897d94a36c2d222337115860d7f22f3ee2a0e2d Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Fri, 19 Apr 2024 22:12:42 +0000 Subject: [PATCH 02/14] updating default dict policy for python to ADAPTIVE as well --- python/cudf/cudf/_lib/parquet.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index 9ce9aad18f7..b85941d109f 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -478,7 +478,7 @@ def write_parquet( ) dict_policy = ( - cudf_io_types.dictionary_policy.ALWAYS + cudf_io_types.dictionary_policy.ADAPTIVE if use_dictionary else cudf_io_types.dictionary_policy.NEVER ) From 96b35f8baab9730372b11ab50efaf22b5033e19b Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Mon, 22 Apr 2024 21:44:38 +0000 Subject: [PATCH 03/14] minor comment update. --- cpp/include/cudf/io/parquet.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index 60ad86dcc54..bc7d3fec929 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -1067,7 +1067,7 @@ class parquet_writer_options_builder { * dictionary_policy::ALWAYS will allow the use of dictionary encoding even if it will result in * the disabling of compression for columns that would otherwise be compressed. * - * The default value is dictionary_policy::ALWAYS. + * The default value is dictionary_policy::ADAPTIVE. * * @param val policy for dictionary use * @return this for chaining From 144dedf3066aede58045c09452fa95a60a27fbfd Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Tue, 7 May 2024 21:08:10 +0000 Subject: [PATCH 04/14] updates for dict_policy to accept boolean as well as strings --- python/cudf/cudf/_lib/parquet.pyx | 13 +++++++------ python/cudf/cudf/io/parquet.py | 20 ++++++++++++++++++-- python/cudf/cudf/tests/test_parquet.py | 19 +++++++++++++++++++ 3 files changed, 44 insertions(+), 8 deletions(-) diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index b85941d109f..fbbaba69b62 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -402,7 +402,7 @@ def write_parquet( object partitions_info=None, object force_nullable_schema=False, header_version="1.0", - use_dictionary=True, + use_dictionary="ADAPTIVE", ): """ Cython function to call into libcudf API, see `write_parquet`. 
@@ -477,11 +477,12 @@ def write_parquet( "Valid values are '1.0' and '2.0'" ) - dict_policy = ( - cudf_io_types.dictionary_policy.ADAPTIVE - if use_dictionary - else cudf_io_types.dictionary_policy.NEVER - ) + # Set up the dictionary policy + dict_policy = cudf_io_types.dictionary_policy.ADAPTIVE + if use_dictionary == "ALWAYS": + dict_policy = cudf_io_types.dictionary_policy.ALWAYS + elif use_dictionary == "NEVER": + dict_policy = cudf_io_types.dictionary_policy.NEVER cdef cudf_io_types.compression_type comp_type = _get_comp_type(compression) cdef cudf_io_types.statistics_freq stat_freq = _get_stat_freq(statistics) diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index e7f1ad0751f..72a204fd0db 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -67,7 +67,7 @@ def _write_parquet( storage_options=None, force_nullable_schema=False, header_version="1.0", - use_dictionary=True, + use_dictionary="ADAPTIVE", ): if is_list_like(paths) and len(paths) > 1: if partitions_info is None: @@ -962,6 +962,22 @@ def to_parquet( if partition_offsets is not None else None ) + + # Set up the dictionary policy + dict_policy=None + if use_dictionary == True or use_dictionary == "ADAPTIVE": + dict_policy="ADAPTIVE" + elif use_dictionary == False or use_dictionary == "NEVER": + dict_policy="NEVER" + elif use_dictionary == "ALWAYS": + dict_policy="ALWAYS" + else: + dict_policy="ADAPTIVE" + warnings.warn( + "invalid value passed for `use_dictionary`." + "Using the default value `use_dictionary=True`" + ) + return _write_parquet( df, paths=path if is_list_like(path) else [path], @@ -978,7 +994,7 @@ def to_parquet( storage_options=storage_options, force_nullable_schema=force_nullable_schema, header_version=header_version, - use_dictionary=use_dictionary, + use_dictionary=dict_policy, ) else: diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 6fb1d3d8ba5..a5da536d8dc 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -3093,6 +3093,25 @@ def test_parquet_reader_detect_bad_dictionary(datadir): with pytest.raises(RuntimeError): cudf.read_parquet(fname) +@pytest.mark.parametrize("policy", [True, False, "ADAPTIVE", "ALWAYS", "NEVER"]) +def test_parquet_dictionary_policy(policy): + buf = BytesIO() + table = cudf.DataFrame( + { + "time64[ms]": cudf.Series([1234, 123, 4123], dtype="timedelta64[ms]"), + "int64": cudf.Series([1234, 123, 4123], dtype="int64"), + "list": list([[1,2],[1,2],[1,2]]), + "datetime[ms]": cudf.Series([1234, 123, 4123], dtype="datetime64[ms]"), + }) + + # Write parquet with the specified dict policy + table.to_parquet(buf, use_dictionary=policy) + + # Read the parquet back + got = cudf.read_parquet(buf) + + # Check the tables + assert_eq(table, got) @pytest.mark.parametrize("data", [{"a": [1, 2, 3, 4]}, {"b": [1, None, 2, 3]}]) @pytest.mark.parametrize("force_nullable_schema", [True, False]) From 1e6e982e2823c06e5bc1a02a04f6b21cc45dfe7f Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Tue, 7 May 2024 22:14:33 +0000 Subject: [PATCH 05/14] Revert "updates for dict_policy to accept boolean as well as strings" This reverts commit 144dedf3066aede58045c09452fa95a60a27fbfd. 
--- python/cudf/cudf/_lib/parquet.pyx | 13 ++++++------- python/cudf/cudf/io/parquet.py | 20 ++------------------ python/cudf/cudf/tests/test_parquet.py | 19 ------------------- 3 files changed, 8 insertions(+), 44 deletions(-) diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index fbbaba69b62..b85941d109f 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -402,7 +402,7 @@ def write_parquet( object partitions_info=None, object force_nullable_schema=False, header_version="1.0", - use_dictionary="ADAPTIVE", + use_dictionary=True, ): """ Cython function to call into libcudf API, see `write_parquet`. @@ -477,12 +477,11 @@ def write_parquet( "Valid values are '1.0' and '2.0'" ) - # Set up the dictionary policy - dict_policy = cudf_io_types.dictionary_policy.ADAPTIVE - if use_dictionary == "ALWAYS": - dict_policy = cudf_io_types.dictionary_policy.ALWAYS - elif use_dictionary == "NEVER": - dict_policy = cudf_io_types.dictionary_policy.NEVER + dict_policy = ( + cudf_io_types.dictionary_policy.ADAPTIVE + if use_dictionary + else cudf_io_types.dictionary_policy.NEVER + ) cdef cudf_io_types.compression_type comp_type = _get_comp_type(compression) cdef cudf_io_types.statistics_freq stat_freq = _get_stat_freq(statistics) diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index 72a204fd0db..e7f1ad0751f 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -67,7 +67,7 @@ def _write_parquet( storage_options=None, force_nullable_schema=False, header_version="1.0", - use_dictionary="ADAPTIVE", + use_dictionary=True, ): if is_list_like(paths) and len(paths) > 1: if partitions_info is None: @@ -962,22 +962,6 @@ def to_parquet( if partition_offsets is not None else None ) - - # Set up the dictionary policy - dict_policy=None - if use_dictionary == True or use_dictionary == "ADAPTIVE": - dict_policy="ADAPTIVE" - elif use_dictionary == False or use_dictionary == "NEVER": - dict_policy="NEVER" - elif use_dictionary == "ALWAYS": - dict_policy="ALWAYS" - else: - dict_policy="ADAPTIVE" - warnings.warn( - "invalid value passed for `use_dictionary`." 
-            "Using the default value `use_dictionary=True`"
-        )
-
     return _write_parquet(
         df,
         paths=path if is_list_like(path) else [path],
@@ -994,7 +978,7 @@ def to_parquet(
             storage_options=storage_options,
             force_nullable_schema=force_nullable_schema,
             header_version=header_version,
-            use_dictionary=dict_policy,
+            use_dictionary=use_dictionary,
         )

     else:
diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index a5da536d8dc..6fb1d3d8ba5 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -3093,25 +3093,6 @@ def test_parquet_reader_detect_bad_dictionary(datadir):
     with pytest.raises(RuntimeError):
         cudf.read_parquet(fname)

-@pytest.mark.parametrize("policy", [True, False, "ADAPTIVE", "ALWAYS", "NEVER"])
-def test_parquet_dictionary_policy(policy):
-    buf = BytesIO()
-    table = cudf.DataFrame(
-        {
-            "time64[ms]": cudf.Series([1234, 123, 4123], dtype="timedelta64[ms]"),
-            "int64": cudf.Series([1234, 123, 4123], dtype="int64"),
-            "list": list([[1,2],[1,2],[1,2]]),
-            "datetime[ms]": cudf.Series([1234, 123, 4123], dtype="datetime64[ms]"),
-        })
-
-    # Write parquet with the specified dict policy
-    table.to_parquet(buf, use_dictionary=policy)
-
-    # Read the parquet back
-    got = cudf.read_parquet(buf)
-
-    # Check the tables
-    assert_eq(table, got)

 @pytest.mark.parametrize("data", [{"a": [1, 2, 3, 4]}, {"b": [1, None, 2, 3]}])
 @pytest.mark.parametrize("force_nullable_schema", [True, False])

From ed9a324ec3ff9c34658152a0f6c2abb7870d10a4 Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb
Date: Tue, 7 May 2024 23:59:50 +0000
Subject: [PATCH 06/14] add max_dictionary_size argument to control the
 adaptive dictionary behavior

---
 python/cudf/cudf/_lib/cpp/io/parquet.pxd | 5 +++++
 python/cudf/cudf/_lib/parquet.pyx        | 3 +++
 python/cudf/cudf/io/parquet.py           | 4 ++++
 3 files changed, 12 insertions(+)

diff --git a/python/cudf/cudf/_lib/cpp/io/parquet.pxd b/python/cudf/cudf/_lib/cpp/io/parquet.pxd
index 8de16d06a9d..2e277f18f78 100644
--- a/python/cudf/cudf/_lib/cpp/io/parquet.pxd
+++ b/python/cudf/cudf/_lib/cpp/io/parquet.pxd
@@ -74,6 +74,7 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
         size_type get_row_group_size_rows() except +
         size_t get_max_page_size_bytes() except +
         size_type get_max_page_size_rows() except +
+        size_t get_max_dictionary_size() except+

         void set_partitions(
             vector[cudf_io_types.partition_info] partitions
@@ -103,6 +104,7 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
         void set_row_group_size_rows(size_type val) except +
         void set_max_page_size_bytes(size_t val) except +
         void set_max_page_size_rows(size_type val) except +
+        void set_max_dictionary_size(size_t val) except +
         void enable_write_v2_headers(bool val) except +
         void set_dictionary_policy(cudf_io_types.dictionary_policy policy)except +

@@ -155,6 +157,9 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
         parquet_writer_options_builder& max_page_size_rows(
             size_type val
         ) except +
+        parquet_writer_options_builder& max_dictionary_page_size(
+            size_t val
+        ) except +
         parquet_writer_options_builder& write_v2_headers(
             bool val
         ) except +
diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx
index b85941d109f..2e0f20f8324 100644
--- a/python/cudf/cudf/_lib/parquet.pyx
+++ b/python/cudf/cudf/_lib/parquet.pyx
@@ -399,6 +399,7 @@ def write_parquet(
     object row_group_size_rows=None,
     object max_page_size_bytes=None,
     object max_page_size_rows=None,
+    object max_dictionary_size=None,
     object partitions_info=None,
     object force_nullable_schema=False,
     header_version="1.0",
@@ -528,6 +529,8 @@ def write_parquet(
         args.set_max_page_size_bytes(max_page_size_bytes)
     if max_page_size_rows is not None:
         args.set_max_page_size_rows(max_page_size_rows)
+    if max_dictionary_size is not None:
+        args.set_max_dictionary_size(max_dictionary_size)

     with nogil:
         out_metadata_c = move(parquet_writer(args))
diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py
index e7f1ad0751f..7a34cc906e2 100644
--- a/python/cudf/cudf/io/parquet.py
+++ b/python/cudf/cudf/io/parquet.py
@@ -63,6 +63,7 @@ def _write_parquet(
     row_group_size_rows=None,
     max_page_size_bytes=None,
     max_page_size_rows=None,
+    max_dictionary_size=None,
     partitions_info=None,
     storage_options=None,
     force_nullable_schema=False,
@@ -96,6 +97,7 @@ def _write_parquet(
         "row_group_size_rows": row_group_size_rows,
         "max_page_size_bytes": max_page_size_bytes,
         "max_page_size_rows": max_page_size_rows,
+        "max_dictionary_size": max_dictionary_size,
         "partitions_info": partitions_info,
         "force_nullable_schema": force_nullable_schema,
         "header_version": header_version,
@@ -898,6 +900,7 @@ def to_parquet(
     row_group_size_rows=None,
     max_page_size_bytes=None,
     max_page_size_rows=None,
+    max_dictionary_size=None,
     storage_options=None,
     return_metadata=False,
     force_nullable_schema=False,
@@ -974,6 +977,7 @@ def to_parquet(
             row_group_size_rows=row_group_size_rows,
             max_page_size_bytes=max_page_size_bytes,
             max_page_size_rows=max_page_size_rows,
+            max_dictionary_size=max_dictionary_size,
             partitions_info=partition_info,
             storage_options=storage_options,
             force_nullable_schema=force_nullable_schema,

From 992ec9fd501f3909c3bc9e0ed38d78380ebec031 Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb
Date: Wed, 8 May 2024 00:25:17 +0000
Subject: [PATCH 07/14] add docstring for max_dictionary_size

---
 python/cudf/cudf/utils/ioutils.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py
index 6bd7558d322..49afe76a952 100644
--- a/python/cudf/cudf/utils/ioutils.py
+++ b/python/cudf/cudf/utils/ioutils.py
@@ -277,6 +277,10 @@
 max_page_size_rows: integer or None, default None
     Maximum number of rows of each page of the output. If None, 20000 will
     be used.
+max_dictionary_size: integer or None. default None
+    Maximum size of dictionary page for each output column chunks. Dictionary 
+    encoding for column chunks that exceed this limit will be disabled.
+    If None, 1048576 (1MB) will be used.
 storage_options : dict, optional, default None
     Extra options that make sense for a particular storage connection,
     e.g. host, port, username, password, etc. For HTTP(S) URLs the key-value
@@ -291,8 +295,8 @@
     ``return_metadata=True`` instead of specifying ``metadata_file_path``
 use_dictionary : bool, default True
     When ``False``, prevents the use of dictionary encoding for Parquet page
-    data. When ``True``, dictionary encoding is preferred when not disabled due
-    to dictionary size constraints.
+    data. When ``True``, dictionary encoding is preferred subjected to
+    ``max_dictionary_size`` constraints.
 header_version : {{'1.0', '2.0'}}, default "1.0"
     Controls whether to use version 1.0 or version 2.0 page headers when
     encoding. Version 1.0 is more portable, but version 2.0 enables the

From bf296d45006c7997f04a035f39d96e79ec7e4935 Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb
Date: Wed, 8 May 2024 03:00:41 +0000
Subject: [PATCH 08/14] minor trim whitespaces hook changes

---
 python/cudf/cudf/utils/ioutils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py
index 49afe76a952..2868c24bb1a 100644
--- a/python/cudf/cudf/utils/ioutils.py
+++ b/python/cudf/cudf/utils/ioutils.py
@@ -278,7 +278,7 @@
     Maximum number of rows of each page of the output. If None, 20000 will
     be used.
 max_dictionary_size: integer or None. default None
-    Maximum size of dictionary page for each output column chunks. Dictionary 
+    Maximum size of dictionary page for each output column chunks. Dictionary
     encoding for column chunks that exceed this limit will be disabled.
     If None, 1048576 (1MB) will be used.
 storage_options : dict, optional, default None

From 187729f663486af514b68b86344d97ef782e9545 Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Wed, 8 May 2024 10:32:52 -0700
Subject: [PATCH 09/14] Apply suggestions from code review

Co-authored-by: Ed Seidl
---
 python/cudf/cudf/utils/ioutils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py
index 2868c24bb1a..2ccaa411f93 100644
--- a/python/cudf/cudf/utils/ioutils.py
+++ b/python/cudf/cudf/utils/ioutils.py
@@ -278,7 +278,7 @@
     Maximum number of rows of each page of the output. If None, 20000 will
     be used.
 max_dictionary_size: integer or None. default None
-    Maximum size of dictionary page for each output column chunks. Dictionary
+    Maximum size of the dictionary page for each output column chunk. Dictionary
     encoding for column chunks that exceed this limit will be disabled.
     If None, 1048576 (1MB) will be used.
 storage_options : dict, optional, default None
@@ -295,7 +295,7 @@
     ``return_metadata=True`` instead of specifying ``metadata_file_path``
 use_dictionary : bool, default True
     When ``False``, prevents the use of dictionary encoding for Parquet page
-    data. When ``True``, dictionary encoding is preferred subjected to
+    data. When ``True``, dictionary encoding is preferred subject to
     ``max_dictionary_size`` constraints.
 header_version : {{'1.0', '2.0'}}, default "1.0"
     Controls whether to use version 1.0 or version 2.0 page headers when

From 3d569300eed8636dc1ff0b47817182a0728d3dd1 Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb
Date: Wed, 8 May 2024 17:34:34 +0000
Subject: [PATCH 10/14] dictionary updates for the chunked writer

---
 cpp/include/cudf/io/parquet.hpp          |  4 ++--
 python/cudf/cudf/_lib/cpp/io/parquet.pxd | 13 +++++++++----
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp
index 25a306f7275..8bfcacdb47f 100644
--- a/cpp/include/cudf/io/parquet.hpp
+++ b/cpp/include/cudf/io/parquet.hpp
@@ -1258,7 +1258,7 @@ class chunked_parquet_writer_options {
   // Maximum size of min or max values in column index
   int32_t _column_index_truncate_length = default_column_index_truncate_length;
   // When to use dictionary encoding for data
-  dictionary_policy _dictionary_policy = dictionary_policy::ALWAYS;
+  dictionary_policy _dictionary_policy = dictionary_policy::ADAPTIVE;
   // Maximum size of column chunk dictionary (in bytes)
   size_t _max_dictionary_size = default_max_dictionary_size;
   // Maximum number of rows in a page fragment
@@ -1751,7 +1751,7 @@ class chunked_parquet_writer_options_builder {
    * dictionary_policy::ALWAYS will allow the use of dictionary encoding even if it will result in
    * the disabling of compression for columns that would otherwise be compressed.
    *
-   * The default value is dictionary_policy::ALWAYS.
+   * The default value is dictionary_policy::ADAPTIVE.
    *
    * @param val policy for dictionary use
    * @return this for chaining
diff --git a/python/cudf/cudf/_lib/cpp/io/parquet.pxd b/python/cudf/cudf/_lib/cpp/io/parquet.pxd
index 2e277f18f78..1680eb43700 100644
--- a/python/cudf/cudf/_lib/cpp/io/parquet.pxd
+++ b/python/cudf/cudf/_lib/cpp/io/parquet.pxd
@@ -74,7 +74,7 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
         size_type get_row_group_size_rows() except +
         size_t get_max_page_size_bytes() except +
         size_type get_max_page_size_rows() except +
-        size_t get_max_dictionary_size() except+
+        size_t get_max_dictionary_size() except +

         void set_partitions(
             vector[cudf_io_types.partition_info] partitions
@@ -106,7 +106,7 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
         void set_max_page_size_rows(size_type val) except +
         void set_max_dictionary_size(size_t val) except +
         void enable_write_v2_headers(bool val) except +
-        void set_dictionary_policy(cudf_io_types.dictionary_policy policy)except +
+        void set_dictionary_policy(cudf_io_types.dictionary_policy policy) except +

         @staticmethod
         parquet_writer_options_builder builder(
@@ -157,7 +157,7 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
         parquet_writer_options_builder& max_page_size_rows(
             size_type val
         ) except +
-        parquet_writer_options_builder& max_dictionary_page_size(
+        parquet_writer_options_builder& max_dictionary_size(
            size_t val
         ) except +
         parquet_writer_options_builder& write_v2_headers(
@@ -184,6 +184,7 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
         size_type get_row_group_size_rows() except +
         size_t get_max_page_size_bytes() except +
         size_type get_max_page_size_rows() except +
+        size_t get_max_dictionary_size() except +

         void set_metadata(
             cudf_io_types.table_input_metadata m
@@ -207,8 +208,9 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
         void set_row_group_size_rows(size_type val) except +
         void set_max_page_size_bytes(size_t val) except +
         void set_max_page_size_rows(size_type val) except +
+        void set_max_dictionary_size(size_t val) except +
         void enable_write_v2_headers(bool val) except +
-        void set_dictionary_policy(cudf_io_types.dictionary_policy policy)except +
+        void set_dictionary_policy(cudf_io_types.dictionary_policy policy) except +

         @staticmethod
         chunked_parquet_writer_options_builder builder(
@@ -250,6 +252,9 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
         chunked_parquet_writer_options_builder& max_page_size_rows(
             size_type val
         ) except +
+        chunked_parquet_writer_options_builder& max_dictionary_size(
+            size_t val
+        ) except +
         parquet_writer_options_builder& write_v2_headers(
             bool val
         ) except +

From 7162c0eecd7222e7ab87e63c5182bac07b45188e Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb
Date: Wed, 8 May 2024 19:39:26 +0000
Subject: [PATCH 11/14] add dictionary options to ParquetWriter class as well

---
 python/cudf/cudf/_lib/parquet.pyx | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx
index 2e0f20f8324..48e2323bd06 100644
--- a/python/cudf/cudf/_lib/parquet.pyx
+++ b/python/cudf/cudf/_lib/parquet.pyx
@@ -574,7 +574,14 @@ cdef class ParquetWriter:
     max_page_size_rows: int, default 20000
         Maximum number of rows of each page of the output. By default,
         20000 will be used.
-
+    max_dictionary_size: int, default 1048576
+        Maximum size of the dictionary page for each output column chunk. Dictionary
+        encoding for column chunks that exceed this limit will be disabled.
+        By default, 1048576 (1MB) will be used.
+    use_dictionary : bool, default True
+        If ``True``, enable dictionary encoding for Parquet page data
+        subject to ``max_dictionary_size`` constraints.
+        If ``False``, disable dictionary encoding for Parquet page data.
     See Also
     --------
     cudf.io.parquet.write_parquet
@@ -591,13 +598,17 @@ cdef class ParquetWriter:
     cdef size_type row_group_size_rows
     cdef size_t max_page_size_bytes
     cdef size_type max_page_size_rows
+    cdef size_t max_dictionary_size
+    cdef cudf_io_types.dictionary_policy dict_policy

     def __cinit__(self, object filepath_or_buffer, object index=None,
                   object compression="snappy", str statistics="ROWGROUP",
                   int row_group_size_bytes=_ROW_GROUP_SIZE_BYTES_DEFAULT,
                   int row_group_size_rows=1000000,
                   int max_page_size_bytes=524288,
-                  int max_page_size_rows=20000):
+                  int max_page_size_rows=20000,
+                  int max_dictionary_size=1048576,
+                  bool use_dictionary=True):
         filepaths_or_buffers = (
             list(filepath_or_buffer)
             if is_list_like(filepath_or_buffer)
@@ -612,6 +623,11 @@ cdef class ParquetWriter:
         self.row_group_size_rows = row_group_size_rows
         self.max_page_size_bytes = max_page_size_bytes
         self.max_page_size_rows = max_page_size_rows
+        self.max_dictionary_size = (
+            cudf_io_types.dictionary_policy.ADAPTIVE
+            if use_dictionary
+            else cudf_io_types.dictionary_policy.NEVER
+        )

     def write_table(self, table, object partitions_info=None):
         """ Writes a single table to the file """
@@ -729,8 +745,10 @@ cdef class ParquetWriter:
                 .row_group_size_rows(self.row_group_size_rows)
                 .max_page_size_bytes(self.max_page_size_bytes)
                 .max_page_size_rows(self.max_page_size_rows)
+                .max_dictionary_size(self.max_dictionary_size)
                 .build()
             )
+        args.set_dictionary_policy(self.dict_policy)
         self.writer.reset(new cpp_parquet_chunked_writer(args))
         self.initialized = True

From 233b3f8239c31504d66d987f68dfdca852acde01 Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb
Date: Wed, 8 May 2024 21:52:09 +0000
Subject: [PATCH 12/14] minor error fix

---
 python/cudf/cudf/_lib/parquet.pyx | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx
index 48e2323bd06..dcfa087a1fa 100644
--- a/python/cudf/cudf/_lib/parquet.pyx
+++ b/python/cudf/cudf/_lib/parquet.pyx
@@ -623,7 +623,8 @@ cdef class ParquetWriter:
         self.row_group_size_rows = row_group_size_rows
         self.max_page_size_bytes = max_page_size_bytes
         self.max_page_size_rows = max_page_size_rows
-        self.max_dictionary_size = (
+        self.max_dictionary_size = max_dictionary_size
+        self.dict_policy = (
             cudf_io_types.dictionary_policy.ADAPTIVE
             if use_dictionary
             else cudf_io_types.dictionary_policy.NEVER
         )

From 729a609684909b5e23bdaa3076da9605dae42f3b Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb
Date: Thu, 9 May 2024 02:00:09 +0000
Subject: [PATCH 13/14] add pytests for dictionary settings in ParquetWriter

---
 python/cudf/cudf/tests/test_parquet.py | 37 ++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index 6fb1d3d8ba5..95bc8e4fcae 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -1882,6 +1882,43 @@ def test_parquet_writer_max_page_size(tmpdir, max_page_size_kwargs):
     assert s1 > s2


+@pytest.mark.parametrize("use_dict", [False, True])
+@pytest.mark.parametrize("max_dict_size", [0, 1048576])
+def test_parquet_writer_dictionary_setting(use_dict, max_dict_size):
+    # Check the validity of the dictionary encoding setting and the
+    # resulting behavior of ParquetWriter in cudf.
+    # Write a table of repetitive data with varying dictionary settings
+    # and make sure the written columns are dictionary-encoded accordingly.
+
+    # Table with repetitive data
+    table = cudf.DataFrame(
+        {
+            "int32": cudf.Series([1024] * 1024, dtype="int64"),
+        }
+    )
+
+    # Write to Parquet using ParquetWriter
+    buffer = BytesIO()
+    writer = ParquetWriter(
+        buffer,
+        use_dictionary=use_dict,
+        max_dictionary_size=max_dict_size,
+    )
+    writer.write_table(table)
+    writer.close()
+
+    # Read encodings from parquet file
+    got = pq.ParquetFile(buffer)
+    encodings = got.metadata.row_group(0).column(0).encodings
+
+    # Check for `PLAIN_DICTIONARY` encoding if dictionary encoding is enabled
+    # and the dictionary page limit > 0
+    if use_dict and max_dict_size > 0:
+        assert "PLAIN_DICTIONARY" in encodings
+    else:
+        assert "PLAIN_DICTIONARY" not in encodings
+
+
 @pytest.mark.parametrize("filename", ["myfile.parquet", None])
 @pytest.mark.parametrize("cols", [["b"], ["c", "b"]])
 def test_parquet_partitioned(tmpdir_factory, cols, filename):

From 3900120d4306b17971b783e801fe55e1a72a3f67 Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic
Date: Fri, 10 May 2024 15:22:45 -0700
Subject: [PATCH 14/14] Update comment for consistency

Co-authored-by: Bradley Dice
---
 python/cudf/cudf/utils/ioutils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py
index 2ccaa411f93..e52b14e4745 100644
--- a/python/cudf/cudf/utils/ioutils.py
+++ b/python/cudf/cudf/utils/ioutils.py
@@ -277,7 +277,7 @@
 max_page_size_rows: integer or None, default None
     Maximum number of rows of each page of the output. If None, 20000 will
     be used.
-max_dictionary_size: integer or None. default None
+max_dictionary_size: integer or None, default None
     Maximum size of the dictionary page for each output column chunk. Dictionary
     encoding for column chunks that exceed this limit will be disabled.
     If None, 1048576 (1MB) will be used.
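
The following usage sketch is editorial and not part of the patch series. It shows how the pieces above fit together once all 14 patches are applied: the ADAPTIVE default from patches 01, 02, and 10, the max_dictionary_size plumbing from patch 06, and the ParquetWriter options from patches 11 and 12. The DataFrame contents, buffer names, and the asserted encoding are illustrative assumptions (the encoding check mirrors the new pytest in patch 13), and a cudf build that includes these patches is assumed.

    # Sketch only: assumes a cudf build with this patch series applied.
    from io import BytesIO

    import pyarrow.parquet as pq

    import cudf
    from cudf.io.parquet import ParquetWriter

    # A highly repetitive column is a natural candidate for dictionary encoding.
    df = cudf.DataFrame({"key": ["a", "b", "c", "d"] * 256})

    # With the new ADAPTIVE default, dictionary encoding is used only while a
    # column chunk's dictionary stays within max_dictionary_size (1MB if unset).
    buf = BytesIO()
    df.to_parquet(buf, use_dictionary=True, max_dictionary_size=1048576)

    # Inspect the encodings the writer chose, as the new pytest does.
    encodings = pq.ParquetFile(buf).metadata.row_group(0).column(0).encodings
    assert "PLAIN_DICTIONARY" in encodings

    # The chunked ParquetWriter exposes the same knobs after patches 11 and 12.
    buf2 = BytesIO()
    writer = ParquetWriter(buf2, use_dictionary=True, max_dictionary_size=1048576)
    writer.write_table(df)
    writer.close()

Note that passing max_dictionary_size=0 with use_dictionary=True effectively disables dictionary encoding under the ADAPTIVE policy, since no chunk's dictionary fits a zero-byte budget; that is exactly the boundary test_parquet_writer_dictionary_setting exercises with max_dict_size=0.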