From 184bf5e2643426738e24bc136cdd62f475f1a5fd Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 24 Jan 2024 15:13:25 -0800 Subject: [PATCH] Add row index and stripe size options to Python ORC chunked writer (#14785) Adds the APIs that control the stripe/row group size when using the chunked writer. These options are already present in to_orc (the non-chunked version of the same API). Adding these options to the chunked writer facilitates smaller unit tests. Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/14785 --- python/cudf/cudf/_lib/orc.pyx | 25 ++++++++++++++++++++----- python/cudf/cudf/tests/test_orc.py | 22 ++++++++++++++++++++++ 2 files changed, 42 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx index c64296eb7da..2cbdf76030b 100644 --- a/python/cudf/cudf/_lib/orc.pyx +++ b/python/cudf/cudf/_lib/orc.pyx @@ -375,13 +375,19 @@ cdef class ORCWriter: cdef object index cdef table_input_metadata tbl_meta cdef object cols_as_map_type + cdef object stripe_size_bytes + cdef object stripe_size_rows + cdef object row_index_stride def __cinit__(self, object path, object index=None, object compression="snappy", object statistics="ROWGROUP", - object cols_as_map_type=None): + object cols_as_map_type=None, + object stripe_size_bytes=None, + object stripe_size_rows=None, + object row_index_stride=None): self.sink = make_sink_info(path, self._data_sink) self.stat_freq = _get_orc_stat_freq(statistics) @@ -389,6 +395,9 @@ cdef class ORCWriter: self.index = index self.cols_as_map_type = cols_as_map_type \ if cols_as_map_type is None else set(cols_as_map_type) + self.stripe_size_bytes = stripe_size_bytes + self.stripe_size_rows = stripe_size_rows + self.row_index_stride = row_index_stride self.initialized = False def write_table(self, table): @@ -456,9 +465,7 @@ cdef class ORCWriter: pandas_metadata = 
generate_pandas_metadata(table, self.index) user_data[str.encode("pandas")] = str.encode(pandas_metadata) - cdef chunked_orc_writer_options args - with nogil: - args = move( + cdef chunked_orc_writer_options c_opts = move( chunked_orc_writer_options.builder(self.sink) .metadata(self.tbl_meta) .key_value_metadata(move(user_data)) @@ -466,7 +473,15 @@ cdef class ORCWriter: .enable_statistics(self.stat_freq) .build() ) - self.writer.reset(new orc_chunked_writer(args)) + if self.stripe_size_bytes is not None: + c_opts.set_stripe_size_bytes(self.stripe_size_bytes) + if self.stripe_size_rows is not None: + c_opts.set_stripe_size_rows(self.stripe_size_rows) + if self.row_index_stride is not None: + c_opts.set_row_index_stride(self.row_index_stride) + + with nogil: + self.writer.reset(new orc_chunked_writer(c_opts)) self.initialized = True diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index 4630b6eef0a..6b7f86098a0 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -1911,3 +1911,25 @@ def test_orc_reader_empty_deeply_nested_level(datadir): got = cudf.read_orc(path) assert_eq(expect, got) + + +def test_orc_chunked_writer_stripe_size(datadir): + from pyarrow import orc + + df = cudf.DataFrame({"col": gen_rand_series("int", 100000)}) + + buffer = BytesIO() + writer = ORCWriter(buffer, stripe_size_bytes=64 * 1024) + writer.write_table(df) + writer.close() + + orc_file = orc.ORCFile(buffer) + assert_eq(orc_file.nstripes, 10) + + buffer = BytesIO() + writer = ORCWriter(buffer, stripe_size_rows=20000) + writer.write_table(df) + writer.close() + + orc_file = orc.ORCFile(buffer) + assert_eq(orc_file.nstripes, 5)