From b1f60d82cbe66ab67ba57b29898cc9e14573c07a Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 17 Jan 2024 14:59:23 -0800 Subject: [PATCH 1/5] add stripe size support to chunked orc writer --- python/cudf/cudf/_lib/orc.pyx | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx index 0ae039b14d2..663cbeadf7f 100644 --- a/python/cudf/cudf/_lib/orc.pyx +++ b/python/cudf/cudf/_lib/orc.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. import cudf from cudf.core.buffer import acquire_spill_lock @@ -376,13 +376,17 @@ cdef class ORCWriter: cdef object index cdef table_input_metadata tbl_meta cdef object cols_as_map_type + cdef object stripe_size_bytes + cdef object stripe_size_rows def __cinit__(self, object path, object index=None, object compression="snappy", object statistics="ROWGROUP", - object cols_as_map_type=None): + object cols_as_map_type=None, + object stripe_size_bytes=None, + object stripe_size_rows=None): self.sink = make_sink_info(path, self._data_sink) self.stat_freq = _get_orc_stat_freq(statistics) @@ -390,6 +394,8 @@ cdef class ORCWriter: self.index = index self.cols_as_map_type = cols_as_map_type \ if cols_as_map_type is None else set(cols_as_map_type) + self.stripe_size_bytes = stripe_size_bytes + self.stripe_size_rows = stripe_size_rows self.initialized = False def write_table(self, table): @@ -457,9 +463,7 @@ cdef class ORCWriter: pandas_metadata = generate_pandas_metadata(table, self.index) user_data[str.encode("pandas")] = str.encode(pandas_metadata) - cdef chunked_orc_writer_options args - with nogil: - args = move( + cdef chunked_orc_writer_options c_opts = move( chunked_orc_writer_options.builder(self.sink) .metadata(self.tbl_meta) .key_value_metadata(move(user_data)) @@ -467,7 +471,13 @@ cdef class ORCWriter: .enable_statistics(self.stat_freq) .build() ) - self.writer.reset(new orc_chunked_writer(args)) + if self.stripe_size_bytes is not None: + c_opts.set_stripe_size_bytes(self.stripe_size_bytes) + if self.stripe_size_rows is not None: + c_opts.set_stripe_size_rows(self.stripe_size_rows) + + with nogil: + self.writer.reset(new orc_chunked_writer(c_opts)) self.initialized = True From 98c09a44f07f42552c3095fdebd8802df9a9d425 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Thu, 18 Jan 2024 12:40:11 -0800 Subject: [PATCH 2/5] tests --- python/cudf/cudf/tests/test_orc.py | 121 +++++++++++------------------ 1 file changed, 46 insertions(+), 75 deletions(-) diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index 4630b6eef0a..1d8a95faa65 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -93,9 +93,7 @@ def test_orc_reader_basic(datadir, inputfile, columns, use_index, engine): path = datadir / inputfile expect = pd.read_orc(path, columns=columns) - got = cudf.read_orc( - path, engine=engine, columns=columns, use_index=use_index - ) + got = cudf.read_orc(path, engine=engine, columns=columns, use_index=use_index) assert_frame_equal(cudf.from_pandas(expect), got, check_categorical=False) @@ -116,9 +114,7 @@ def test_orc_reader_local_filepath(): cudf.read_orc(path) -@pytest.mark.parametrize( - "src", ["filepath", "pathobj", "bytes_io", "bytes", "url"] -) +@pytest.mark.parametrize("src", ["filepath", "pathobj", "bytes_io", "bytes", "url"]) def test_orc_reader_filepath_or_buffer(path_or_buf, src): cols = ["int1", "long1", "float1", "double1"] @@ -253,17 +249,12 @@ def test_orc_read_stripes(datadir, engine): num_rows, stripes, col_names = cudf.io.read_orc_metadata(path) # Read stripes one at a time - gdf = [ - cudf.read_orc(path, engine=engine, stripes=[[i]]) - for i in range(stripes) - ] + gdf = [cudf.read_orc(path, engine=engine, stripes=[[i]]) for i in range(stripes)] gdf = cudf.concat(gdf).reset_index(drop=True) assert_eq(pdf, gdf, check_categorical=False, check_index_type=True) # Read stripes all at once - gdf = cudf.read_orc( - path, engine=engine, stripes=[[int(x) for x in range(stripes)]] - ) + gdf = cudf.read_orc(path, engine=engine, stripes=[[int(x) for x in range(stripes)]]) assert_eq(pdf, gdf, check_categorical=False) # Read only some stripes @@ -322,9 +313,7 @@ def test_orc_read_skiprows(): # repro for other sizes of data skiprows = 10 - expected = ( - pd.read_orc(buff)[skiprows:].reset_index(drop=True).astype("bool") - ) + expected = pd.read_orc(buff)[skiprows:].reset_index(drop=True).astype("bool") got = cudf.read_orc(buff, skiprows=skiprows) assert_eq(expected, got) @@ -438,9 +427,7 @@ def test_chunked_orc_writer_statistics_frequency(datadir, tmpdir, stats_freq): ("TestOrcFile.demo-12-zlib.orc", ["_col1", "_col3", "_col5"]), ], ) -def test_chunked_orc_writer( - datadir, tmpdir, reference_file, columns, compression -): +def test_chunked_orc_writer(datadir, tmpdir, reference_file, columns, compression): pdf_fname = datadir / reference_file gdf_fname = tmpdir.join("chunked_gdf.orc") @@ -669,9 +656,7 @@ def test_orc_write_statistics(tmpdir, datadir, nrows, stats_freq): assert normalized_equals(actual_max, stats_max) if "number_of_values" in stripes_stats[stripe_idx][col]: - stats_num_vals = stripes_stats[stripe_idx][col][ - "number_of_values" - ] + stats_num_vals = stripes_stats[stripe_idx][col]["number_of_values"] if stats_num_vals is not None: actual_num_vals = stripe_df[col].count() assert stats_num_vals == actual_num_vals @@ -777,9 +762,7 @@ def test_orc_chunked_write_statistics(tmpdir, datadir, nrows, stats_freq): assert normalized_equals(actual_max, stats_max) if "number_of_values" in stripes_stats[stripe_idx][col]: - stats_num_vals = stripes_stats[stripe_idx][col][ - "number_of_values" - ] + stats_num_vals = stripes_stats[stripe_idx][col]["number_of_values"] if stats_num_vals is not None: actual_num_vals = stripe_df[col].count() assert stats_num_vals == actual_num_vals @@ -827,12 +810,8 @@ def test_orc_write_bool_statistics(tmpdir, datadir, nrows): assert normalized_equals(actual_true_count, stats_true_count) if "number_of_values" in stripes_stats[stripe_idx][col]: - actual_valid_count = ( - len(stripe_df[col]) - stripe_df[col].null_count - ) - stats_valid_count = stripes_stats[stripe_idx][col][ - "number_of_values" - ] + actual_valid_count = len(stripe_df[col]) - stripe_df[col].null_count + stats_valid_count = stripes_stats[stripe_idx][col]["number_of_values"] assert normalized_equals(actual_valid_count, stats_valid_count) @@ -901,9 +880,7 @@ def test_empty_dataframe(): assert_eq(expected_pdf, got_df) -@pytest.mark.parametrize( - "data", [[None, ""], ["", None], [None, None], ["", ""]] -) +@pytest.mark.parametrize("data", [[None, ""], ["", None], [None, None], ["", ""]]) def test_empty_string_columns(data): buffer = BytesIO() @@ -1146,9 +1123,7 @@ def gen_map_buff(size=10000): [ None, { - rd.choice(al): rd.choice( - [None, np.random.randint(1, 1500)] - ), + rd.choice(al): rd.choice([None, np.random.randint(1, 1500)]), }, ] ) @@ -1167,9 +1142,7 @@ def gen_map_buff(size=10000): [ None, [ - rd.choice( - [None, np.random.randint(1, 1500)] - ) + rd.choice([None, np.random.randint(1, 1500)]) for _ in range(5) ], ] @@ -1210,9 +1183,7 @@ def gen_map_buff(size=10000): ) for _ in range(size) ], - type=pa.map_( - pa.string(), pa.struct({"a": pa.int64(), "b": pa.int64()}) - ), + type=pa.map_(pa.string(), pa.struct({"a": pa.int64(), "b": pa.int64()})), ) pa_table = pa.Table.from_arrays( @@ -1220,9 +1191,7 @@ def gen_map_buff(size=10000): ["lvl1_map", "lvl2_map", "lvl2_struct_map"], ) - orc.write_table( - pa_table, buff, stripe_size=1024, compression="UNCOMPRESSED" - ) + orc.write_table(pa_table, buff, stripe_size=1024, compression="UNCOMPRESSED") return buff @@ -1249,11 +1218,7 @@ def test_map_type_read(columns, num_rows, use_index): lvl2_map = ( tbl["lvl2_map"] .combine_chunks() - .view( - pa.list_( - pa.struct({"key": pa.string(), "value": pa.list_(pa.int64())}) - ) - ) + .view(pa.list_(pa.struct({"key": pa.string(), "value": pa.list_(pa.int64())}))) ) lvl2_struct_map = ( tbl["lvl2_struct_map"] @@ -1344,21 +1309,13 @@ def dec(num): }, # with empty elements { - "ls": [ - [str(i), str(2 * i)] if i % 2 else [] for i in range(12345) - ], + "ls": [[str(i), str(2 * i)] if i % 2 else [] for i in range(12345)], "lls": [ - [[str(i), str(2 * i)]] if i % 2 else [[], []] - for i in range(12345) + [[str(i), str(2 * i)]] if i % 2 else [[], []] for i in range(12345) ], "li": [[i, i * i, i % 2] if i % 3 else [] for i in range(12345)], - "lli": [ - [[i], [i * i], [i % 2]] if i % 3 else [[]] - for i in range(12345) - ], - "ld": [ - [dec(i), dec(i / 2)] if i % 5 else [] for i in range(12345) - ], + "lli": [[[i], [i * i], [i % 2]] if i % 3 else [[]] for i in range(12345)], + "ld": [[dec(i), dec(i / 2)] if i % 5 else [] for i in range(12345)], }, # variable list lengths { @@ -1374,9 +1331,7 @@ def test_orc_writer_lists(data): pdf_in = pd.DataFrame(data) buffer = BytesIO() - cudf.from_pandas(pdf_in).to_orc( - buffer, stripe_size_rows=2048, row_index_stride=512 - ) + cudf.from_pandas(pdf_in).to_orc(buffer, stripe_size_rows=2048, row_index_stride=512) pdf_out = pd.read_orc(buffer) assert_eq(pdf_out, pdf_in) @@ -1498,9 +1453,7 @@ def test_statistics_sum_overflow(): minint64 = np.iinfo(np.int64).min buff = BytesIO() - df = pd.DataFrame( - {"a": [maxint64, 1], "b": [minint64, -1], "c": [minint64, 1]} - ) + df = pd.DataFrame({"a": [maxint64, 1], "b": [minint64, -1], "c": [minint64, 1]}) df.to_orc(buff) file_stats, stripe_stats = cudf.io.orc.read_orc_statistics([buff]) @@ -1663,9 +1616,7 @@ def test_orc_writer_nvcomp(compression): def run_orc_columns_and_index_param(index_obj, index, columns): buffer = BytesIO() - df = cudf.DataFrame( - {"a": [1, 2, 3], "b": ["a", "b", "c"]}, index=index_obj - ) + df = cudf.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}, index=index_obj) df.to_orc(buffer, index=index) expected = pd.read_orc(buffer, columns=columns) @@ -1793,9 +1744,7 @@ def test_orc_writer_cols_as_map_type(df_data, cols_as_map_type, expected_data): def test_orc_writer_cols_as_map_type_error(): - df = cudf.DataFrame( - {"a": cudf.Series([[{"a": 10, "b": 20}], [{"a": 1, "b": 21}]])} - ) + df = cudf.DataFrame({"a": cudf.Series([[{"a": 10, "b": 20}], [{"a": 1, "b": 21}]])}) buffer = BytesIO() with pytest.raises( TypeError, match="cols_as_map_type must be a list of column names." @@ -1911,3 +1860,25 @@ def test_orc_reader_empty_deeply_nested_level(datadir): got = cudf.read_orc(path) assert_eq(expect, got) + + +def test_orc_chunked_writer_stripe_size(datadir): + from pyarrow import orc + + df = cudf.DataFrame({"col": gen_rand_series("int", 100000)}) + + buffer = BytesIO() + writer = ORCWriter(buffer, stripe_size_bytes=64 * 1024) + writer.write_table(df) + writer.close() + + orc_file = orc.ORCFile(buffer) + assert_eq(orc_file.nstripes, 10) + + buffer = BytesIO() + writer = ORCWriter(buffer, stripe_size_rows=20000) + writer.write_table(df) + writer.close() + + orc_file = orc.ORCFile(buffer) + assert_eq(orc_file.nstripes, 5) From 83201e9a9b38d07078945efcf750e690cb3a4702 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Thu, 18 Jan 2024 12:46:05 -0800 Subject: [PATCH 3/5] row index stride --- python/cudf/cudf/_lib/orc.pyx | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx index 663cbeadf7f..49d93402c82 100644 --- a/python/cudf/cudf/_lib/orc.pyx +++ b/python/cudf/cudf/_lib/orc.pyx @@ -378,6 +378,7 @@ cdef class ORCWriter: cdef object cols_as_map_type cdef object stripe_size_bytes cdef object stripe_size_rows + cdef object row_index_stride def __cinit__(self, object path, @@ -386,7 +387,8 @@ cdef class ORCWriter: object statistics="ROWGROUP", object cols_as_map_type=None, object stripe_size_bytes=None, - object stripe_size_rows=None): + object stripe_size_rows=None, + object row_index_stride=None): self.sink = make_sink_info(path, self._data_sink) self.stat_freq = _get_orc_stat_freq(statistics) @@ -396,6 +398,7 @@ cdef class ORCWriter: if cols_as_map_type is None else set(cols_as_map_type) self.stripe_size_bytes = stripe_size_bytes self.stripe_size_rows = stripe_size_rows + self.row_index_stride = row_index_stride self.initialized = False def write_table(self, table): @@ -475,6 +478,8 @@ cdef class ORCWriter: c_opts.set_stripe_size_bytes(self.stripe_size_bytes) if self.stripe_size_rows is not None: c_opts.set_stripe_size_rows(self.stripe_size_rows) + if self.row_index_stride is not None: + c_opts.set_row_index_stride(self.row_index_stride) with nogil: self.writer.reset(new orc_chunked_writer(c_opts)) From 0b124efac194dc1ae3315c440548523a222bb439 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Thu, 18 Jan 2024 12:49:26 -0800 Subject: [PATCH 4/5] style --- python/cudf/cudf/tests/test_orc.py | 105 ++++++++++++++++++++++------- 1 file changed, 80 insertions(+), 25 deletions(-) diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index 1d8a95faa65..3bc68a3c329 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -93,7 +93,9 @@ def test_orc_reader_basic(datadir, inputfile, columns, use_index, engine): path = datadir / inputfile expect = pd.read_orc(path, columns=columns) - got = cudf.read_orc(path, engine=engine, columns=columns, use_index=use_index) + got = cudf.read_orc( + path, engine=engine, columns=columns, use_index=use_index + ) assert_frame_equal(cudf.from_pandas(expect), got, check_categorical=False) @@ -114,7 +116,9 @@ def test_orc_reader_local_filepath(): cudf.read_orc(path) -@pytest.mark.parametrize("src", ["filepath", "pathobj", "bytes_io", "bytes", "url"]) +@pytest.mark.parametrize( + "src", ["filepath", "pathobj", "bytes_io", "bytes", "url"] +) def test_orc_reader_filepath_or_buffer(path_or_buf, src): cols = ["int1", "long1", "float1", "double1"] @@ -249,12 +253,17 @@ def test_orc_read_stripes(datadir, engine): num_rows, stripes, col_names = cudf.io.read_orc_metadata(path) # Read stripes one at a time - gdf = [cudf.read_orc(path, engine=engine, stripes=[[i]]) for i in range(stripes)] + gdf = [ + cudf.read_orc(path, engine=engine, stripes=[[i]]) + for i in range(stripes) + ] gdf = cudf.concat(gdf).reset_index(drop=True) assert_eq(pdf, gdf, check_categorical=False, check_index_type=True) # Read stripes all at once - gdf = cudf.read_orc(path, engine=engine, stripes=[[int(x) for x in range(stripes)]]) + gdf = cudf.read_orc( + path, engine=engine, stripes=[[int(x) for x in range(stripes)]] + ) assert_eq(pdf, gdf, check_categorical=False) # Read only some stripes @@ -313,7 +322,9 @@ def test_orc_read_skiprows(): # repro for other sizes of data skiprows = 10 - expected = pd.read_orc(buff)[skiprows:].reset_index(drop=True).astype("bool") + expected = ( + pd.read_orc(buff)[skiprows:].reset_index(drop=True).astype("bool") + ) got = cudf.read_orc(buff, skiprows=skiprows) assert_eq(expected, got) @@ -427,7 +438,9 @@ def test_chunked_orc_writer_statistics_frequency(datadir, tmpdir, stats_freq): ("TestOrcFile.demo-12-zlib.orc", ["_col1", "_col3", "_col5"]), ], ) -def test_chunked_orc_writer(datadir, tmpdir, reference_file, columns, compression): +def test_chunked_orc_writer( + datadir, tmpdir, reference_file, columns, compression +): pdf_fname = datadir / reference_file gdf_fname = tmpdir.join("chunked_gdf.orc") @@ -656,7 +669,9 @@ def test_orc_write_statistics(tmpdir, datadir, nrows, stats_freq): assert normalized_equals(actual_max, stats_max) if "number_of_values" in stripes_stats[stripe_idx][col]: - stats_num_vals = stripes_stats[stripe_idx][col]["number_of_values"] + stats_num_vals = stripes_stats[stripe_idx][col][ + "number_of_values" + ] if stats_num_vals is not None: actual_num_vals = stripe_df[col].count() assert stats_num_vals == actual_num_vals @@ -762,7 +777,9 @@ def test_orc_chunked_write_statistics(tmpdir, datadir, nrows, stats_freq): assert normalized_equals(actual_max, stats_max) if "number_of_values" in stripes_stats[stripe_idx][col]: - stats_num_vals = stripes_stats[stripe_idx][col]["number_of_values"] + stats_num_vals = stripes_stats[stripe_idx][col][ + "number_of_values" + ] if stats_num_vals is not None: actual_num_vals = stripe_df[col].count() assert stats_num_vals == actual_num_vals @@ -810,8 +827,12 @@ def test_orc_write_bool_statistics(tmpdir, datadir, nrows): assert normalized_equals(actual_true_count, stats_true_count) if "number_of_values" in stripes_stats[stripe_idx][col]: - actual_valid_count = len(stripe_df[col]) - stripe_df[col].null_count - stats_valid_count = stripes_stats[stripe_idx][col]["number_of_values"] + actual_valid_count = ( + len(stripe_df[col]) - stripe_df[col].null_count + ) + stats_valid_count = stripes_stats[stripe_idx][col][ + "number_of_values" + ] assert normalized_equals(actual_valid_count, stats_valid_count) @@ -880,7 +901,9 @@ def test_empty_dataframe(): assert_eq(expected_pdf, got_df) -@pytest.mark.parametrize("data", [[None, ""], ["", None], [None, None], ["", ""]]) +@pytest.mark.parametrize( + "data", [[None, ""], ["", None], [None, None], ["", ""]] +) def test_empty_string_columns(data): buffer = BytesIO() @@ -1123,7 +1146,9 @@ def gen_map_buff(size=10000): [ None, { - rd.choice(al): rd.choice([None, np.random.randint(1, 1500)]), + rd.choice(al): rd.choice( + [None, np.random.randint(1, 1500)] + ), }, ] ) @@ -1142,7 +1167,9 @@ def gen_map_buff(size=10000): [ None, [ - rd.choice([None, np.random.randint(1, 1500)]) + rd.choice( + [None, np.random.randint(1, 1500)] + ) for _ in range(5) ], ] @@ -1183,7 +1210,9 @@ def gen_map_buff(size=10000): ) for _ in range(size) ], - type=pa.map_(pa.string(), pa.struct({"a": pa.int64(), "b": pa.int64()})), + type=pa.map_( + pa.string(), pa.struct({"a": pa.int64(), "b": pa.int64()}) + ), ) pa_table = pa.Table.from_arrays( @@ -1191,7 +1220,9 @@ def gen_map_buff(size=10000): ["lvl1_map", "lvl2_map", "lvl2_struct_map"], ) - orc.write_table(pa_table, buff, stripe_size=1024, compression="UNCOMPRESSED") + orc.write_table( + pa_table, buff, stripe_size=1024, compression="UNCOMPRESSED" + ) return buff @@ -1218,7 +1249,11 @@ def test_map_type_read(columns, num_rows, use_index): lvl2_map = ( tbl["lvl2_map"] .combine_chunks() - .view(pa.list_(pa.struct({"key": pa.string(), "value": pa.list_(pa.int64())}))) + .view( + pa.list_( + pa.struct({"key": pa.string(), "value": pa.list_(pa.int64())}) + ) + ) ) lvl2_struct_map = ( tbl["lvl2_struct_map"] @@ -1309,13 +1344,21 @@ def dec(num): }, # with empty elements { - "ls": [[str(i), str(2 * i)] if i % 2 else [] for i in range(12345)], + "ls": [ + [str(i), str(2 * i)] if i % 2 else [] for i in range(12345) + ], "lls": [ - [[str(i), str(2 * i)]] if i % 2 else [[], []] for i in range(12345) + [[str(i), str(2 * i)]] if i % 2 else [[], []] + for i in range(12345) ], "li": [[i, i * i, i % 2] if i % 3 else [] for i in range(12345)], - "lli": [[[i], [i * i], [i % 2]] if i % 3 else [[]] for i in range(12345)], - "ld": [[dec(i), dec(i / 2)] if i % 5 else [] for i in range(12345)], + "lli": [ + [[i], [i * i], [i % 2]] if i % 3 else [[]] + for i in range(12345) + ], + "ld": [ + [dec(i), dec(i / 2)] if i % 5 else [] for i in range(12345) + ], }, # variable list lengths { @@ -1331,7 +1374,9 @@ def test_orc_writer_lists(data): pdf_in = pd.DataFrame(data) buffer = BytesIO() - cudf.from_pandas(pdf_in).to_orc(buffer, stripe_size_rows=2048, row_index_stride=512) + cudf.from_pandas(pdf_in).to_orc( + buffer, stripe_size_rows=2048, row_index_stride=512 + ) pdf_out = pd.read_orc(buffer) assert_eq(pdf_out, pdf_in) @@ -1453,7 +1498,9 @@ def test_statistics_sum_overflow(): minint64 = np.iinfo(np.int64).min buff = BytesIO() - df = pd.DataFrame({"a": [maxint64, 1], "b": [minint64, -1], "c": [minint64, 1]}) + df = pd.DataFrame( + {"a": [maxint64, 1], "b": [minint64, -1], "c": [minint64, 1]} + ) df.to_orc(buff) file_stats, stripe_stats = cudf.io.orc.read_orc_statistics([buff]) @@ -1616,7 +1663,9 @@ def test_orc_writer_nvcomp(compression): def run_orc_columns_and_index_param(index_obj, index, columns): buffer = BytesIO() - df = cudf.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}, index=index_obj) + df = cudf.DataFrame( + {"a": [1, 2, 3], "b": ["a", "b", "c"]}, index=index_obj + ) df.to_orc(buffer, index=index) expected = pd.read_orc(buffer, columns=columns) @@ -1744,7 +1793,9 @@ def test_orc_writer_cols_as_map_type(df_data, cols_as_map_type, expected_data): def test_orc_writer_cols_as_map_type_error(): - df = cudf.DataFrame({"a": cudf.Series([[{"a": 10, "b": 20}], [{"a": 1, "b": 21}]])}) + df = cudf.DataFrame( + {"a": cudf.Series([[{"a": 10, "b": 20}], [{"a": 1, "b": 21}]])} + ) buffer = BytesIO() with pytest.raises( TypeError, match="cols_as_map_type must be a list of column names." @@ -1874,11 +1925,15 @@ def test_orc_chunked_writer_stripe_size(datadir): orc_file = orc.ORCFile(buffer) assert_eq(orc_file.nstripes, 10) + got = cudf.read_orc(buffer) buffer = BytesIO() - writer = ORCWriter(buffer, stripe_size_rows=20000) + writer = ORCWriter(buffer, stripe_size_rows=20000, row_index_stride=1000) writer.write_table(df) writer.close() orc_file = orc.ORCFile(buffer) assert_eq(orc_file.nstripes, 5) + got = cudf.read_orc(buffer) + + assert_eq(1, 2) From b92fb95bcff43fe66bd5996ea88428e72e7a89bb Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Thu, 18 Jan 2024 12:53:09 -0800 Subject: [PATCH 5/5] revert debug changes --- python/cudf/cudf/tests/test_orc.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index 3bc68a3c329..6b7f86098a0 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -1925,15 +1925,11 @@ def test_orc_chunked_writer_stripe_size(datadir): orc_file = orc.ORCFile(buffer) assert_eq(orc_file.nstripes, 10) - got = cudf.read_orc(buffer) buffer = BytesIO() - writer = ORCWriter(buffer, stripe_size_rows=20000, row_index_stride=1000) + writer = ORCWriter(buffer, stripe_size_rows=20000) writer.write_table(df) writer.close() orc_file = orc.ORCFile(buffer) assert_eq(orc_file.nstripes, 5) - got = cudf.read_orc(buffer) - - assert_eq(1, 2)