From b1f60d82cbe66ab67ba57b29898cc9e14573c07a Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Wed, 17 Jan 2024 14:59:23 -0800
Subject: [PATCH 1/5] add stripe size support to chunked orc writer

---
 python/cudf/cudf/_lib/orc.pyx | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx
index 0ae039b14d2..663cbeadf7f 100644
--- a/python/cudf/cudf/_lib/orc.pyx
+++ b/python/cudf/cudf/_lib/orc.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 import cudf
 from cudf.core.buffer import acquire_spill_lock
@@ -376,13 +376,17 @@ cdef class ORCWriter:
     cdef object index
     cdef table_input_metadata tbl_meta
     cdef object cols_as_map_type
+    cdef object stripe_size_bytes
+    cdef object stripe_size_rows
 
     def __cinit__(self,
                   object path,
                   object index=None,
                   object compression="snappy",
                   object statistics="ROWGROUP",
-                  object cols_as_map_type=None):
+                  object cols_as_map_type=None,
+                  object stripe_size_bytes=None,
+                  object stripe_size_rows=None):
 
         self.sink = make_sink_info(path, self._data_sink)
         self.stat_freq = _get_orc_stat_freq(statistics)
@@ -390,6 +394,8 @@ cdef class ORCWriter:
         self.index = index
         self.cols_as_map_type = cols_as_map_type \
             if cols_as_map_type is None else set(cols_as_map_type)
+        self.stripe_size_bytes = stripe_size_bytes
+        self.stripe_size_rows = stripe_size_rows
         self.initialized = False
 
     def write_table(self, table):
@@ -457,9 +463,7 @@ cdef class ORCWriter:
         pandas_metadata = generate_pandas_metadata(table, self.index)
         user_data[str.encode("pandas")] = str.encode(pandas_metadata)
 
-        cdef chunked_orc_writer_options args
-        with nogil:
-            args = move(
+        cdef chunked_orc_writer_options c_opts = move(
                 chunked_orc_writer_options.builder(self.sink)
                 .metadata(self.tbl_meta)
                 .key_value_metadata(move(user_data))
@@ -467,7 +471,13 @@ cdef class ORCWriter:
                 .enable_statistics(self.stat_freq)
                 .build()
             )
-            self.writer.reset(new orc_chunked_writer(args))
+        if self.stripe_size_bytes is not None:
+            c_opts.set_stripe_size_bytes(self.stripe_size_bytes)
+        if self.stripe_size_rows is not None:
+            c_opts.set_stripe_size_rows(self.stripe_size_rows)
+
+        with nogil:
+            self.writer.reset(new orc_chunked_writer(c_opts))
 
         self.initialized = True
 

From 98c09a44f07f42552c3095fdebd8802df9a9d425 Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Thu, 18 Jan 2024 12:40:11 -0800
Subject: [PATCH 2/5] add stripe size test for chunked orc writer

---
 python/cudf/cudf/tests/test_orc.py | 121 +++++++++++------------------
 1 file changed, 46 insertions(+), 75 deletions(-)

diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py
index 4630b6eef0a..1d8a95faa65 100644
--- a/python/cudf/cudf/tests/test_orc.py
+++ b/python/cudf/cudf/tests/test_orc.py
@@ -93,9 +93,7 @@ def test_orc_reader_basic(datadir, inputfile, columns, use_index, engine):
     path = datadir / inputfile
 
     expect = pd.read_orc(path, columns=columns)
-    got = cudf.read_orc(
-        path, engine=engine, columns=columns, use_index=use_index
-    )
+    got = cudf.read_orc(path, engine=engine, columns=columns, use_index=use_index)
 
     assert_frame_equal(cudf.from_pandas(expect), got, check_categorical=False)
 
@@ -116,9 +114,7 @@ def test_orc_reader_local_filepath():
     cudf.read_orc(path)
 
 
-@pytest.mark.parametrize(
-    "src", ["filepath", "pathobj", "bytes_io", "bytes", "url"]
-)
+@pytest.mark.parametrize("src", ["filepath", "pathobj", "bytes_io", "bytes", "url"])
 def test_orc_reader_filepath_or_buffer(path_or_buf, src):
     cols = ["int1", "long1", "float1", "double1"]
 
@@ -253,17 +249,12 @@ def test_orc_read_stripes(datadir, engine):
     num_rows, stripes, col_names = cudf.io.read_orc_metadata(path)
 
     # Read stripes one at a time
-    gdf = [
-        cudf.read_orc(path, engine=engine, stripes=[[i]])
-        for i in range(stripes)
-    ]
+    gdf = [cudf.read_orc(path, engine=engine, stripes=[[i]]) for i in range(stripes)]
     gdf = cudf.concat(gdf).reset_index(drop=True)
     assert_eq(pdf, gdf, check_categorical=False, check_index_type=True)
 
     # Read stripes all at once
-    gdf = cudf.read_orc(
-        path, engine=engine, stripes=[[int(x) for x in range(stripes)]]
-    )
+    gdf = cudf.read_orc(path, engine=engine, stripes=[[int(x) for x in range(stripes)]])
     assert_eq(pdf, gdf, check_categorical=False)
 
     # Read only some stripes
@@ -322,9 +313,7 @@ def test_orc_read_skiprows():
     # repro for other sizes of data
     skiprows = 10
 
-    expected = (
-        pd.read_orc(buff)[skiprows:].reset_index(drop=True).astype("bool")
-    )
+    expected = pd.read_orc(buff)[skiprows:].reset_index(drop=True).astype("bool")
     got = cudf.read_orc(buff, skiprows=skiprows)
     assert_eq(expected, got)
 
@@ -438,9 +427,7 @@ def test_chunked_orc_writer_statistics_frequency(datadir, tmpdir, stats_freq):
         ("TestOrcFile.demo-12-zlib.orc", ["_col1", "_col3", "_col5"]),
     ],
 )
-def test_chunked_orc_writer(
-    datadir, tmpdir, reference_file, columns, compression
-):
+def test_chunked_orc_writer(datadir, tmpdir, reference_file, columns, compression):
     pdf_fname = datadir / reference_file
     gdf_fname = tmpdir.join("chunked_gdf.orc")
 
@@ -669,9 +656,7 @@ def test_orc_write_statistics(tmpdir, datadir, nrows, stats_freq):
                     assert normalized_equals(actual_max, stats_max)
 
             if "number_of_values" in stripes_stats[stripe_idx][col]:
-                stats_num_vals = stripes_stats[stripe_idx][col][
-                    "number_of_values"
-                ]
+                stats_num_vals = stripes_stats[stripe_idx][col]["number_of_values"]
                 if stats_num_vals is not None:
                     actual_num_vals = stripe_df[col].count()
                     assert stats_num_vals == actual_num_vals
@@ -777,9 +762,7 @@ def test_orc_chunked_write_statistics(tmpdir, datadir, nrows, stats_freq):
                     assert normalized_equals(actual_max, stats_max)
 
             if "number_of_values" in stripes_stats[stripe_idx][col]:
-                stats_num_vals = stripes_stats[stripe_idx][col][
-                    "number_of_values"
-                ]
+                stats_num_vals = stripes_stats[stripe_idx][col]["number_of_values"]
                 if stats_num_vals is not None:
                     actual_num_vals = stripe_df[col].count()
                     assert stats_num_vals == actual_num_vals
@@ -827,12 +810,8 @@ def test_orc_write_bool_statistics(tmpdir, datadir, nrows):
             assert normalized_equals(actual_true_count, stats_true_count)
 
         if "number_of_values" in stripes_stats[stripe_idx][col]:
-            actual_valid_count = (
-                len(stripe_df[col]) - stripe_df[col].null_count
-            )
-            stats_valid_count = stripes_stats[stripe_idx][col][
-                "number_of_values"
-            ]
+            actual_valid_count = len(stripe_df[col]) - stripe_df[col].null_count
+            stats_valid_count = stripes_stats[stripe_idx][col]["number_of_values"]
             assert normalized_equals(actual_valid_count, stats_valid_count)
 
 
@@ -901,9 +880,7 @@ def test_empty_dataframe():
     assert_eq(expected_pdf, got_df)
 
 
-@pytest.mark.parametrize(
-    "data", [[None, ""], ["", None], [None, None], ["", ""]]
-)
+@pytest.mark.parametrize("data", [[None, ""], ["", None], [None, None], ["", ""]])
 def test_empty_string_columns(data):
     buffer = BytesIO()
 
@@ -1146,9 +1123,7 @@ def gen_map_buff(size=10000):
                 [
                     None,
                     {
-                        rd.choice(al): rd.choice(
-                            [None, np.random.randint(1, 1500)]
-                        ),
+                        rd.choice(al): rd.choice([None, np.random.randint(1, 1500)]),
                     },
                 ]
             )
@@ -1167,9 +1142,7 @@ def gen_map_buff(size=10000):
                                 [
                                     None,
                                     [
-                                        rd.choice(
-                                            [None, np.random.randint(1, 1500)]
-                                        )
+                                        rd.choice([None, np.random.randint(1, 1500)])
                                         for _ in range(5)
                                     ],
                                 ]
@@ -1210,9 +1183,7 @@ def gen_map_buff(size=10000):
             )
             for _ in range(size)
         ],
-        type=pa.map_(
-            pa.string(), pa.struct({"a": pa.int64(), "b": pa.int64()})
-        ),
+        type=pa.map_(pa.string(), pa.struct({"a": pa.int64(), "b": pa.int64()})),
     )
 
     pa_table = pa.Table.from_arrays(
@@ -1220,9 +1191,7 @@ def gen_map_buff(size=10000):
         ["lvl1_map", "lvl2_map", "lvl2_struct_map"],
     )
 
-    orc.write_table(
-        pa_table, buff, stripe_size=1024, compression="UNCOMPRESSED"
-    )
+    orc.write_table(pa_table, buff, stripe_size=1024, compression="UNCOMPRESSED")
 
     return buff
 
@@ -1249,11 +1218,7 @@ def test_map_type_read(columns, num_rows, use_index):
     lvl2_map = (
         tbl["lvl2_map"]
         .combine_chunks()
-        .view(
-            pa.list_(
-                pa.struct({"key": pa.string(), "value": pa.list_(pa.int64())})
-            )
-        )
+        .view(pa.list_(pa.struct({"key": pa.string(), "value": pa.list_(pa.int64())})))
     )
     lvl2_struct_map = (
         tbl["lvl2_struct_map"]
@@ -1344,21 +1309,13 @@ def dec(num):
         },
         # with empty elements
         {
-            "ls": [
-                [str(i), str(2 * i)] if i % 2 else [] for i in range(12345)
-            ],
+            "ls": [[str(i), str(2 * i)] if i % 2 else [] for i in range(12345)],
             "lls": [
-                [[str(i), str(2 * i)]] if i % 2 else [[], []]
-                for i in range(12345)
+                [[str(i), str(2 * i)]] if i % 2 else [[], []] for i in range(12345)
             ],
             "li": [[i, i * i, i % 2] if i % 3 else [] for i in range(12345)],
-            "lli": [
-                [[i], [i * i], [i % 2]] if i % 3 else [[]]
-                for i in range(12345)
-            ],
-            "ld": [
-                [dec(i), dec(i / 2)] if i % 5 else [] for i in range(12345)
-            ],
+            "lli": [[[i], [i * i], [i % 2]] if i % 3 else [[]] for i in range(12345)],
+            "ld": [[dec(i), dec(i / 2)] if i % 5 else [] for i in range(12345)],
         },
         # variable list lengths
         {
@@ -1374,9 +1331,7 @@ def test_orc_writer_lists(data):
     pdf_in = pd.DataFrame(data)
 
     buffer = BytesIO()
-    cudf.from_pandas(pdf_in).to_orc(
-        buffer, stripe_size_rows=2048, row_index_stride=512
-    )
+    cudf.from_pandas(pdf_in).to_orc(buffer, stripe_size_rows=2048, row_index_stride=512)
 
     pdf_out = pd.read_orc(buffer)
     assert_eq(pdf_out, pdf_in)
@@ -1498,9 +1453,7 @@ def test_statistics_sum_overflow():
     minint64 = np.iinfo(np.int64).min
 
     buff = BytesIO()
-    df = pd.DataFrame(
-        {"a": [maxint64, 1], "b": [minint64, -1], "c": [minint64, 1]}
-    )
+    df = pd.DataFrame({"a": [maxint64, 1], "b": [minint64, -1], "c": [minint64, 1]})
     df.to_orc(buff)
 
     file_stats, stripe_stats = cudf.io.orc.read_orc_statistics([buff])
@@ -1663,9 +1616,7 @@ def test_orc_writer_nvcomp(compression):
 
 def run_orc_columns_and_index_param(index_obj, index, columns):
     buffer = BytesIO()
-    df = cudf.DataFrame(
-        {"a": [1, 2, 3], "b": ["a", "b", "c"]}, index=index_obj
-    )
+    df = cudf.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}, index=index_obj)
     df.to_orc(buffer, index=index)
 
     expected = pd.read_orc(buffer, columns=columns)
@@ -1793,9 +1744,7 @@ def test_orc_writer_cols_as_map_type(df_data, cols_as_map_type, expected_data):
 
 
 def test_orc_writer_cols_as_map_type_error():
-    df = cudf.DataFrame(
-        {"a": cudf.Series([[{"a": 10, "b": 20}], [{"a": 1, "b": 21}]])}
-    )
+    df = cudf.DataFrame({"a": cudf.Series([[{"a": 10, "b": 20}], [{"a": 1, "b": 21}]])})
     buffer = BytesIO()
     with pytest.raises(
         TypeError, match="cols_as_map_type must be a list of column names."
@@ -1911,3 +1860,25 @@ def test_orc_reader_empty_deeply_nested_level(datadir):
     got = cudf.read_orc(path)
 
     assert_eq(expect, got)
+
+
+def test_orc_chunked_writer_stripe_size(datadir):
+    from pyarrow import orc
+
+    df = cudf.DataFrame({"col": gen_rand_series("int", 100000)})
+
+    buffer = BytesIO()
+    writer = ORCWriter(buffer, stripe_size_bytes=64 * 1024)
+    writer.write_table(df)
+    writer.close()
+
+    orc_file = orc.ORCFile(buffer)
+    assert_eq(orc_file.nstripes, 10)
+
+    buffer = BytesIO()
+    writer = ORCWriter(buffer, stripe_size_rows=20000)
+    writer.write_table(df)
+    writer.close()
+
+    orc_file = orc.ORCFile(buffer)
+    assert_eq(orc_file.nstripes, 5)

From 83201e9a9b38d07078945efcf750e690cb3a4702 Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Thu, 18 Jan 2024 12:46:05 -0800
Subject: [PATCH 3/5] add row index stride support to chunked orc writer

---
 python/cudf/cudf/_lib/orc.pyx | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx
index 663cbeadf7f..49d93402c82 100644
--- a/python/cudf/cudf/_lib/orc.pyx
+++ b/python/cudf/cudf/_lib/orc.pyx
@@ -378,6 +378,7 @@ cdef class ORCWriter:
     cdef object cols_as_map_type
     cdef object stripe_size_bytes
     cdef object stripe_size_rows
+    cdef object row_index_stride
 
     def __cinit__(self,
                   object path,
@@ -386,7 +387,8 @@ cdef class ORCWriter:
                   object statistics="ROWGROUP",
                   object cols_as_map_type=None,
                   object stripe_size_bytes=None,
-                  object stripe_size_rows=None):
+                  object stripe_size_rows=None,
+                  object row_index_stride=None):
 
         self.sink = make_sink_info(path, self._data_sink)
         self.stat_freq = _get_orc_stat_freq(statistics)
@@ -396,6 +398,7 @@ cdef class ORCWriter:
             if cols_as_map_type is None else set(cols_as_map_type)
         self.stripe_size_bytes = stripe_size_bytes
         self.stripe_size_rows = stripe_size_rows
+        self.row_index_stride = row_index_stride
         self.initialized = False
 
     def write_table(self, table):
@@ -475,6 +478,8 @@ cdef class ORCWriter:
             c_opts.set_stripe_size_bytes(self.stripe_size_bytes)
         if self.stripe_size_rows is not None:
             c_opts.set_stripe_size_rows(self.stripe_size_rows)
+        if self.row_index_stride is not None:
+            c_opts.set_row_index_stride(self.row_index_stride)
 
         with nogil:
             self.writer.reset(new orc_chunked_writer(c_opts))

From 0b124efac194dc1ae3315c440548523a222bb439 Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Thu, 18 Jan 2024 12:49:26 -0800
Subject: [PATCH 4/5] style: restore line wrapping in test_orc.py

---
 python/cudf/cudf/tests/test_orc.py | 105 ++++++++++++++++++++++-------
 1 file changed, 80 insertions(+), 25 deletions(-)

diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py
index 1d8a95faa65..3bc68a3c329 100644
--- a/python/cudf/cudf/tests/test_orc.py
+++ b/python/cudf/cudf/tests/test_orc.py
@@ -93,7 +93,9 @@ def test_orc_reader_basic(datadir, inputfile, columns, use_index, engine):
     path = datadir / inputfile
 
     expect = pd.read_orc(path, columns=columns)
-    got = cudf.read_orc(path, engine=engine, columns=columns, use_index=use_index)
+    got = cudf.read_orc(
+        path, engine=engine, columns=columns, use_index=use_index
+    )
 
     assert_frame_equal(cudf.from_pandas(expect), got, check_categorical=False)
 
@@ -114,7 +116,9 @@ def test_orc_reader_local_filepath():
     cudf.read_orc(path)
 
 
-@pytest.mark.parametrize("src", ["filepath", "pathobj", "bytes_io", "bytes", "url"])
+@pytest.mark.parametrize(
+    "src", ["filepath", "pathobj", "bytes_io", "bytes", "url"]
+)
 def test_orc_reader_filepath_or_buffer(path_or_buf, src):
     cols = ["int1", "long1", "float1", "double1"]
 
@@ -249,12 +253,17 @@ def test_orc_read_stripes(datadir, engine):
     num_rows, stripes, col_names = cudf.io.read_orc_metadata(path)
 
     # Read stripes one at a time
-    gdf = [cudf.read_orc(path, engine=engine, stripes=[[i]]) for i in range(stripes)]
+    gdf = [
+        cudf.read_orc(path, engine=engine, stripes=[[i]])
+        for i in range(stripes)
+    ]
     gdf = cudf.concat(gdf).reset_index(drop=True)
     assert_eq(pdf, gdf, check_categorical=False, check_index_type=True)
 
     # Read stripes all at once
-    gdf = cudf.read_orc(path, engine=engine, stripes=[[int(x) for x in range(stripes)]])
+    gdf = cudf.read_orc(
+        path, engine=engine, stripes=[[int(x) for x in range(stripes)]]
+    )
     assert_eq(pdf, gdf, check_categorical=False)
 
     # Read only some stripes
@@ -313,7 +322,9 @@ def test_orc_read_skiprows():
     # repro for other sizes of data
     skiprows = 10
 
-    expected = pd.read_orc(buff)[skiprows:].reset_index(drop=True).astype("bool")
+    expected = (
+        pd.read_orc(buff)[skiprows:].reset_index(drop=True).astype("bool")
+    )
     got = cudf.read_orc(buff, skiprows=skiprows)
     assert_eq(expected, got)
 
@@ -427,7 +438,9 @@ def test_chunked_orc_writer_statistics_frequency(datadir, tmpdir, stats_freq):
         ("TestOrcFile.demo-12-zlib.orc", ["_col1", "_col3", "_col5"]),
     ],
 )
-def test_chunked_orc_writer(datadir, tmpdir, reference_file, columns, compression):
+def test_chunked_orc_writer(
+    datadir, tmpdir, reference_file, columns, compression
+):
     pdf_fname = datadir / reference_file
     gdf_fname = tmpdir.join("chunked_gdf.orc")
 
@@ -656,7 +669,9 @@ def test_orc_write_statistics(tmpdir, datadir, nrows, stats_freq):
                     assert normalized_equals(actual_max, stats_max)
 
             if "number_of_values" in stripes_stats[stripe_idx][col]:
-                stats_num_vals = stripes_stats[stripe_idx][col]["number_of_values"]
+                stats_num_vals = stripes_stats[stripe_idx][col][
+                    "number_of_values"
+                ]
                 if stats_num_vals is not None:
                     actual_num_vals = stripe_df[col].count()
                     assert stats_num_vals == actual_num_vals
@@ -762,7 +777,9 @@ def test_orc_chunked_write_statistics(tmpdir, datadir, nrows, stats_freq):
                     assert normalized_equals(actual_max, stats_max)
 
             if "number_of_values" in stripes_stats[stripe_idx][col]:
-                stats_num_vals = stripes_stats[stripe_idx][col]["number_of_values"]
+                stats_num_vals = stripes_stats[stripe_idx][col][
+                    "number_of_values"
+                ]
                 if stats_num_vals is not None:
                     actual_num_vals = stripe_df[col].count()
                     assert stats_num_vals == actual_num_vals
@@ -810,8 +827,12 @@ def test_orc_write_bool_statistics(tmpdir, datadir, nrows):
             assert normalized_equals(actual_true_count, stats_true_count)
 
         if "number_of_values" in stripes_stats[stripe_idx][col]:
-            actual_valid_count = len(stripe_df[col]) - stripe_df[col].null_count
-            stats_valid_count = stripes_stats[stripe_idx][col]["number_of_values"]
+            actual_valid_count = (
+                len(stripe_df[col]) - stripe_df[col].null_count
+            )
+            stats_valid_count = stripes_stats[stripe_idx][col][
+                "number_of_values"
+            ]
             assert normalized_equals(actual_valid_count, stats_valid_count)
 
 
@@ -880,7 +901,9 @@ def test_empty_dataframe():
     assert_eq(expected_pdf, got_df)
 
 
-@pytest.mark.parametrize("data", [[None, ""], ["", None], [None, None], ["", ""]])
+@pytest.mark.parametrize(
+    "data", [[None, ""], ["", None], [None, None], ["", ""]]
+)
 def test_empty_string_columns(data):
     buffer = BytesIO()
 
@@ -1123,7 +1146,9 @@ def gen_map_buff(size=10000):
                 [
                     None,
                     {
-                        rd.choice(al): rd.choice([None, np.random.randint(1, 1500)]),
+                        rd.choice(al): rd.choice(
+                            [None, np.random.randint(1, 1500)]
+                        ),
                     },
                 ]
             )
@@ -1142,7 +1167,9 @@ def gen_map_buff(size=10000):
                                 [
                                     None,
                                     [
-                                        rd.choice([None, np.random.randint(1, 1500)])
+                                        rd.choice(
+                                            [None, np.random.randint(1, 1500)]
+                                        )
                                         for _ in range(5)
                                     ],
                                 ]
@@ -1183,7 +1210,9 @@ def gen_map_buff(size=10000):
             )
             for _ in range(size)
         ],
-        type=pa.map_(pa.string(), pa.struct({"a": pa.int64(), "b": pa.int64()})),
+        type=pa.map_(
+            pa.string(), pa.struct({"a": pa.int64(), "b": pa.int64()})
+        ),
     )
 
     pa_table = pa.Table.from_arrays(
@@ -1191,7 +1220,9 @@ def gen_map_buff(size=10000):
         ["lvl1_map", "lvl2_map", "lvl2_struct_map"],
     )
 
-    orc.write_table(pa_table, buff, stripe_size=1024, compression="UNCOMPRESSED")
+    orc.write_table(
+        pa_table, buff, stripe_size=1024, compression="UNCOMPRESSED"
+    )
 
     return buff
 
@@ -1218,7 +1249,11 @@ def test_map_type_read(columns, num_rows, use_index):
     lvl2_map = (
         tbl["lvl2_map"]
         .combine_chunks()
-        .view(pa.list_(pa.struct({"key": pa.string(), "value": pa.list_(pa.int64())})))
+        .view(
+            pa.list_(
+                pa.struct({"key": pa.string(), "value": pa.list_(pa.int64())})
+            )
+        )
     )
     lvl2_struct_map = (
         tbl["lvl2_struct_map"]
@@ -1309,13 +1344,21 @@ def dec(num):
         },
         # with empty elements
         {
-            "ls": [[str(i), str(2 * i)] if i % 2 else [] for i in range(12345)],
+            "ls": [
+                [str(i), str(2 * i)] if i % 2 else [] for i in range(12345)
+            ],
             "lls": [
-                [[str(i), str(2 * i)]] if i % 2 else [[], []] for i in range(12345)
+                [[str(i), str(2 * i)]] if i % 2 else [[], []]
+                for i in range(12345)
             ],
             "li": [[i, i * i, i % 2] if i % 3 else [] for i in range(12345)],
-            "lli": [[[i], [i * i], [i % 2]] if i % 3 else [[]] for i in range(12345)],
-            "ld": [[dec(i), dec(i / 2)] if i % 5 else [] for i in range(12345)],
+            "lli": [
+                [[i], [i * i], [i % 2]] if i % 3 else [[]]
+                for i in range(12345)
+            ],
+            "ld": [
+                [dec(i), dec(i / 2)] if i % 5 else [] for i in range(12345)
+            ],
         },
         # variable list lengths
         {
@@ -1331,7 +1374,9 @@ def test_orc_writer_lists(data):
     pdf_in = pd.DataFrame(data)
 
     buffer = BytesIO()
-    cudf.from_pandas(pdf_in).to_orc(buffer, stripe_size_rows=2048, row_index_stride=512)
+    cudf.from_pandas(pdf_in).to_orc(
+        buffer, stripe_size_rows=2048, row_index_stride=512
+    )
 
     pdf_out = pd.read_orc(buffer)
     assert_eq(pdf_out, pdf_in)
@@ -1453,7 +1498,9 @@ def test_statistics_sum_overflow():
     minint64 = np.iinfo(np.int64).min
 
     buff = BytesIO()
-    df = pd.DataFrame({"a": [maxint64, 1], "b": [minint64, -1], "c": [minint64, 1]})
+    df = pd.DataFrame(
+        {"a": [maxint64, 1], "b": [minint64, -1], "c": [minint64, 1]}
+    )
     df.to_orc(buff)
 
     file_stats, stripe_stats = cudf.io.orc.read_orc_statistics([buff])
@@ -1616,7 +1663,9 @@ def test_orc_writer_nvcomp(compression):
 
 def run_orc_columns_and_index_param(index_obj, index, columns):
     buffer = BytesIO()
-    df = cudf.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}, index=index_obj)
+    df = cudf.DataFrame(
+        {"a": [1, 2, 3], "b": ["a", "b", "c"]}, index=index_obj
+    )
     df.to_orc(buffer, index=index)
 
     expected = pd.read_orc(buffer, columns=columns)
@@ -1744,7 +1793,9 @@ def test_orc_writer_cols_as_map_type(df_data, cols_as_map_type, expected_data):
 
 
 def test_orc_writer_cols_as_map_type_error():
-    df = cudf.DataFrame({"a": cudf.Series([[{"a": 10, "b": 20}], [{"a": 1, "b": 21}]])})
+    df = cudf.DataFrame(
+        {"a": cudf.Series([[{"a": 10, "b": 20}], [{"a": 1, "b": 21}]])}
+    )
     buffer = BytesIO()
     with pytest.raises(
         TypeError, match="cols_as_map_type must be a list of column names."
@@ -1874,11 +1925,15 @@ def test_orc_chunked_writer_stripe_size(datadir):
 
     orc_file = orc.ORCFile(buffer)
     assert_eq(orc_file.nstripes, 10)
+    got = cudf.read_orc(buffer)
 
     buffer = BytesIO()
-    writer = ORCWriter(buffer, stripe_size_rows=20000)
+    writer = ORCWriter(buffer, stripe_size_rows=20000, row_index_stride=1000)
     writer.write_table(df)
     writer.close()
 
     orc_file = orc.ORCFile(buffer)
     assert_eq(orc_file.nstripes, 5)
+    got = cudf.read_orc(buffer)
+
+    assert_eq(1, 2)

From b92fb95bcff43fe66bd5996ea88428e72e7a89bb Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Thu, 18 Jan 2024 12:53:09 -0800
Subject: [PATCH 5/5] revert debug changes

---
 python/cudf/cudf/tests/test_orc.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py
index 3bc68a3c329..6b7f86098a0 100644
--- a/python/cudf/cudf/tests/test_orc.py
+++ b/python/cudf/cudf/tests/test_orc.py
@@ -1925,15 +1925,11 @@ def test_orc_chunked_writer_stripe_size(datadir):
 
     orc_file = orc.ORCFile(buffer)
     assert_eq(orc_file.nstripes, 10)
-    got = cudf.read_orc(buffer)
 
     buffer = BytesIO()
-    writer = ORCWriter(buffer, stripe_size_rows=20000, row_index_stride=1000)
+    writer = ORCWriter(buffer, stripe_size_rows=20000)
     writer.write_table(df)
     writer.close()
 
     orc_file = orc.ORCFile(buffer)
     assert_eq(orc_file.nstripes, 5)
-    got = cudf.read_orc(buffer)
-
-    assert_eq(1, 2)