From 90f89a19ee93ee71e71a435942411f97a6b6640a Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Sat, 4 Sep 2021 17:15:57 -0700
Subject: [PATCH 1/7] add initial datagenerator struct support

---
 python/cudf/cudf/testing/dataset_generator.py | 74 +++++++++++++++++--
 1 file changed, 66 insertions(+), 8 deletions(-)

diff --git a/python/cudf/cudf/testing/dataset_generator.py b/python/cudf/cudf/testing/dataset_generator.py
index cdea22a05af..080faea71d1 100644
--- a/python/cudf/cudf/testing/dataset_generator.py
+++ b/python/cudf/cudf/testing/dataset_generator.py
@@ -352,6 +352,25 @@ def rand_dataframe(
                     dtype=dtype,
                 )
             )
+        elif dtype == "struct":
+            nesting_max_depth = meta["nesting_max_depth"]
+            nesting_depth = np.random.randint(1, nesting_max_depth)
+
+            # TODO: Fix me
+            # column_params.append(
+            #     ColumnParameters(
+            #         cardinality=cardinality,
+            #         null_frequency=null_frequency,
+            #         generator=list_generator(
+            #             dtype=value_type,
+            #             size=cardinality,
+            #             nesting_depth=nesting_depth,
+            #             lists_max_length=lists_max_length,
+            #         ),
+            #         is_sorted=False,
+            #         dtype=dtype,
+            #     )
+            # )
         elif dtype == "decimal64":
             max_precision = meta.get(
                 "max_precision", cudf.Decimal64Dtype.MAX_PRECISION
@@ -535,11 +554,15 @@ def decimal_generator(dtype, size):
     )
 
 
-def get_values_for_nested_data(dtype, lists_max_length):
+def get_values_for_nested_data(dtype, lists_max_length=None, size=None):
     """
     Returns list of values based on dtype.
     """
-    cardinality = np.random.randint(0, lists_max_length)
+    if size is None:
+        cardinality = np.random.randint(0, lists_max_length)
+    else:
+        cardinality = size
+
     dtype = cudf.dtype(dtype)
     if dtype.kind in ("i", "u"):
         values = int_generator(dtype=dtype, size=cardinality)()
@@ -563,12 +586,7 @@ def get_values_for_nested_data(dtype, lists_max_length):
     else:
         raise TypeError(f"Unsupported dtype: {dtype}")
 
-    # To ensure numpy arrays are not passed as input to
-    # list constructor, returning a python list object here.
-    if isinstance(values, np.ndarray):
-        return values.tolist()
-    else:
-        return values
+    return values
 
 
 def make_lists(dtype, lists_max_length, nesting_depth, top_level_list):
@@ -592,9 +610,26 @@ def make_lists(dtype, lists_max_length, nesting_depth, top_level_list):
         top_level_list = get_values_for_nested_data(
             dtype=dtype, lists_max_length=lists_max_length
         )
+        # To ensure numpy arrays are not passed as input to
+        # list constructor, returning a python list object here.
+        if isinstance(top_level_list, np.ndarray):
+            top_level_list = top_level_list.tolist()
+
     return top_level_list
 
 
+def make_array_for_struct(dtype, size):
+    """
+    Helper to create a pa.array with `size` and `dtype`
+    for a `StructArray`.
+    """
+
+    data = get_values_for_nested_data(
+        dtype=dtype.type.to_pandas_dtype(), size=size
+    )
+    return pa.array(data, type=dtype.type)
+
+
 def get_nested_lists(dtype, size, nesting_depth, lists_max_length):
     """
     Returns a list of nested lists with random nesting
@@ -615,6 +650,22 @@ def get_nested_lists(dtype, size, nesting_depth, lists_max_length):
     return list_of_lists
 
 
+def get_nested_structs(dtype, size):
+    """
+    Returns a list of arrays with random data
+    corresponding to the dtype provided.
+    ``dtype`` here should be a ``cudf.StructDtype``
+    """
+    list_of_arrays = []
+
+    for name, col_dtype in dtype.fields.items():
+        list_of_arrays.append(
+            make_array_for_struct(dtype=dtype._typ[name], size=size)
+        )
+
+    return list_of_arrays
+
+
 def list_generator(dtype, size, nesting_depth, lists_max_length):
     """
     Generator for list data
@@ -625,3 +676,10 @@ def list_generator(dtype, size, nesting_depth, lists_max_length):
         nesting_depth=nesting_depth,
         lists_max_length=lists_max_length,
     )
+
+
+def struct_generator(dtype, size):
+    """
+    Generator for struct data
+    """
+    return lambda: get_nested_structs(dtype=dtype, size=size,)

From ab6b0f2a8e7e6b6d06ff48bded236ea931f3cf87 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Tue, 7 Sep 2021 23:23:41 -0500
Subject: [PATCH 2/7] implementation details

---
 python/cudf/cudf/_fuzz_testing/io.py          |   6 +
 python/cudf/cudf/_fuzz_testing/main.py        |   4 +-
 python/cudf/cudf/_fuzz_testing/parquet.py     |   4 +
 python/cudf/cudf/_fuzz_testing/utils.py       |  20 +++
 python/cudf/cudf/testing/dataset_generator.py | 123 ++++++++++++++----
 5 files changed, 129 insertions(+), 28 deletions(-)

diff --git a/python/cudf/cudf/_fuzz_testing/io.py b/python/cudf/cudf/_fuzz_testing/io.py
index 1312300f714..193fb4c7f7f 100644
--- a/python/cudf/cudf/_fuzz_testing/io.py
+++ b/python/cudf/cudf/_fuzz_testing/io.py
@@ -25,6 +25,9 @@ def __init__(
         max_string_length=None,
         max_lists_length=None,
         max_lists_nesting_depth=None,
+        max_structs_nesting_depth=None,
+        max_struct_null_frequency=None,
+        max_struct_types_at_each_level=None,
     ):
         dirs = [] if dirs is None else dirs
         self._inputs = []
@@ -33,6 +36,9 @@ def __init__(
         self._max_string_length = max_string_length
         self._max_lists_length = max_lists_length
         self._max_lists_nesting_depth = max_lists_nesting_depth
+        self._max_structs_nesting_depth = max_structs_nesting_depth
+        self._max_struct_null_frequency = max_struct_null_frequency
+        self._max_struct_types_at_each_level = max_struct_types_at_each_level
 
         for i, path in enumerate(dirs):
             if i == 0 and not os.path.exists(path):
diff --git a/python/cudf/cudf/_fuzz_testing/main.py b/python/cudf/cudf/_fuzz_testing/main.py
index 7b28a4c4970..e7c0c326676 100644
--- a/python/cudf/cudf/_fuzz_testing/main.py
+++ b/python/cudf/cudf/_fuzz_testing/main.py
@@ -13,8 +13,8 @@ def __init__(self, func, params=None, data_handle=None, **kwargs):
             dirs=kwargs.get("dir", None),
             crash_reports_dir=kwargs.get("crash_reports_dir", None),
             regression=kwargs.get("regression", False),
-            max_rows_size=kwargs.get("max_rows_size", 100_000),
-            max_cols_size=kwargs.get("max_cols_size", 1000),
+            max_rows_size=kwargs.get("max_rows_size", 100),
+            max_cols_size=kwargs.get("max_cols_size", 50),
             runs=kwargs.get("runs", -1),
             max_string_length=kwargs.get("max_string_length", None),
             params=params,
diff --git a/python/cudf/cudf/_fuzz_testing/parquet.py b/python/cudf/cudf/_fuzz_testing/parquet.py
index 5b00f96d88d..ecad18acc81 100644
--- a/python/cudf/cudf/_fuzz_testing/parquet.py
+++ b/python/cudf/cudf/_fuzz_testing/parquet.py
@@ -59,6 +59,7 @@ def generate_input(self):
                 - {"uint32"}
                 | {"list", "decimal64"}
             )
+            dtypes_list = ["struct"]
             dtypes_meta, num_rows, num_cols = _generate_rand_meta(
                 self, dtypes_list
             )
@@ -80,6 +81,9 @@ def generate_input(self):
         # https://issues.apache.org/jira/browse/ARROW-10123
 
         # file = io.BytesIO()
+        import pdb
+
+        pdb.set_trace()
         df.to_parquet("temp_file")
         # file.seek(0)
         # self._current_buffer = copy.copy(file.read())
diff --git a/python/cudf/cudf/_fuzz_testing/utils.py b/python/cudf/cudf/_fuzz_testing/utils.py
index 0e68f1c71cc..9949d1068c7 100644
--- a/python/cudf/cudf/_fuzz_testing/utils.py
+++ b/python/cudf/cudf/_fuzz_testing/utils.py
@@ -114,6 +114,28 @@ def _generate_rand_meta(obj, dtypes_list, null_frequency_override=None):
             meta["value_type"] = random.choice(
                 list(cudf.utils.dtypes.ALL_TYPES - {"category"})
             )
+        elif dtype == "struct":
+            # Struct columns use their own nesting cap, which is configured
+            # separately from the list nesting cap on the IO object.
+            if obj._max_structs_nesting_depth is None:
+                meta["nesting_max_depth"] = np.random.randint(2, 10)
+            else:
+                meta["nesting_max_depth"] = obj._max_structs_nesting_depth
+
+            if obj._max_struct_null_frequency is None:
+                meta["max_null_frequency"] = random.uniform(0, 1)
+            else:
+                meta["max_null_frequency"] = obj._max_struct_null_frequency
+
+            if obj._max_struct_types_at_each_level is None:
+                meta["max_types_at_each_level"] = np.random.randint(
+                    low=1, high=10
+                )
+            else:
+                meta[
+                    "max_types_at_each_level"
+                ] = obj._max_struct_types_at_each_level
+
         elif dtype == "decimal64":
             meta["max_precision"] = cudf.Decimal64Dtype.MAX_PRECISION
 
diff --git a/python/cudf/cudf/testing/dataset_generator.py b/python/cudf/cudf/testing/dataset_generator.py
index 080faea71d1..5557d9d64b9 100644
--- a/python/cudf/cudf/testing/dataset_generator.py
+++ b/python/cudf/cudf/testing/dataset_generator.py
@@ -133,7 +133,25 @@ def _generate_column(column_params, num_rows):
         else:
             arrow_type = None
 
-        if not isinstance(arrow_type, pa.lib.Decimal128Type):
+        if isinstance(column_params.dtype, cudf.StructDtype):
+            vals = pa.StructArray.from_arrays(
+                column_params.generator,
+                names=column_params.dtype.fields.keys(),
+                mask=pa.array(
+                    np.random.choice(
+                        [True, False],
+                        size=num_rows,
+                        p=[
+                            column_params.null_frequency,
+                            1 - column_params.null_frequency,
+                        ],
+                    )
+                )
+                if column_params.null_frequency > 0.0
+                else None,
+            )
+            return vals
+        elif not isinstance(arrow_type, pa.lib.Decimal128Type):
             vals = pa.array(
                 column_params.generator,
                 size=column_params.cardinality,
@@ -354,23 +372,28 @@ def rand_dataframe(
             )
         elif dtype == "struct":
             nesting_max_depth = meta["nesting_max_depth"]
+            max_types_at_each_level = meta["max_types_at_each_level"]
+            max_null_frequency = meta["max_null_frequency"]
             nesting_depth = np.random.randint(1, nesting_max_depth)
+            structDtype = create_nested_struct_type(
+                max_types_at_each_level=max_types_at_each_level,
+                nesting_level=nesting_depth,
+            )
 
-            # TODO: Fix me
-            # column_params.append(
-            #     ColumnParameters(
-            #         cardinality=cardinality,
-            #         null_frequency=null_frequency,
-            #         generator=list_generator(
-            #             dtype=value_type,
-            #             size=cardinality,
-            #             nesting_depth=nesting_depth,
-            #             lists_max_length=lists_max_length,
-            #         ),
-            #         is_sorted=False,
-            #         dtype=dtype,
-            #     )
-            # )
+            column_params.append(
+                ColumnParameters(
+                    cardinality=cardinality,
+                    null_frequency=null_frequency,
+                    generator=struct_generator(
+                        dtype=structDtype,
+                        cardinality=cardinality,
+                        size=rows,
+                        max_null_frequency=max_null_frequency,
+                    ),
+                    is_sorted=False,
+                    dtype=structDtype,
+                )
+            )
         elif dtype == "decimal64":
             max_precision = meta.get(
                 "max_precision", cudf.Decimal64Dtype.MAX_PRECISION
@@ -482,7 +505,7 @@ def rand_dataframe(
 
     df = get_dataframe(
         Parameters(num_rows=rows, column_parameters=column_params, seed=seed,),
-        use_threads=use_threads,
+        use_threads=False,
     )
 
     return df
@@ -618,16 +641,30 @@ def make_lists(dtype, lists_max_length, nesting_depth, top_level_list):
     return top_level_list
 
 
-def make_array_for_struct(dtype, size):
+def make_array_for_struct(dtype, cardinality, size, max_null_frequency):
     """
     Helper to create a pa.array with `size` and `dtype`
     for a `StructArray`.
     """
 
+    null_frequency = np.random.uniform(low=0, high=max_null_frequency)
+    local_cardinality = max(np.random.randint(low=0, high=cardinality), 1)
     data = get_values_for_nested_data(
-        dtype=dtype.type.to_pandas_dtype(), size=size
+        dtype=dtype.type.to_pandas_dtype(), size=local_cardinality
+    )
+    vals = np.random.choice(data, size=size)
+
+    return pa.array(
+        vals,
+        mask=np.random.choice(
+            [True, False], size=size, p=[null_frequency, 1 - null_frequency],
+        )
+        if null_frequency > 0.0
+        else None,
+        size=size,
+        safe=False,
+        type=dtype.type,
     )
-    return pa.array(data, type=dtype.type)
 
 
 def get_nested_lists(dtype, size, nesting_depth, lists_max_length):
@@ -650,7 +687,7 @@ def get_nested_lists(dtype, size, nesting_depth, lists_max_length):
     return list_of_lists
 
 
-def get_nested_structs(dtype, size):
+def get_nested_structs(dtype, cardinality, size, max_null_frequency):
     """
     Returns a list of arrays with random data
     corresponding to the dtype provided.
@@ -659,9 +696,21 @@ def get_nested_structs(dtype, size):
     list_of_arrays = []
 
     for name, col_dtype in dtype.fields.items():
-        list_of_arrays.append(
-            make_array_for_struct(dtype=dtype._typ[name], size=size)
-        )
+        if isinstance(col_dtype, cudf.StructDtype):
+            result_arrays = get_nested_structs(
+                col_dtype, cardinality, size, max_null_frequency
+            )
+            result_arrays = pa.StructArray.from_arrays(
+                result_arrays, names=col_dtype.fields.keys()
+            )
+        else:
+            result_arrays = make_array_for_struct(
+                dtype=dtype._typ[name],
+                cardinality=cardinality,
+                size=size,
+                max_null_frequency=max_null_frequency,
+            )
+        list_of_arrays.append(result_arrays)
 
     return list_of_arrays
 
@@ -678,8 +727,30 @@ def list_generator(dtype, size, nesting_depth, lists_max_length):
     )
 
 
-def struct_generator(dtype, size):
+def struct_generator(dtype, cardinality, size, max_null_frequency):
     """
     Generator for struct data
     """
-    return lambda: get_nested_structs(dtype=dtype, size=size,)
+    return lambda: get_nested_structs(
+        dtype=dtype,
+        cardinality=cardinality,
+        size=size,
+        max_null_frequency=max_null_frequency,
+    )
+
+
+def create_nested_struct_type(max_types_at_each_level, nesting_level):
+    dtypes_list = cudf.utils.dtypes.ALL_TYPES - {
+        "category",
+        "datetime64[ns]",
+    } - cudf.utils.dtypes.TIMEDELTA_TYPES - {"uint32"} | {"struct"}
+    picked_types = np.random.choice(list(dtypes_list), max_types_at_each_level)
+    type_dict = {}
+    for name, type_ in enumerate(picked_types):
+        if type_ == "struct":
+            type_dict[str(name)] = create_nested_struct_type(
+                max_types_at_each_level, nesting_level - 1
+            )
+        else:
+            type_dict[str(name)] = cudf.dtype(type_)
+    return cudf.StructDtype(type_dict)

From 7b94eca9f6d9b89386cf396b06439cb92fdaa4f6 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Tue, 5 Oct 2021 18:06:57 -0700
Subject: [PATCH 3/7] add struct support

---
 python/cudf/cudf/_fuzz_testing/orc.py         |  7 +-
 .../cudf/_fuzz_testing/tests/fuzz_test_orc.py |  9 ++-
 python/cudf/cudf/_fuzz_testing/utils.py       | 72 +++++++++++++++----
 python/cudf/cudf/core/column/struct.py        | 12 ++++
 python/cudf/cudf/testing/dataset_generator.py |  7 +-
 python/cudf/cudf/tests/test_orc.py            |  2 +-
 6 files changed, 91 insertions(+), 18 deletions(-)

diff --git a/python/cudf/cudf/_fuzz_testing/orc.py b/python/cudf/cudf/_fuzz_testing/orc.py
index 2aa01eb3967..d9b8173fcb0 100644
--- a/python/cudf/cudf/_fuzz_testing/orc.py
+++ b/python/cudf/cudf/_fuzz_testing/orc.py
@@ -63,7 +63,7 @@ def generate_input(self):
                 - cudf.utils.dtypes.UNSIGNED_TYPES
                 - {"datetime64[ns]"}
             )
-
+            dtypes_list = ["struct"]
             dtypes_meta, num_rows, num_cols = _generate_rand_meta(
                 self, dtypes_list
             )
@@ -83,7 +83,10 @@ def generate_input(self):
         self._df = df
         file_obj = io.BytesIO()
         pandas_to_orc(
-            df, file_io_obj=file_obj, stripe_size=self._rand(len(df))
+            df,
+            file_io_obj=file_obj,
+            stripe_size=self._rand(len(df)),
+            arrow_table_schema=table.schema,
         )
         file_obj.seek(0)
         buf = file_obj.read()
diff --git a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py
index b3fd7e8c5a7..ad2943512fb 100644
--- a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py
+++ b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py
@@ -18,7 +18,7 @@
     data_handle=OrcReader,
     params={
         "columns": ALL_POSSIBLE_VALUES,
-        "skiprows": ALL_POSSIBLE_VALUES,
+        "skiprows": [None],
         "num_rows": ALL_POSSIBLE_VALUES,
         "use_index": ALL_POSSIBLE_VALUES,
     },
@@ -44,8 +44,13 @@ def orc_reader_test(input_tuple, columns, skiprows, num_rows, use_index):
         num_rows=num_rows,
         use_index=use_index,
     )
+    try:
+        compare_dataframe(expected_pdf, gdf)
+    except AssertionError:
+        import pdb
 
-    compare_dataframe(expected_pdf, gdf)
+        pdb.set_trace()
+        print("abc")
 
 
 @pythonfuzz(
diff --git a/python/cudf/cudf/_fuzz_testing/utils.py b/python/cudf/cudf/_fuzz_testing/utils.py
index 9949d1068c7..1151919f0fd 100644
--- a/python/cudf/cudf/_fuzz_testing/utils.py
+++ b/python/cudf/cudf/_fuzz_testing/utils.py
@@ -6,6 +6,7 @@
 import fastavro
 import numpy as np
 import pandas as pd
+import pyarrow as pa
 import pyorc
 
 import cudf
@@ -179,6 +180,8 @@ def pyarrow_to_pandas(table):
             df[column._name] = pd.Series(
                 column, dtype=pyarrow_dtypes_to_pandas_dtypes[column.type]
             )
+        elif isinstance(column.type, pa.StructType):
+            df[column._name] = column.to_pandas(integer_object_nulls=True)
         else:
             df[column._name] = column.to_pandas()
 
@@ -208,12 +211,21 @@ def get_orc_dtype_info(dtype):
     if dtype in PANDAS_TO_ORC_TYPES:
         return PANDAS_TO_ORC_TYPES[dtype]
     else:
+        # import pdb;pdb.set_trace()
         raise TypeError(
             f"Unsupported dtype({dtype}) according to orc spec:"
             f" https://orc.apache.org/specification/"
         )
 
 
+def get_arrow_dtype_info_for_pyorc(dtype):
+    if isinstance(dtype, pa.StructType):
+        return get_orc_schema(df=None, arrow_table_schema=dtype)
+    else:
+        pd_dtype = cudf.dtype(dtype.to_pandas_dtype())
+        return get_orc_dtype_info(pd_dtype)
+
+
 def get_avro_schema(df):
     fields = [
         {"name": col_name, "type": get_avro_dtype_info(col_dtype)}
@@ -223,11 +235,17 @@ def get_avro_schema(df):
     return schema
 
 
-def get_orc_schema(df):
-    ordered_dict = OrderedDict(
-        (col_name, get_orc_dtype_info(col_dtype))
-        for col_name, col_dtype in df.dtypes.items()
-    )
+def get_orc_schema(df, arrow_table_schema=None):
+    if arrow_table_schema is None:
+        ordered_dict = OrderedDict(
+            (col_name, get_orc_dtype_info(col_dtype))
+            for col_name, col_dtype in df.dtypes.items()
+        )
+    else:
+        ordered_dict = OrderedDict(
+            (field.name, get_arrow_dtype_info_for_pyorc(field.type))
+            for field in arrow_table_schema
+        )
 
     schema = pyorc.Struct(**ordered_dict)
     return schema
@@ -273,13 +291,25 @@ def pandas_to_avro(df, file_name=None, file_io_obj=None):
         fastavro.writer(file_io_obj, avro_schema, records)
 
 
-def _preprocess_to_orc_tuple(df):
+def _preprocess_to_orc_tuple(df, arrow_table_schema):
     def _null_to_None(value):
         if value is pd.NA or value is pd.NaT:
             return None
         else:
             return value
 
+    def sanitize(value, struct_type):
+        if value is None:
+            return None
+        # import pdb;pdb.set_trace()
+        values_list = []
+        for name, sub_type in struct_type.fields.items():
+            if isinstance(sub_type, cudf.StructDtype):
+                values_list.append(sanitize(value[name], sub_type))
+            else:
+                values_list.append(value[name])
+        return tuple(values_list)
+
     has_nulls_or_nullable_dtype = any(
         [
             True
@@ -289,20 +319,38 @@ def _null_to_None(value):
             for col in df.columns
         ]
     )
+    pdf = df.copy(deep=True)
+    for field in arrow_table_schema:
+        if isinstance(field.type, pa.StructType):
+            # import pdb;pdb.set_trace()
+            pdf[field.name] = pdf[field.name].apply(
+                sanitize, args=(cudf.StructDtype.from_arrow(field.type),)
+            )
+        else:
+            pdf[field.name] = pdf[field.name]
 
     tuple_list = [
         tuple(map(_null_to_None, tup)) if has_nulls_or_nullable_dtype else tup
-        for tup in df.itertuples(index=False, name=None)
+        for tup in pdf.itertuples(index=False, name=None)
     ]
 
-    return tuple_list
-
+    return tuple_list, pdf, df
 
-def pandas_to_orc(df, file_name=None, file_io_obj=None, stripe_size=67108864):
-    schema = get_orc_schema(df)
 
-    tuple_list = _preprocess_to_orc_tuple(df)
+def pandas_to_orc(
+    df,
+    file_name=None,
+    file_io_obj=None,
+    stripe_size=67108864,
+    arrow_table_schema=None,
+):
+    # import pdb;pdb.set_trace()
+    schema = get_orc_schema(df, arrow_table_schema=arrow_table_schema)
 
+    tuple_list, pdf, df = _preprocess_to_orc_tuple(
+        df, arrow_table_schema=arrow_table_schema
+    )
+    # import pdb;pdb.set_trace()
     if file_name is not None:
         with open(file_name, "wb") as data:
             with pyorc.Writer(data, schema, stripe_size=stripe_size) as writer:
diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py
index 7167918d14d..2680c241b6a 100644
--- a/python/cudf/cudf/core/column/struct.py
+++ b/python/cudf/cudf/core/column/struct.py
@@ -1,6 +1,7 @@
 # Copyright (c) 2020, NVIDIA CORPORATION.
 from __future__ import annotations
 
+import pandas as pd
 import pyarrow as pa
 
 import cudf
@@ -80,6 +81,17 @@ def to_arrow(self):
             pa_type, len(self), buffers, children=children
         )
 
+    def to_pandas(self, index: pd.Index = None, **kwargs) -> "pd.Series":
+        nullable = kwargs.get("nullable", False)
+        if nullable:
+            kwargs["integer_object_nulls"] = True
+        # pd_series = pd.Series(self.to_arrow().tolist())
+        pd_series = self.to_arrow().to_pandas(kwargs)
+
+        if index is not None:
+            pd_series.index = index
+        return pd_series
+
     def __getitem__(self, args):
         result = super().__getitem__(args)
         if isinstance(result, dict):
diff --git a/python/cudf/cudf/testing/dataset_generator.py b/python/cudf/cudf/testing/dataset_generator.py
index bed0c554b90..c14694b8a86 100644
--- a/python/cudf/cudf/testing/dataset_generator.py
+++ b/python/cudf/cudf/testing/dataset_generator.py
@@ -776,7 +776,12 @@ def create_nested_struct_type(max_types_at_each_level, nesting_level):
     dtypes_list = cudf.utils.dtypes.ALL_TYPES - {
         "category",
         "datetime64[ns]",
-    } - cudf.utils.dtypes.TIMEDELTA_TYPES - {"uint32"} | {"struct"}
+        "str",
+    } - cudf.utils.dtypes.TIMEDELTA_TYPES - {
+        "uint32"
+    } - cudf.utils.dtypes.UNSIGNED_TYPES | {
+        "struct"
+    }
     picked_types = np.random.choice(list(dtypes_list), max_types_at_each_level)
     type_dict = {}
     for name, type_ in enumerate(picked_types):
diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py
index 1230b4b35f3..66e04d1d945 100644
--- a/python/cudf/cudf/tests/test_orc.py
+++ b/python/cudf/cudf/tests/test_orc.py
@@ -941,7 +941,7 @@ def generate_list_struct_buff(size=100_000):
             "struct_nests_list": struct_nests_list,
         }
     )
-
+    # import pdb;pdb.set_trace()
     writer = po.Writer(buff, schema, stripe_size=1024)
     tuples = list(
         map(

From 5f3e1f8f006971bd8d5568cafa837432304024df Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Wed, 19 Jan 2022 09:07:22 -0800
Subject: [PATCH 4/7] add struct.to_pandas()

---
 python/cudf/cudf/core/column/struct.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py
index 2680c241b6a..a09f9296135 100644
--- a/python/cudf/cudf/core/column/struct.py
+++ b/python/cudf/cudf/core/column/struct.py
@@ -82,12 +82,7 @@ def to_arrow(self):
         )
 
     def to_pandas(self, index: pd.Index = None, **kwargs) -> "pd.Series":
-        nullable = kwargs.get("nullable", False)
-        if nullable:
-            kwargs["integer_object_nulls"] = True
-        # pd_series = pd.Series(self.to_arrow().tolist())
-        pd_series = self.to_arrow().to_pandas(kwargs)
-
+        pd_series = pd.Series(self.to_arrow().tolist(), dtype="object")
         if index is not None:
             pd_series.index = index
         return pd_series

From 9708092f1618f5c9b6c656617c63b3179ec64ec8 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Wed, 19 Jan 2022 09:11:44 -0800
Subject: [PATCH 5/7] resolve conflicts

---
 python/cudf/cudf/core/column/struct.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py
index a09f9296135..f0d02a706e2 100644
--- a/python/cudf/cudf/core/column/struct.py
+++ b/python/cudf/cudf/core/column/struct.py
@@ -82,7 +82,11 @@ def to_arrow(self):
         )
 
     def to_pandas(self, index: pd.Index = None, **kwargs) -> "pd.Series":
+        # We cannot go via Arrow's `to_pandas` because of the following issue:
+        # https://issues.apache.org/jira/browse/ARROW-12680
+
         pd_series = pd.Series(self.to_arrow().tolist(), dtype="object")
+
         if index is not None:
             pd_series.index = index
         return pd_series

From 3c1447dfff87bda158c74ba1e4cbb3e53f443150 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Wed, 19 Jan 2022 09:16:49 -0800
Subject: [PATCH 6/7] cleanup

---
 python/cudf/cudf/_fuzz_testing/main.py                | 4 ++--
 python/cudf/cudf/_fuzz_testing/orc.py                 | 2 +-
 python/cudf/cudf/_fuzz_testing/parquet.py             | 4 +---
 python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py | 7 +------
 python/cudf/cudf/_fuzz_testing/utils.py               | 7 ++-----
 python/cudf/cudf/testing/dataset_generator.py         | 2 +-
 python/cudf/cudf/tests/test_orc.py                    | 2 +-
 7 files changed, 9 insertions(+), 19 deletions(-)

diff --git a/python/cudf/cudf/_fuzz_testing/main.py b/python/cudf/cudf/_fuzz_testing/main.py
index e7c0c326676..7b28a4c4970 100644
--- a/python/cudf/cudf/_fuzz_testing/main.py
+++ b/python/cudf/cudf/_fuzz_testing/main.py
@@ -13,8 +13,8 @@ def __init__(self, func, params=None, data_handle=None, **kwargs):
             dirs=kwargs.get("dir", None),
             crash_reports_dir=kwargs.get("crash_reports_dir", None),
             regression=kwargs.get("regression", False),
-            max_rows_size=kwargs.get("max_rows_size", 100),
-            max_cols_size=kwargs.get("max_cols_size", 50),
+            max_rows_size=kwargs.get("max_rows_size", 100_000),
+            max_cols_size=kwargs.get("max_cols_size", 1000),
             runs=kwargs.get("runs", -1),
             max_string_length=kwargs.get("max_string_length", None),
             params=params,
diff --git a/python/cudf/cudf/_fuzz_testing/orc.py b/python/cudf/cudf/_fuzz_testing/orc.py
index d9b8173fcb0..78e01fb76a4 100644
--- a/python/cudf/cudf/_fuzz_testing/orc.py
+++ b/python/cudf/cudf/_fuzz_testing/orc.py
@@ -63,7 +63,7 @@ def generate_input(self):
                 - cudf.utils.dtypes.UNSIGNED_TYPES
                 - {"datetime64[ns]"}
             )
-            dtypes_list = ["struct"]
+
             dtypes_meta, num_rows, num_cols = _generate_rand_meta(
                 self, dtypes_list
             )
diff --git a/python/cudf/cudf/_fuzz_testing/parquet.py b/python/cudf/cudf/_fuzz_testing/parquet.py
index ecad18acc81..859d09b407f 100644
--- a/python/cudf/cudf/_fuzz_testing/parquet.py
+++ b/python/cudf/cudf/_fuzz_testing/parquet.py
@@ -59,7 +59,7 @@ def generate_input(self):
                 - {"uint32"}
                 | {"list", "decimal64"}
             )
-            dtypes_list = ["struct"]
+
             dtypes_meta, num_rows, num_cols = _generate_rand_meta(
                 self, dtypes_list
             )
@@ -81,9 +81,7 @@ def generate_input(self):
         # https://issues.apache.org/jira/browse/ARROW-10123
 
         # file = io.BytesIO()
-        import pdb
 
-        pdb.set_trace()
         df.to_parquet("temp_file")
         # file.seek(0)
         # self._current_buffer = copy.copy(file.read())
diff --git a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py
index ad2943512fb..9089a40b89e 100644
--- a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py
+++ b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py
@@ -44,13 +44,8 @@ def orc_reader_test(input_tuple, columns, skiprows, num_rows, use_index):
         num_rows=num_rows,
         use_index=use_index,
     )
-    try:
-        compare_dataframe(expected_pdf, gdf)
-    except AssertionError:
-        import pdb
 
-        pdb.set_trace()
-        print("abc")
+    compare_dataframe(expected_pdf, gdf)
 
 
 @pythonfuzz(
diff --git a/python/cudf/cudf/_fuzz_testing/utils.py b/python/cudf/cudf/_fuzz_testing/utils.py
index 5480781d9c4..87a8fc46374 100644
--- a/python/cudf/cudf/_fuzz_testing/utils.py
+++ b/python/cudf/cudf/_fuzz_testing/utils.py
@@ -213,7 +213,6 @@ def get_orc_dtype_info(dtype):
     if dtype in PANDAS_TO_ORC_TYPES:
         return PANDAS_TO_ORC_TYPES[dtype]
     else:
-        # import pdb;pdb.set_trace()
         raise TypeError(
             f"Unsupported dtype({dtype}) according to orc spec:"
             f" https://orc.apache.org/specification/"
@@ -303,7 +302,7 @@ def _null_to_None(value):
     def sanitize(value, struct_type):
         if value is None:
             return None
-        # import pdb;pdb.set_trace()
+
         values_list = []
         for name, sub_type in struct_type.fields.items():
             if isinstance(sub_type, cudf.StructDtype):
@@ -324,7 +323,9 @@ def sanitize(value, struct_type):
     pdf = df.copy(deep=True)
-    for field in arrow_table_schema:
+    # ``pandas_to_orc`` defaults ``arrow_table_schema`` to None; treat that as
+    # "no struct columns to sanitize" rather than iterating over None.
+    fields = () if arrow_table_schema is None else arrow_table_schema
+    for field in fields:
         if isinstance(field.type, pa.StructType):
-            # import pdb;pdb.set_trace()
             pdf[field.name] = pdf[field.name].apply(
                 sanitize, args=(cudf.StructDtype.from_arrow(field.type),)
             )
@@ -346,13 +344,12 @@ def pandas_to_orc(
     stripe_size=67108864,
     arrow_table_schema=None,
 ):
-    # import pdb;pdb.set_trace()
     schema = get_orc_schema(df, arrow_table_schema=arrow_table_schema)
 
     tuple_list, pdf, df = _preprocess_to_orc_tuple(
         df, arrow_table_schema=arrow_table_schema
     )
-    # import pdb;pdb.set_trace()
+
     if file_name is not None:
         with open(file_name, "wb") as data:
             with pyorc.Writer(data, schema, stripe_size=stripe_size) as writer:
diff --git a/python/cudf/cudf/testing/dataset_generator.py b/python/cudf/cudf/testing/dataset_generator.py
index 2aa92322f19..682c4429c7d 100644
--- a/python/cudf/cudf/testing/dataset_generator.py
+++ b/python/cudf/cudf/testing/dataset_generator.py
@@ -551,7 +551,7 @@ def rand_dataframe(
 
     df = get_dataframe(
         Parameters(num_rows=rows, column_parameters=column_params, seed=seed,),
-        use_threads=False,
+        use_threads=use_threads,
     )
 
     return df
diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py
index 2c297476e91..44812f5aba4 100644
--- a/python/cudf/cudf/tests/test_orc.py
+++ b/python/cudf/cudf/tests/test_orc.py
@@ -945,7 +945,7 @@ def generate_list_struct_buff(size=100_000):
             "struct_nests_list": struct_nests_list,
         }
     )
-    # import pdb;pdb.set_trace()
+
     writer = po.Writer(buff, schema, stripe_size=1024)
     tuples = list(
         map(

From a65d4be131e2cf0f83626579847373216a99effe Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Wed, 19 Jan 2022 09:18:31 -0800
Subject: [PATCH 7/7] cleanup

---
 python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py |  2 +-
 python/cudf/cudf/testing/dataset_generator.py         | 10 +---------
 2 files changed, 2 insertions(+), 10 deletions(-)

diff --git a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py
index 9089a40b89e..b3fd7e8c5a7 100644
--- a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py
+++ b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py
@@ -18,7 +18,7 @@
     data_handle=OrcReader,
     params={
         "columns": ALL_POSSIBLE_VALUES,
-        "skiprows": [None],
+        "skiprows": ALL_POSSIBLE_VALUES,
         "num_rows": ALL_POSSIBLE_VALUES,
         "use_index": ALL_POSSIBLE_VALUES,
     },
diff --git a/python/cudf/cudf/testing/dataset_generator.py b/python/cudf/cudf/testing/dataset_generator.py
index 682c4429c7d..e1c7b42c7a3 100644
--- a/python/cudf/cudf/testing/dataset_generator.py
+++ b/python/cudf/cudf/testing/dataset_generator.py
@@ -805,15 +805,7 @@ def struct_generator(dtype, cardinality, size, max_null_frequency):
 
 
 def create_nested_struct_type(max_types_at_each_level, nesting_level):
-    dtypes_list = cudf.utils.dtypes.ALL_TYPES - {
-        "category",
-        "datetime64[ns]",
-        "str",
-    } - cudf.utils.dtypes.TIMEDELTA_TYPES - {
-        "uint32"
-    } - cudf.utils.dtypes.UNSIGNED_TYPES | {
-        "struct"
-    }
+    dtypes_list = cudf.utils.dtypes.ALL_TYPES
     picked_types = np.random.choice(list(dtypes_list), max_types_at_each_level)
     type_dict = {}
     for name, type_ in enumerate(picked_types):