From 147f57b78c659e09f9c3f68eb36865a1579e0b0b Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Tue, 9 Feb 2021 12:09:31 -0800
Subject: [PATCH 1/5] updating orc fuzz tests

---
 python/cudf/cudf/_fuzz_testing/orc.py         | 35 +++++++++++++++----
 .../cudf/_fuzz_testing/tests/fuzz_test_orc.py | 16 ++++-----
 python/cudf/cudf/_fuzz_testing/utils.py       | 23 ++++++++++--
 3 files changed, 56 insertions(+), 18 deletions(-)

diff --git a/python/cudf/cudf/_fuzz_testing/orc.py b/python/cudf/cudf/_fuzz_testing/orc.py
index 1f6f1c2507e..97a2b364233 100644
--- a/python/cudf/cudf/_fuzz_testing/orc.py
+++ b/python/cudf/cudf/_fuzz_testing/orc.py
@@ -67,6 +67,19 @@ def generate_input(self):
             dtypes_meta, num_rows, num_cols = _generate_rand_meta(
                 self, dtypes_list
             )
+            if num_cols == 0:
+                """
+                If a dataframe has no columns, then pyorc writer will throw
+                the following error:
+                ValueError: Struct type must contain at least one sub type.
+                Hence this is a work-around to skip generating an empty
+                dataframe.
+                """
+                while num_cols == 0:
+                    dtypes_meta, num_rows, num_cols = _generate_rand_meta(
+                        self, dtypes_list
+                    )
+
             self._current_params["dtypes_meta"] = dtypes_meta
             seed = random.randint(0, 2 ** 32 - 1)
             self._current_params["seed"] = seed
@@ -106,7 +119,6 @@ def set_rand_params(self, params):
                 elif param == "stripes":
                     f = io.BytesIO(self._current_buffer)
                     reader = pyorc.Reader(f)
-                    print("READ: ", reader.num_of_stripes)
                     stripes = [i for i in range(reader.num_of_stripes)]
                     params_dict[param] = np.random.choice(
                         [
@@ -125,10 +137,10 @@ def set_rand_params(self, params):
                     )
                 elif param == "use_index":
                     params_dict[param] = np.random.choice([True, False])
-            elif param in ("skiprows", "num_rows"):
-                params_dict[param] = np.random.choice(
-                    [None, self._rand(len(self._df))]
-                )
+                elif param in ("skiprows", "num_rows"):
+                    params_dict[param] = np.random.choice(
+                        [None, self._rand(len(self._df))]
+                    )
             else:
                 if not isinstance(values, list):
                     raise TypeError("values must be of type list")
@@ -143,12 +155,16 @@ def __init__(
         max_rows=100_000,
         max_columns=1000,
         max_string_length=None,
+        max_lists_length=None,
+        max_lists_nesting_depth=None,
     ):
         super().__init__(
             dirs=dirs,
             max_rows=max_rows,
             max_columns=max_columns,
             max_string_length=max_string_length,
+            max_lists_length=None,
+            max_lists_nesting_depth=None,
         )
         self._df = None
 
@@ -163,11 +179,18 @@ def generate_input(self):
         else:
             dtypes_list = list(
                 cudf.utils.dtypes.ALL_TYPES
-                - {"category"}
+                # TODO: Remove "bool" from below
+                # list after following issue is fixed:
+                # https://github.com/rapidsai/cudf/issues/6763
+                - {"category", "bool"}
                 # Following dtypes are not supported by orc
                 # https://orc.apache.org/specification/ORCv0/
                 - cudf.utils.dtypes.TIMEDELTA_TYPES
                 - cudf.utils.dtypes.UNSIGNED_TYPES
+                # TODO: Remove `DATETIME_TYPES` once
+                # following bug is fixed:
+                # https://github.com/rapidsai/cudf/issues/7355
+                - cudf.utils.dtypes.DATETIME_TYPES
             )
 
             dtypes_meta, num_rows, num_cols = _generate_rand_meta(
diff --git a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py
index ed7e5b078c6..ff1fbcfe635 100644
--- a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py
+++ b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py
@@ -12,7 +12,6 @@
     orc_to_pandas,
     run_test,
 )
-from cudf.tests.utils import assert_eq
 
 
 @pythonfuzz(
@@ -24,19 +23,14 @@
         "use_index": ALL_POSSIBLE_VALUES,
     },
 )
-def orc_reader_test(input_tuple, skiprows, columns, num_rows, use_index):
-    # TODO: Remove skiprows=0 after
-    # following issue is fixed:
-    # https://github.com/rapidsai/cudf/issues/6563
-    skiprows = 0
-
+def orc_reader_test(input_tuple, columns, skiprows, num_rows, use_index):
     pdf, file_buffer = input_tuple
     expected_pdf = pdf.iloc[skiprows:]
     if num_rows is not None:
         expected_pdf = expected_pdf.head(num_rows)
     if skiprows is not None or num_rows is not None:
         expected_pdf.reset_index(drop=True, inplace=True)
-    if columns is not None:
+    if columns is not None and len(columns) > 0:
         expected_pdf = expected_pdf[columns]
     if use_index is False:
         expected_pdf.reset_index(drop=True, inplace=True)
@@ -48,6 +42,7 @@ def orc_reader_test(input_tuple, skiprows, columns, num_rows, use_index):
         num_rows=num_rows,
         use_index=use_index,
     )
+
     compare_dataframe(expected_pdf, gdf)
 
 
@@ -61,14 +56,14 @@ def orc_reader_stripes_test(input_tuple, columns, stripes):
         file_io_obj=io.BytesIO(file_buffer), stripes=stripes
     )
 
-    if columns is not None:
+    if columns is not None and len(columns) > 0:
         expected_pdf = expected_pdf[columns]
 
     gdf = cudf.read_orc(
         io.BytesIO(file_buffer), columns=columns, stripes=stripes
     )
 
-    assert_eq(expected_pdf, gdf, check_dtype=False)
+    compare_dataframe(expected_pdf, gdf)
 
 
 @pythonfuzz(
@@ -91,6 +86,7 @@ def orc_writer_test(pdf, compression, enable_statistics):
     file_to_strore.seek(0)
 
     actual_df = cudf.read_orc(file_to_strore)
+
     compare_dataframe(pdf, actual_df)
 
 
diff --git a/python/cudf/cudf/_fuzz_testing/utils.py b/python/cudf/cudf/_fuzz_testing/utils.py
index fd96da3cf98..11816b2687d 100644
--- a/python/cudf/cudf/_fuzz_testing/utils.py
+++ b/python/cudf/cudf/_fuzz_testing/utils.py
@@ -58,6 +58,18 @@
     np.dtype("<M8[us]"): pyorc.Timestamp(),
 }
 
+ORC_TO_PANDAS_TYPES = {
+    pyorc.TinyInt().name: pd.Int8Dtype(),
+    pyorc.Int().name: pd.Int32Dtype(),
+    pyorc.Boolean().name: pd.BooleanDtype(),
+    pyorc.SmallInt().name: pd.Int16Dtype(),
+    pyorc.BigInt().name: pd.Int64Dtype(),
+    pyorc.String().name: np.dtype("O"),
+    pyorc.Float().name: np.dtype("float32"),
+    pyorc.Double().name: np.dtype("float64"),
+    pyorc.Timestamp().name: np.dtype("<M8[ns]"),
+}
+
 
 def _generate_rand_meta(obj, dtypes_list, null_frequency_override=None):
     obj._current_params = {}
@@ -73,7 +85,7 @@ def _generate_rand_meta(obj, dtypes_list, null_frequency_override=None):
             if null_frequency_override is None
             else null_frequency_override
         )
-        cardinality = obj._rand(obj._max_rows)
+        cardinality = max(1, obj._rand(obj._max_rows))
         meta = dict()
         if dtype == "str":
             # We want to operate near the limits of string column
@@ -190,7 +202,8 @@ def get_avro_schema(df):
 
 def get_orc_schema(df):
     ordered_dict = OrderedDict(
-        (col_name, col_dtype) for col_name, col_dtype in df.dtypes.items()
+        (col_name, get_orc_dtype_info(col_dtype))
+        for col_name, col_dtype in df.dtypes.items()
     )
 
     schema = pyorc.Struct(**ordered_dict)
@@ -288,6 +301,11 @@ def orc_to_pandas(file_name=None, file_io_obj=None, stripes=None):
 
     reader = pyorc.Reader(f)
 
+    dtypes = {
+        col: ORC_TO_PANDAS_TYPES[pyorc_type.name]
+        for col, pyorc_type in reader.schema.fields.items()
+    }
+
     if stripes is None:
         df = pd.DataFrame.from_records(
             reader, columns=reader.schema.fields.keys()
@@ -299,6 +317,7 @@ def orc_to_pandas(file_name=None, file_io_obj=None, stripes=None):
         df = pd.DataFrame.from_records(
             records, columns=reader.schema.fields.keys()
         )
+    df = df.astype(dtypes)
 
     return df
 

From 537e3aa7072478e9a6ebbfe7cfb8aa3011eb665b Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Tue, 9 Feb 2021 14:59:08 -0800
Subject: [PATCH 2/5] fix empty struct writing error

---
 python/cudf/cudf/_fuzz_testing/orc.py   | 12 ------------
 python/cudf/cudf/_fuzz_testing/utils.py |  6 ++----
 2 files changed, 2 insertions(+), 16 deletions(-)

diff --git a/python/cudf/cudf/_fuzz_testing/orc.py b/python/cudf/cudf/_fuzz_testing/orc.py
index 97a2b364233..560de4a04fa 100644
--- a/python/cudf/cudf/_fuzz_testing/orc.py
+++ b/python/cudf/cudf/_fuzz_testing/orc.py
@@ -67,18 +67,6 @@ def generate_input(self):
             dtypes_meta, num_rows, num_cols = _generate_rand_meta(
                 self, dtypes_list
             )
-            if num_cols == 0:
-                """
-                If a dataframe has no columns, then pyorc writer will throw
-                the following error:
-                ValueError: Struct type must contain at least one sub type.
-                Hence this is a work-around to skip generating an empty
-                dataframe.
-                """
-                while num_cols == 0:
-                    dtypes_meta, num_rows, num_cols = _generate_rand_meta(
-                        self, dtypes_list
-                    )
 
             self._current_params["dtypes_meta"] = dtypes_meta
             seed = random.randint(0, 2 ** 32 - 1)
diff --git a/python/cudf/cudf/_fuzz_testing/utils.py b/python/cudf/cudf/_fuzz_testing/utils.py
index 11816b2687d..9314e9d1c57 100644
--- a/python/cudf/cudf/_fuzz_testing/utils.py
+++ b/python/cudf/cudf/_fuzz_testing/utils.py
@@ -282,13 +282,11 @@ def pandas_to_orc(df, file_name=None, file_io_obj=None, stripe_size=67108864):
 
     if file_name is not None:
         with open(file_name, "wb") as data:
-            with pyorc.Writer(
-                data, str(schema), stripe_size=stripe_size
-            ) as writer:
+            with pyorc.Writer(data, schema, stripe_size=stripe_size) as writer:
                 writer.writerows(tuple_list)
     elif file_io_obj is not None:
         with pyorc.Writer(
-            file_io_obj, str(schema), stripe_size=stripe_size
+            file_io_obj, schema, stripe_size=stripe_size
         ) as writer:
             writer.writerows(tuple_list)
 

From 794f421ecae95c1f4f9798ecfb660373464e6353 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Tue, 9 Feb 2021 15:10:33 -0800
Subject: [PATCH 3/5] add comments

---
 python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py | 4 ++++
 python/cudf/cudf/_fuzz_testing/utils.py               | 4 ++++
 2 files changed, 8 insertions(+)

diff --git a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py
index ff1fbcfe635..b304f2595f2 100644
--- a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py
+++ b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py
@@ -31,6 +31,8 @@ def orc_reader_test(input_tuple, columns, skiprows, num_rows, use_index):
     if skiprows is not None or num_rows is not None:
         expected_pdf.reset_index(drop=True, inplace=True)
     if columns is not None and len(columns) > 0:
+        # ORC reader picks columns if only
+        # there are any elements in `columns` only
         expected_pdf = expected_pdf[columns]
     if use_index is False:
         expected_pdf.reset_index(drop=True, inplace=True)
@@ -57,6 +59,8 @@ def orc_reader_stripes_test(input_tuple, columns, stripes):
     )
 
     if columns is not None and len(columns) > 0:
+        # ORC reader picks columns if only
+        # there are any elements in `columns` only
         expected_pdf = expected_pdf[columns]
 
     gdf = cudf.read_orc(
diff --git a/python/cudf/cudf/_fuzz_testing/utils.py b/python/cudf/cudf/_fuzz_testing/utils.py
index 9314e9d1c57..4e7a6e1dabf 100644
--- a/python/cudf/cudf/_fuzz_testing/utils.py
+++ b/python/cudf/cudf/_fuzz_testing/utils.py
@@ -85,6 +85,7 @@ def _generate_rand_meta(obj, dtypes_list, null_frequency_override=None):
             if null_frequency_override is None
             else null_frequency_override
         )
+        # `cardinality` has to be atleast 1.
         cardinality = max(1, obj._rand(obj._max_rows))
         meta = dict()
         if dtype == "str":
@@ -315,6 +316,9 @@ def orc_to_pandas(file_name=None, file_io_obj=None, stripes=None):
         df = pd.DataFrame.from_records(
             records, columns=reader.schema.fields.keys()
         )
+
+    # Cast to the `dtypes` extracted from the pyorc schema because pandas
+    # can infer an incorrect dtype for fully empty or all-<NA> data.
     df = df.astype(dtypes)
 
     return df

From c567808b0695ba9a21c60e3e78b96ff2532cc22b Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Wed, 10 Feb 2021 08:31:55 -0600
Subject: [PATCH 4/5] Apply suggestions from code review

Co-authored-by: Ram (Ramakrishna Prabhu) <42624703+rgsl888prabhu@users.noreply.github.com>
---
 python/cudf/cudf/_fuzz_testing/orc.py                 | 4 ++--
 python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/python/cudf/cudf/_fuzz_testing/orc.py b/python/cudf/cudf/_fuzz_testing/orc.py
index 560de4a04fa..607294a49c9 100644
--- a/python/cudf/cudf/_fuzz_testing/orc.py
+++ b/python/cudf/cudf/_fuzz_testing/orc.py
@@ -151,8 +151,8 @@ def __init__(
             max_rows=max_rows,
             max_columns=max_columns,
             max_string_length=max_string_length,
-            max_lists_length=None,
-            max_lists_nesting_depth=None,
+            max_lists_length=max_lists_length,
+            max_lists_nesting_depth=max_lists_nesting_depth,
         )
         self._df = None
 
diff --git a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py
index b304f2595f2..b3fd7e8c5a7 100644
--- a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py
+++ b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py
@@ -32,7 +32,7 @@ def orc_reader_test(input_tuple, columns, skiprows, num_rows, use_index):
         expected_pdf.reset_index(drop=True, inplace=True)
     if columns is not None and len(columns) > 0:
         # ORC reader picks columns if only
-        # there are any elements in `columns` only
+        # there are any elements in `columns`
         expected_pdf = expected_pdf[columns]
     if use_index is False:
         expected_pdf.reset_index(drop=True, inplace=True)
@@ -60,7 +60,7 @@ def orc_reader_stripes_test(input_tuple, columns, stripes):
 
     if columns is not None and len(columns) > 0:
         # ORC reader picks columns if only
-        # there are any elements in `columns` only
+        # there are any elements in `columns`
         expected_pdf = expected_pdf[columns]
 
     gdf = cudf.read_orc(

From b5f81652ac1cb9b17b807d58c2d4d85528802800 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Thu, 11 Feb 2021 12:31:17 -0600
Subject: [PATCH 5/5] Update python/cudf/cudf/_fuzz_testing/utils.py

Co-authored-by: Vukasin Milovanovic <vukasin.milovanovic.87@gmail.com>
---
 python/cudf/cudf/_fuzz_testing/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/_fuzz_testing/utils.py b/python/cudf/cudf/_fuzz_testing/utils.py
index 4e7a6e1dabf..efcbd8ca792 100644
--- a/python/cudf/cudf/_fuzz_testing/utils.py
+++ b/python/cudf/cudf/_fuzz_testing/utils.py
@@ -85,7 +85,7 @@ def _generate_rand_meta(obj, dtypes_list, null_frequency_override=None):
             if null_frequency_override is None
             else null_frequency_override
         )
-        # `cardinality` has to be atleast 1.
+        # `cardinality` has to be at least 1.
         cardinality = max(1, obj._rand(obj._max_rows))
         meta = dict()
         if dtype == "str":