Update orc reader and writer fuzz tests (#7357)

This PR introduces:

- [x] Fixes for some of the breakages introduced by the latest `pyorc` when using `pyorc.Struct`.
- [x] Adaptation to the `list` dtype parameter changes introduced previously.
- [x] Miscellaneous fixes required for the fuzz tests to run properly.

Authors:

- GALI PREM SAGAR (@galipremsagar)

Approvers:

- Vukasin Milovanovic (@vuule)
- Ram (Ramakrishna Prabhu) (@rgsl888prabhu)

URL: #7357
rapidsai · Feb 11, 2021 · ebe307e · ebe307e
1 parent 21d2ce6
commit ebe307e
Show file tree

Hide file tree

Showing 3 changed files with 54 additions and 22 deletions.
diff --git a/python/cudf/cudf/_fuzz_testing/orc.py b/python/cudf/cudf/_fuzz_testing/orc.py
@@ -67,6 +67,7 @@ def generate_input(self):
             dtypes_meta, num_rows, num_cols = _generate_rand_meta(
                 self, dtypes_list
             )
+
             self._current_params["dtypes_meta"] = dtypes_meta
             seed = random.randint(0, 2 ** 32 - 1)
             self._current_params["seed"] = seed
@@ -106,7 +107,6 @@ def set_rand_params(self, params):
                 elif param == "stripes":
                     f = io.BytesIO(self._current_buffer)
                     reader = pyorc.Reader(f)
-                    print("READ: ", reader.num_of_stripes)
                     stripes = [i for i in range(reader.num_of_stripes)]
                     params_dict[param] = np.random.choice(
                         [
@@ -125,10 +125,10 @@ def set_rand_params(self, params):
                     )
                 elif param == "use_index":
                     params_dict[param] = np.random.choice([True, False])
-            elif param in ("skiprows", "num_rows"):
-                params_dict[param] = np.random.choice(
-                    [None, self._rand(len(self._df))]
-                )
+                elif param in ("skiprows", "num_rows"):
+                    params_dict[param] = np.random.choice(
+                        [None, self._rand(len(self._df))]
+                    )
             else:
                 if not isinstance(values, list):
                     raise TypeError("values must be of type list")
@@ -143,12 +143,16 @@ def __init__(
         max_rows=100_000,
         max_columns=1000,
         max_string_length=None,
+        max_lists_length=None,
+        max_lists_nesting_depth=None,
     ):
         super().__init__(
             dirs=dirs,
             max_rows=max_rows,
             max_columns=max_columns,
             max_string_length=max_string_length,
+            max_lists_length=max_lists_length,
+            max_lists_nesting_depth=max_lists_nesting_depth,
         )
         self._df = None
 
@@ -163,11 +167,18 @@ def generate_input(self):
         else:
             dtypes_list = list(
                 cudf.utils.dtypes.ALL_TYPES
-                - {"category"}
+                # TODO: Remove "bool" from below
+                # list after following issue is fixed:
+                # https://github.com/rapidsai/cudf/issues/6763
+                - {"category", "bool"}
                 # Following dtypes are not supported by orc
                 # https://orc.apache.org/specification/ORCv0/
                 - cudf.utils.dtypes.TIMEDELTA_TYPES
                 - cudf.utils.dtypes.UNSIGNED_TYPES
+                # TODO: Remove `DATETIME_TYPES` once
+                # following bug is fixed:
+                # https://github.com/rapidsai/cudf/issues/7355
+                - cudf.utils.dtypes.DATETIME_TYPES
             )
 
             dtypes_meta, num_rows, num_cols = _generate_rand_meta(

diff --git a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py
@@ -12,7 +12,6 @@
     orc_to_pandas,
     run_test,
 )
-from cudf.tests.utils import assert_eq
 
 
 @pythonfuzz(
@@ -24,19 +23,16 @@
         "use_index": ALL_POSSIBLE_VALUES,
     },
 )
-def orc_reader_test(input_tuple, skiprows, columns, num_rows, use_index):
-    # TODO: Remove skiprows=0 after
-    # following issue is fixed:
-    # https://github.com/rapidsai/cudf/issues/6563
-    skiprows = 0
-
+def orc_reader_test(input_tuple, columns, skiprows, num_rows, use_index):
     pdf, file_buffer = input_tuple
     expected_pdf = pdf.iloc[skiprows:]
     if num_rows is not None:
         expected_pdf = expected_pdf.head(num_rows)
     if skiprows is not None or num_rows is not None:
         expected_pdf.reset_index(drop=True, inplace=True)
-    if columns is not None:
+    if columns is not None and len(columns) > 0:
+        # ORC reader picks columns if only
+        # there are any elements in `columns`
         expected_pdf = expected_pdf[columns]
     if use_index is False:
         expected_pdf.reset_index(drop=True, inplace=True)
@@ -48,6 +44,7 @@ def orc_reader_test(input_tuple, skiprows, columns, num_rows, use_index):
         num_rows=num_rows,
         use_index=use_index,
     )
+
     compare_dataframe(expected_pdf, gdf)
 
 
@@ -61,14 +58,16 @@ def orc_reader_stripes_test(input_tuple, columns, stripes):
         file_io_obj=io.BytesIO(file_buffer), stripes=stripes
     )
 
-    if columns is not None:
+    if columns is not None and len(columns) > 0:
+        # ORC reader picks columns if only
+        # there are any elements in `columns`
         expected_pdf = expected_pdf[columns]
 
     gdf = cudf.read_orc(
         io.BytesIO(file_buffer), columns=columns, stripes=stripes
     )
 
-    assert_eq(expected_pdf, gdf, check_dtype=False)
+    compare_dataframe(expected_pdf, gdf)
 
 
 @pythonfuzz(
@@ -91,6 +90,7 @@ def orc_writer_test(pdf, compression, enable_statistics):
     file_to_strore.seek(0)
 
     actual_df = cudf.read_orc(file_to_strore)
+
     compare_dataframe(pdf, actual_df)
 
 

diff --git a/python/cudf/cudf/_fuzz_testing/utils.py b/python/cudf/cudf/_fuzz_testing/utils.py
@@ -58,6 +58,18 @@
     np.dtype("<M8[us]"): pyorc.Timestamp(),
 }
 
+ORC_TO_PANDAS_TYPES = {
+    pyorc.TinyInt().name: pd.Int8Dtype(),
+    pyorc.Int().name: pd.Int32Dtype(),
+    pyorc.Boolean().name: pd.BooleanDtype(),
+    pyorc.SmallInt().name: pd.Int16Dtype(),
+    pyorc.BigInt().name: pd.Int64Dtype(),
+    pyorc.String().name: np.dtype("O"),
+    pyorc.Float().name: np.dtype("float32"),
+    pyorc.Double().name: np.dtype("float64"),
+    pyorc.Timestamp().name: np.dtype("<M8[ns]"),
+}
+
 
 def _generate_rand_meta(obj, dtypes_list, null_frequency_override=None):
     obj._current_params = {}
@@ -73,7 +85,8 @@ def _generate_rand_meta(obj, dtypes_list, null_frequency_override=None):
             if null_frequency_override is None
             else null_frequency_override
         )
-        cardinality = obj._rand(obj._max_rows)
+        # `cardinality` has to be at least 1.
+        cardinality = max(1, obj._rand(obj._max_rows))
         meta = dict()
         if dtype == "str":
             # We want to operate near the limits of string column
@@ -190,7 +203,8 @@ def get_avro_schema(df):
 
 def get_orc_schema(df):
     ordered_dict = OrderedDict(
-        (col_name, col_dtype) for col_name, col_dtype in df.dtypes.items()
+        (col_name, get_orc_dtype_info(col_dtype))
+        for col_name, col_dtype in df.dtypes.items()
     )
 
     schema = pyorc.Struct(**ordered_dict)
@@ -269,13 +283,11 @@ def pandas_to_orc(df, file_name=None, file_io_obj=None, stripe_size=67108864):
 
     if file_name is not None:
         with open(file_name, "wb") as data:
-            with pyorc.Writer(
-                data, str(schema), stripe_size=stripe_size
-            ) as writer:
+            with pyorc.Writer(data, schema, stripe_size=stripe_size) as writer:
                 writer.writerows(tuple_list)
     elif file_io_obj is not None:
         with pyorc.Writer(
-            file_io_obj, str(schema), stripe_size=stripe_size
+            file_io_obj, schema, stripe_size=stripe_size
         ) as writer:
             writer.writerows(tuple_list)
 
@@ -288,6 +300,11 @@ def orc_to_pandas(file_name=None, file_io_obj=None, stripes=None):
 
     reader = pyorc.Reader(f)
 
+    dtypes = {
+        col: ORC_TO_PANDAS_TYPES[pyorc_type.name]
+        for col, pyorc_type in reader.schema.fields.items()
+    }
+
     if stripes is None:
         df = pd.DataFrame.from_records(
             reader, columns=reader.schema.fields.keys()
@@ -300,6 +317,10 @@ def orc_to_pandas(file_name=None, file_io_obj=None, stripes=None):
             records, columns=reader.schema.fields.keys()
         )
 
+    # Need to type-cast to extracted `dtypes` from pyorc schema because
+    # a fully empty/ full <NA> can result in incorrect dtype by pandas.
+    df = df.astype(dtypes)
+
     return df