diff --git a/python/cudf/cudf/_fuzz_testing/orc.py b/python/cudf/cudf/_fuzz_testing/orc.py index 1f6f1c2507e..607294a49c9 100644 --- a/python/cudf/cudf/_fuzz_testing/orc.py +++ b/python/cudf/cudf/_fuzz_testing/orc.py @@ -67,6 +67,7 @@ def generate_input(self): dtypes_meta, num_rows, num_cols = _generate_rand_meta( self, dtypes_list ) + self._current_params["dtypes_meta"] = dtypes_meta seed = random.randint(0, 2 ** 32 - 1) self._current_params["seed"] = seed @@ -106,7 +107,6 @@ def set_rand_params(self, params): elif param == "stripes": f = io.BytesIO(self._current_buffer) reader = pyorc.Reader(f) - print("READ: ", reader.num_of_stripes) stripes = [i for i in range(reader.num_of_stripes)] params_dict[param] = np.random.choice( [ @@ -125,10 +125,10 @@ def set_rand_params(self, params): ) elif param == "use_index": params_dict[param] = np.random.choice([True, False]) - elif param in ("skiprows", "num_rows"): - params_dict[param] = np.random.choice( - [None, self._rand(len(self._df))] - ) + elif param in ("skiprows", "num_rows"): + params_dict[param] = np.random.choice( + [None, self._rand(len(self._df))] + ) else: if not isinstance(values, list): raise TypeError("values must be of type list") @@ -143,12 +143,16 @@ def __init__( max_rows=100_000, max_columns=1000, max_string_length=None, + max_lists_length=None, + max_lists_nesting_depth=None, ): super().__init__( dirs=dirs, max_rows=max_rows, max_columns=max_columns, max_string_length=max_string_length, + max_lists_length=max_lists_length, + max_lists_nesting_depth=max_lists_nesting_depth, ) self._df = None @@ -163,11 +167,18 @@ def generate_input(self): else: dtypes_list = list( cudf.utils.dtypes.ALL_TYPES - - {"category"} + # TODO: Remove "bool" from below + # list after following issue is fixed: + # https://github.com/rapidsai/cudf/issues/6763 + - {"category", "bool"} # Following dtypes are not supported by orc # https://orc.apache.org/specification/ORCv0/ - cudf.utils.dtypes.TIMEDELTA_TYPES - cudf.utils.dtypes.UNSIGNED_TYPES + # TODO: Remove `DATETIME_TYPES` once + # following bug is fixed: + # https://github.com/rapidsai/cudf/issues/7355 + - cudf.utils.dtypes.DATETIME_TYPES ) dtypes_meta, num_rows, num_cols = _generate_rand_meta( diff --git a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py index ed7e5b078c6..b3fd7e8c5a7 100644 --- a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py +++ b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py @@ -12,7 +12,6 @@ orc_to_pandas, run_test, ) -from cudf.tests.utils import assert_eq @pythonfuzz( @@ -24,19 +23,16 @@ "use_index": ALL_POSSIBLE_VALUES, }, ) -def orc_reader_test(input_tuple, skiprows, columns, num_rows, use_index): - # TODO: Remove skiprows=0 after - # following issue is fixed: - # https://github.com/rapidsai/cudf/issues/6563 - skiprows = 0 - +def orc_reader_test(input_tuple, columns, skiprows, num_rows, use_index): pdf, file_buffer = input_tuple expected_pdf = pdf.iloc[skiprows:] if num_rows is not None: expected_pdf = expected_pdf.head(num_rows) if skiprows is not None or num_rows is not None: expected_pdf.reset_index(drop=True, inplace=True) - if columns is not None: + if columns is not None and len(columns) > 0: + # ORC reader picks columns if only + # there are any elements in `columns` expected_pdf = expected_pdf[columns] if use_index is False: expected_pdf.reset_index(drop=True, inplace=True) @@ -48,6 +44,7 @@ def orc_reader_test(input_tuple, skiprows, columns, num_rows, use_index): num_rows=num_rows, use_index=use_index, ) + compare_dataframe(expected_pdf, gdf) @@ -61,14 +58,16 @@ def orc_reader_stripes_test(input_tuple, columns, stripes): file_io_obj=io.BytesIO(file_buffer), stripes=stripes ) - if columns is not None: + if columns is not None and len(columns) > 0: + # ORC reader picks columns if only + # there are any elements in `columns` expected_pdf = expected_pdf[columns] gdf = cudf.read_orc( io.BytesIO(file_buffer), columns=columns, stripes=stripes ) - assert_eq(expected_pdf, gdf, check_dtype=False) + compare_dataframe(expected_pdf, gdf) @pythonfuzz( @@ -91,6 +90,7 @@ def orc_writer_test(pdf, compression, enable_statistics): file_to_strore.seek(0) actual_df = cudf.read_orc(file_to_strore) + compare_dataframe(pdf, actual_df) diff --git a/python/cudf/cudf/_fuzz_testing/utils.py b/python/cudf/cudf/_fuzz_testing/utils.py index fd96da3cf98..efcbd8ca792 100644 --- a/python/cudf/cudf/_fuzz_testing/utils.py +++ b/python/cudf/cudf/_fuzz_testing/utils.py @@ -58,6 +58,18 @@ np.dtype(" can result in incorrect dtype by pandas. + df = df.astype(dtypes) + return df