From 147f57b78c659e09f9c3f68eb36865a1579e0b0b Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 9 Feb 2021 12:09:31 -0800 Subject: [PATCH 1/5] updating orc fuzz tests --- python/cudf/cudf/_fuzz_testing/orc.py | 35 +++++++++++++++---- .../cudf/_fuzz_testing/tests/fuzz_test_orc.py | 16 ++++----- python/cudf/cudf/_fuzz_testing/utils.py | 23 ++++++++++-- 3 files changed, 56 insertions(+), 18 deletions(-) diff --git a/python/cudf/cudf/_fuzz_testing/orc.py b/python/cudf/cudf/_fuzz_testing/orc.py index 1f6f1c2507e..97a2b364233 100644 --- a/python/cudf/cudf/_fuzz_testing/orc.py +++ b/python/cudf/cudf/_fuzz_testing/orc.py @@ -67,6 +67,19 @@ def generate_input(self): dtypes_meta, num_rows, num_cols = _generate_rand_meta( self, dtypes_list ) + if num_cols == 0: + """ + If a dataframe has no columns, then pyorc writer will throw + the following error: + ValueError: Struct type must contain at least one sub type. + Hence this is a work-around to skip generating an empty + dataframe. + """ + while num_cols == 0: + dtypes_meta, num_rows, num_cols = _generate_rand_meta( + self, dtypes_list + ) + self._current_params["dtypes_meta"] = dtypes_meta seed = random.randint(0, 2 ** 32 - 1) self._current_params["seed"] = seed @@ -106,7 +119,6 @@ def set_rand_params(self, params): elif param == "stripes": f = io.BytesIO(self._current_buffer) reader = pyorc.Reader(f) - print("READ: ", reader.num_of_stripes) stripes = [i for i in range(reader.num_of_stripes)] params_dict[param] = np.random.choice( [ @@ -125,10 +137,10 @@ def set_rand_params(self, params): ) elif param == "use_index": params_dict[param] = np.random.choice([True, False]) - elif param in ("skiprows", "num_rows"): - params_dict[param] = np.random.choice( - [None, self._rand(len(self._df))] - ) + elif param in ("skiprows", "num_rows"): + params_dict[param] = np.random.choice( + [None, self._rand(len(self._df))] + ) else: if not isinstance(values, list): raise TypeError("values must be of type list") @@ -143,12 +155,16 @@ def 
__init__( max_rows=100_000, max_columns=1000, max_string_length=None, + max_lists_length=None, + max_lists_nesting_depth=None, ): super().__init__( dirs=dirs, max_rows=max_rows, max_columns=max_columns, max_string_length=max_string_length, + max_lists_length=None, + max_lists_nesting_depth=None, ) self._df = None @@ -163,11 +179,18 @@ def generate_input(self): else: dtypes_list = list( cudf.utils.dtypes.ALL_TYPES - - {"category"} + # TODO: Remove "bool" from below + # list after following issue is fixed: + # https://github.com/rapidsai/cudf/issues/6763 + - {"category", "bool"} # Following dtypes are not supported by orc # https://orc.apache.org/specification/ORCv0/ - cudf.utils.dtypes.TIMEDELTA_TYPES - cudf.utils.dtypes.UNSIGNED_TYPES + # TODO: Remove `DATETIME_TYPES` once + # following bug is fixed: + # https://github.com/rapidsai/cudf/issues/7355 + - cudf.utils.dtypes.DATETIME_TYPES ) dtypes_meta, num_rows, num_cols = _generate_rand_meta( diff --git a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py index ed7e5b078c6..ff1fbcfe635 100644 --- a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py +++ b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py @@ -12,7 +12,6 @@ orc_to_pandas, run_test, ) -from cudf.tests.utils import assert_eq @pythonfuzz( @@ -24,19 +23,14 @@ "use_index": ALL_POSSIBLE_VALUES, }, ) -def orc_reader_test(input_tuple, skiprows, columns, num_rows, use_index): - # TODO: Remove skiprows=0 after - # following issue is fixed: - # https://github.com/rapidsai/cudf/issues/6563 - skiprows = 0 - +def orc_reader_test(input_tuple, columns, skiprows, num_rows, use_index): pdf, file_buffer = input_tuple expected_pdf = pdf.iloc[skiprows:] if num_rows is not None: expected_pdf = expected_pdf.head(num_rows) if skiprows is not None or num_rows is not None: expected_pdf.reset_index(drop=True, inplace=True) - if columns is not None: + if columns is not None and len(columns) > 0: expected_pdf = 
expected_pdf[columns] if use_index is False: expected_pdf.reset_index(drop=True, inplace=True) @@ -48,6 +42,7 @@ def orc_reader_test(input_tuple, skiprows, columns, num_rows, use_index): num_rows=num_rows, use_index=use_index, ) + compare_dataframe(expected_pdf, gdf) @@ -61,14 +56,14 @@ def orc_reader_stripes_test(input_tuple, columns, stripes): file_io_obj=io.BytesIO(file_buffer), stripes=stripes ) - if columns is not None: + if columns is not None and len(columns) > 0: expected_pdf = expected_pdf[columns] gdf = cudf.read_orc( io.BytesIO(file_buffer), columns=columns, stripes=stripes ) - assert_eq(expected_pdf, gdf, check_dtype=False) + compare_dataframe(expected_pdf, gdf) @pythonfuzz( @@ -91,6 +86,7 @@ def orc_writer_test(pdf, compression, enable_statistics): file_to_strore.seek(0) actual_df = cudf.read_orc(file_to_strore) + compare_dataframe(pdf, actual_df) diff --git a/python/cudf/cudf/_fuzz_testing/utils.py b/python/cudf/cudf/_fuzz_testing/utils.py index fd96da3cf98..11816b2687d 100644 --- a/python/cudf/cudf/_fuzz_testing/utils.py +++ b/python/cudf/cudf/_fuzz_testing/utils.py @@ -58,6 +58,18 @@ np.dtype(" Date: Tue, 9 Feb 2021 14:59:08 -0800 Subject: [PATCH 2/5] fix empty struct writing error --- python/cudf/cudf/_fuzz_testing/orc.py | 12 ------------ python/cudf/cudf/_fuzz_testing/utils.py | 6 ++---- 2 files changed, 2 insertions(+), 16 deletions(-) diff --git a/python/cudf/cudf/_fuzz_testing/orc.py b/python/cudf/cudf/_fuzz_testing/orc.py index 97a2b364233..560de4a04fa 100644 --- a/python/cudf/cudf/_fuzz_testing/orc.py +++ b/python/cudf/cudf/_fuzz_testing/orc.py @@ -67,18 +67,6 @@ def generate_input(self): dtypes_meta, num_rows, num_cols = _generate_rand_meta( self, dtypes_list ) - if num_cols == 0: - """ - If a dataframe has no columns, then pyorc writer will throw - the following error: - ValueError: Struct type must contain at least one sub type. - Hence this is a work-around to skip generating an empty - dataframe. 
- """ - while num_cols == 0: - dtypes_meta, num_rows, num_cols = _generate_rand_meta( - self, dtypes_list - ) self._current_params["dtypes_meta"] = dtypes_meta seed = random.randint(0, 2 ** 32 - 1) diff --git a/python/cudf/cudf/_fuzz_testing/utils.py b/python/cudf/cudf/_fuzz_testing/utils.py index 11816b2687d..9314e9d1c57 100644 --- a/python/cudf/cudf/_fuzz_testing/utils.py +++ b/python/cudf/cudf/_fuzz_testing/utils.py @@ -282,13 +282,11 @@ def pandas_to_orc(df, file_name=None, file_io_obj=None, stripe_size=67108864): if file_name is not None: with open(file_name, "wb") as data: - with pyorc.Writer( - data, str(schema), stripe_size=stripe_size - ) as writer: + with pyorc.Writer(data, schema, stripe_size=stripe_size) as writer: writer.writerows(tuple_list) elif file_io_obj is not None: with pyorc.Writer( - file_io_obj, str(schema), stripe_size=stripe_size + file_io_obj, schema, stripe_size=stripe_size ) as writer: writer.writerows(tuple_list) From 794f421ecae95c1f4f9798ecfb660373464e6353 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 9 Feb 2021 15:10:33 -0800 Subject: [PATCH 3/5] add comments --- python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py | 4 ++++ python/cudf/cudf/_fuzz_testing/utils.py | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py index ff1fbcfe635..b304f2595f2 100644 --- a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py +++ b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py @@ -31,6 +31,8 @@ def orc_reader_test(input_tuple, columns, skiprows, num_rows, use_index): if skiprows is not None or num_rows is not None: expected_pdf.reset_index(drop=True, inplace=True) if columns is not None and len(columns) > 0: + # ORC reader picks columns if only + # there are any elements in `columns` only expected_pdf = expected_pdf[columns] if use_index is False: expected_pdf.reset_index(drop=True, inplace=True) @@ -57,6 +59,8 @@ def 
orc_reader_stripes_test(input_tuple, columns, stripes): ) if columns is not None and len(columns) > 0: + # ORC reader picks columns if only + # there are any elements in `columns` only expected_pdf = expected_pdf[columns] gdf = cudf.read_orc( diff --git a/python/cudf/cudf/_fuzz_testing/utils.py b/python/cudf/cudf/_fuzz_testing/utils.py index 9314e9d1c57..4e7a6e1dabf 100644 --- a/python/cudf/cudf/_fuzz_testing/utils.py +++ b/python/cudf/cudf/_fuzz_testing/utils.py @@ -85,6 +85,7 @@ def _generate_rand_meta(obj, dtypes_list, null_frequency_override=None): if null_frequency_override is None else null_frequency_override ) + # `cardinality` has to be atleast 1. cardinality = max(1, obj._rand(obj._max_rows)) meta = dict() if dtype == "str": @@ -315,6 +316,9 @@ def orc_to_pandas(file_name=None, file_io_obj=None, stripes=None): df = pd.DataFrame.from_records( records, columns=reader.schema.fields.keys() ) + + # Need to type-cast to extracted `dtypes` from pyorc schema because + # fully empty or all-null data can result in an incorrect dtype being inferred by pandas.
df = df.astype(dtypes) return df From c567808b0695ba9a21c60e3e78b96ff2532cc22b Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 10 Feb 2021 08:31:55 -0600 Subject: [PATCH 4/5] Apply suggestions from code review Co-authored-by: Ram (Ramakrishna Prabhu) <42624703+rgsl888prabhu@users.noreply.github.com> --- python/cudf/cudf/_fuzz_testing/orc.py | 4 ++-- python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/_fuzz_testing/orc.py b/python/cudf/cudf/_fuzz_testing/orc.py index 560de4a04fa..607294a49c9 100644 --- a/python/cudf/cudf/_fuzz_testing/orc.py +++ b/python/cudf/cudf/_fuzz_testing/orc.py @@ -151,8 +151,8 @@ def __init__( max_rows=max_rows, max_columns=max_columns, max_string_length=max_string_length, - max_lists_length=None, - max_lists_nesting_depth=None, + max_lists_length=max_lists_length, + max_lists_nesting_depth=max_lists_nesting_depth, ) self._df = None diff --git a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py index b304f2595f2..b3fd7e8c5a7 100644 --- a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py +++ b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py @@ -32,7 +32,7 @@ def orc_reader_test(input_tuple, columns, skiprows, num_rows, use_index): expected_pdf.reset_index(drop=True, inplace=True) if columns is not None and len(columns) > 0: # ORC reader picks columns if only - # there are any elements in `columns` only + # there are any elements in `columns` expected_pdf = expected_pdf[columns] if use_index is False: expected_pdf.reset_index(drop=True, inplace=True) @@ -60,7 +60,7 @@ def orc_reader_stripes_test(input_tuple, columns, stripes): if columns is not None and len(columns) > 0: # ORC reader picks columns if only - # there are any elements in `columns` only + # there are any elements in `columns` expected_pdf = expected_pdf[columns] gdf = cudf.read_orc( From 
b5f81652ac1cb9b17b807d58c2d4d85528802800 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 11 Feb 2021 12:31:17 -0600 Subject: [PATCH 5/5] Update python/cudf/cudf/_fuzz_testing/utils.py Co-authored-by: Vukasin Milovanovic --- python/cudf/cudf/_fuzz_testing/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/_fuzz_testing/utils.py b/python/cudf/cudf/_fuzz_testing/utils.py index 4e7a6e1dabf..efcbd8ca792 100644 --- a/python/cudf/cudf/_fuzz_testing/utils.py +++ b/python/cudf/cudf/_fuzz_testing/utils.py @@ -85,7 +85,7 @@ def _generate_rand_meta(obj, dtypes_list, null_frequency_override=None): if null_frequency_override is None else null_frequency_override ) - # `cardinality` has to be atleast 1. + # `cardinality` has to be at least 1. cardinality = max(1, obj._rand(obj._max_rows)) meta = dict() if dtype == "str":