Skip to content

Commit

Permalink
Update orc reader and writer fuzz tests (#7357)
Browse files Browse the repository at this point in the history
This PR introduces:

- [x] Fixes for some of the breakages introduced by the latest `pyorc` when using `pyorc.Struct`.
- [x] Adapt to `list` dtype parameter changes introduced previously.
- [x] Misc fixes required for proper fuzz test runs.

Authors:
  - GALI PREM SAGAR (@galipremsagar)

Approvers:
  - Vukasin Milovanovic (@vuule)
  - Ram (Ramakrishna Prabhu) (@rgsl888prabhu)

URL: #7357
  • Loading branch information
galipremsagar authored Feb 11, 2021
1 parent 21d2ce6 commit ebe307e
Show file tree
Hide file tree
Showing 3 changed files with 54 additions and 22 deletions.
23 changes: 17 additions & 6 deletions python/cudf/cudf/_fuzz_testing/orc.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ def generate_input(self):
dtypes_meta, num_rows, num_cols = _generate_rand_meta(
self, dtypes_list
)

self._current_params["dtypes_meta"] = dtypes_meta
seed = random.randint(0, 2 ** 32 - 1)
self._current_params["seed"] = seed
Expand Down Expand Up @@ -106,7 +107,6 @@ def set_rand_params(self, params):
elif param == "stripes":
f = io.BytesIO(self._current_buffer)
reader = pyorc.Reader(f)
print("READ: ", reader.num_of_stripes)
stripes = [i for i in range(reader.num_of_stripes)]
params_dict[param] = np.random.choice(
[
Expand All @@ -125,10 +125,10 @@ def set_rand_params(self, params):
)
elif param == "use_index":
params_dict[param] = np.random.choice([True, False])
elif param in ("skiprows", "num_rows"):
params_dict[param] = np.random.choice(
[None, self._rand(len(self._df))]
)
elif param in ("skiprows", "num_rows"):
params_dict[param] = np.random.choice(
[None, self._rand(len(self._df))]
)
else:
if not isinstance(values, list):
raise TypeError("values must be of type list")
Expand All @@ -143,12 +143,16 @@ def __init__(
max_rows=100_000,
max_columns=1000,
max_string_length=None,
max_lists_length=None,
max_lists_nesting_depth=None,
):
super().__init__(
dirs=dirs,
max_rows=max_rows,
max_columns=max_columns,
max_string_length=max_string_length,
max_lists_length=max_lists_length,
max_lists_nesting_depth=max_lists_nesting_depth,
)
self._df = None

Expand All @@ -163,11 +167,18 @@ def generate_input(self):
else:
dtypes_list = list(
cudf.utils.dtypes.ALL_TYPES
- {"category"}
# TODO: Remove "bool" from below
# list after following issue is fixed:
# https://github.com/rapidsai/cudf/issues/6763
- {"category", "bool"}
# Following dtypes are not supported by orc
# https://orc.apache.org/specification/ORCv0/
- cudf.utils.dtypes.TIMEDELTA_TYPES
- cudf.utils.dtypes.UNSIGNED_TYPES
# TODO: Remove `DATETIME_TYPES` once
# following bug is fixed:
# https://github.com/rapidsai/cudf/issues/7355
- cudf.utils.dtypes.DATETIME_TYPES
)

dtypes_meta, num_rows, num_cols = _generate_rand_meta(
Expand Down
20 changes: 10 additions & 10 deletions python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
orc_to_pandas,
run_test,
)
from cudf.tests.utils import assert_eq


@pythonfuzz(
Expand All @@ -24,19 +23,16 @@
"use_index": ALL_POSSIBLE_VALUES,
},
)
def orc_reader_test(input_tuple, skiprows, columns, num_rows, use_index):
# TODO: Remove skiprows=0 after
# following issue is fixed:
# https://github.com/rapidsai/cudf/issues/6563
skiprows = 0

def orc_reader_test(input_tuple, columns, skiprows, num_rows, use_index):
pdf, file_buffer = input_tuple
expected_pdf = pdf.iloc[skiprows:]
if num_rows is not None:
expected_pdf = expected_pdf.head(num_rows)
if skiprows is not None or num_rows is not None:
expected_pdf.reset_index(drop=True, inplace=True)
if columns is not None:
if columns is not None and len(columns) > 0:
# ORC reader selects columns only if
# `columns` contains at least one element
expected_pdf = expected_pdf[columns]
if use_index is False:
expected_pdf.reset_index(drop=True, inplace=True)
Expand All @@ -48,6 +44,7 @@ def orc_reader_test(input_tuple, skiprows, columns, num_rows, use_index):
num_rows=num_rows,
use_index=use_index,
)

compare_dataframe(expected_pdf, gdf)


Expand All @@ -61,14 +58,16 @@ def orc_reader_stripes_test(input_tuple, columns, stripes):
file_io_obj=io.BytesIO(file_buffer), stripes=stripes
)

if columns is not None:
if columns is not None and len(columns) > 0:
# ORC reader selects columns only if
# `columns` contains at least one element
expected_pdf = expected_pdf[columns]

gdf = cudf.read_orc(
io.BytesIO(file_buffer), columns=columns, stripes=stripes
)

assert_eq(expected_pdf, gdf, check_dtype=False)
compare_dataframe(expected_pdf, gdf)


@pythonfuzz(
Expand All @@ -91,6 +90,7 @@ def orc_writer_test(pdf, compression, enable_statistics):
file_to_strore.seek(0)

actual_df = cudf.read_orc(file_to_strore)

compare_dataframe(pdf, actual_df)


Expand Down
33 changes: 27 additions & 6 deletions python/cudf/cudf/_fuzz_testing/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,18 @@
np.dtype("<M8[us]"): pyorc.Timestamp(),
}

# Maps a pyorc type's `.name` to the pandas/numpy dtype that a column of that
# ORC type is cast to. `orc_to_pandas` looks each schema field up here and
# passes the resulting mapping to `DataFrame.astype`.
# Integer and boolean ORC types map to pandas extension dtypes
# (`pd.Int*Dtype`, `pd.BooleanDtype`) -- presumably so that nulls can be
# represented without changing the column dtype; TODO confirm.
ORC_TO_PANDAS_TYPES = {
pyorc.TinyInt().name: pd.Int8Dtype(),
pyorc.Int().name: pd.Int32Dtype(),
pyorc.Boolean().name: pd.BooleanDtype(),
pyorc.SmallInt().name: pd.Int16Dtype(),
pyorc.BigInt().name: pd.Int64Dtype(),
pyorc.String().name: np.dtype("O"),
pyorc.Float().name: np.dtype("float32"),
pyorc.Double().name: np.dtype("float64"),
pyorc.Timestamp().name: np.dtype("<M8[ns]"),
}


def _generate_rand_meta(obj, dtypes_list, null_frequency_override=None):
obj._current_params = {}
Expand All @@ -73,7 +85,8 @@ def _generate_rand_meta(obj, dtypes_list, null_frequency_override=None):
if null_frequency_override is None
else null_frequency_override
)
cardinality = obj._rand(obj._max_rows)
# `cardinality` has to be at least 1.
cardinality = max(1, obj._rand(obj._max_rows))
meta = dict()
if dtype == "str":
# We want to operate near the limits of string column
Expand Down Expand Up @@ -190,7 +203,8 @@ def get_avro_schema(df):

def get_orc_schema(df):
ordered_dict = OrderedDict(
(col_name, col_dtype) for col_name, col_dtype in df.dtypes.items()
(col_name, get_orc_dtype_info(col_dtype))
for col_name, col_dtype in df.dtypes.items()
)

schema = pyorc.Struct(**ordered_dict)
Expand Down Expand Up @@ -269,13 +283,11 @@ def pandas_to_orc(df, file_name=None, file_io_obj=None, stripe_size=67108864):

if file_name is not None:
with open(file_name, "wb") as data:
with pyorc.Writer(
data, str(schema), stripe_size=stripe_size
) as writer:
with pyorc.Writer(data, schema, stripe_size=stripe_size) as writer:
writer.writerows(tuple_list)
elif file_io_obj is not None:
with pyorc.Writer(
file_io_obj, str(schema), stripe_size=stripe_size
file_io_obj, schema, stripe_size=stripe_size
) as writer:
writer.writerows(tuple_list)

Expand All @@ -288,6 +300,11 @@ def orc_to_pandas(file_name=None, file_io_obj=None, stripes=None):

reader = pyorc.Reader(f)

dtypes = {
col: ORC_TO_PANDAS_TYPES[pyorc_type.name]
for col, pyorc_type in reader.schema.fields.items()
}

if stripes is None:
df = pd.DataFrame.from_records(
reader, columns=reader.schema.fields.keys()
Expand All @@ -300,6 +317,10 @@ def orc_to_pandas(file_name=None, file_io_obj=None, stripes=None):
records, columns=reader.schema.fields.keys()
)

# Need to type-cast to the `dtypes` extracted from the pyorc schema because
# a fully empty or all-<NA> column can be given an incorrect dtype by pandas.
df = df.astype(dtypes)

return df


Expand Down

0 comments on commit ebe307e

Please sign in to comment.