From 90f89a19ee93ee71e71a435942411f97a6b6640a Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Sat, 4 Sep 2021 17:15:57 -0700 Subject: [PATCH 1/7] add initial datagenerator struct support --- python/cudf/cudf/testing/dataset_generator.py | 74 +++++++++++++++++-- 1 file changed, 66 insertions(+), 8 deletions(-) diff --git a/python/cudf/cudf/testing/dataset_generator.py b/python/cudf/cudf/testing/dataset_generator.py index cdea22a05af..080faea71d1 100644 --- a/python/cudf/cudf/testing/dataset_generator.py +++ b/python/cudf/cudf/testing/dataset_generator.py @@ -352,6 +352,25 @@ def rand_dataframe( dtype=dtype, ) ) + elif dtype == "struct": + nesting_max_depth = meta["nesting_max_depth"] + nesting_depth = np.random.randint(1, nesting_max_depth) + + # TODO: Fix me + # column_params.append( + # ColumnParameters( + # cardinality=cardinality, + # null_frequency=null_frequency, + # generator=list_generator( + # dtype=value_type, + # size=cardinality, + # nesting_depth=nesting_depth, + # lists_max_length=lists_max_length, + # ), + # is_sorted=False, + # dtype=dtype, + # ) + # ) elif dtype == "decimal64": max_precision = meta.get( "max_precision", cudf.Decimal64Dtype.MAX_PRECISION @@ -535,11 +554,15 @@ def decimal_generator(dtype, size): ) -def get_values_for_nested_data(dtype, lists_max_length): +def get_values_for_nested_data(dtype, lists_max_length=None, size=None): """ Returns list of values based on dtype. """ - cardinality = np.random.randint(0, lists_max_length) + if size is None: + cardinality = np.random.randint(0, lists_max_length) + else: + cardinality = size + dtype = cudf.dtype(dtype) if dtype.kind in ("i", "u"): values = int_generator(dtype=dtype, size=cardinality)() @@ -563,12 +586,7 @@ def get_values_for_nested_data(dtype, lists_max_length): else: raise TypeError(f"Unsupported dtype: {dtype}") - # To ensure numpy arrays are not passed as input to - # list constructor, returning a python list object here. - if isinstance(values, np.ndarray): - return values.tolist() - else: - return values + return values def make_lists(dtype, lists_max_length, nesting_depth, top_level_list): @@ -592,9 +610,26 @@ def make_lists(dtype, lists_max_length, nesting_depth, top_level_list): top_level_list = get_values_for_nested_data( dtype=dtype, lists_max_length=lists_max_length ) + # To ensure numpy arrays are not passed as input to + # list constructor, returning a python list object here. + if isinstance(top_level_list, np.ndarray): + top_level_list = top_level_list.tolist() + return top_level_list +def make_array_for_struct(dtype, size): + """ + Helper to create a pa.array with `size` and `dtype` + for a `StructArray`. + """ + + data = get_values_for_nested_data( + dtype=dtype.type.to_pandas_dtype(), size=size + ) + return pa.array(data, type=dtype.type) + + def get_nested_lists(dtype, size, nesting_depth, lists_max_length): """ Returns a list of nested lists with random nesting @@ -615,6 +650,22 @@ def get_nested_lists(dtype, size, nesting_depth, lists_max_length): return list_of_lists +def get_nested_structs(dtype, size): + """ + Returns a list of arrays with random data + corresponding to the dtype provided. + ``dtype`` here should be a ``cudf.StructDtype`` + """ + list_of_arrays = [] + + for name, col_dtype in dtype.fields.items(): + list_of_arrays.append( + make_array_for_struct(dtype=dtype._typ[name], size=size) + ) + + return list_of_arrays + + def list_generator(dtype, size, nesting_depth, lists_max_length): """ Generator for list data @@ -625,3 +676,10 @@ def list_generator(dtype, size, nesting_depth, lists_max_length): nesting_depth=nesting_depth, lists_max_length=lists_max_length, ) + + +def struct_generator(dtype, size): + """ + Generator for struct data + """ + return lambda: get_nested_structs(dtype=dtype, size=size,) From ab6b0f2a8e7e6b6d06ff48bded236ea931f3cf87 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 7 Sep 2021 23:23:41 -0500 Subject: [PATCH 2/7] implementation details --- python/cudf/cudf/_fuzz_testing/io.py | 6 + python/cudf/cudf/_fuzz_testing/main.py | 4 +- python/cudf/cudf/_fuzz_testing/parquet.py | 4 + python/cudf/cudf/_fuzz_testing/utils.py | 20 +++ python/cudf/cudf/testing/dataset_generator.py | 123 ++++++++++++++---- 5 files changed, 129 insertions(+), 28 deletions(-) diff --git a/python/cudf/cudf/_fuzz_testing/io.py b/python/cudf/cudf/_fuzz_testing/io.py index 1312300f714..193fb4c7f7f 100644 --- a/python/cudf/cudf/_fuzz_testing/io.py +++ b/python/cudf/cudf/_fuzz_testing/io.py @@ -25,6 +25,9 @@ def __init__( max_string_length=None, max_lists_length=None, max_lists_nesting_depth=None, + max_structs_nesting_depth=None, + max_struct_null_frequency=None, + max_struct_types_at_each_level=None, ): dirs = [] if dirs is None else dirs self._inputs = [] @@ -33,6 +36,9 @@ def __init__( self._max_string_length = max_string_length self._max_lists_length = max_lists_length self._max_lists_nesting_depth = max_lists_nesting_depth + self._max_structs_nesting_depth = max_structs_nesting_depth + self._max_struct_null_frequency = max_struct_null_frequency + self._max_struct_types_at_each_level = max_struct_types_at_each_level for i, path in enumerate(dirs): if i == 0 and not os.path.exists(path): diff --git a/python/cudf/cudf/_fuzz_testing/main.py b/python/cudf/cudf/_fuzz_testing/main.py index 7b28a4c4970..e7c0c326676 100644 --- a/python/cudf/cudf/_fuzz_testing/main.py +++ b/python/cudf/cudf/_fuzz_testing/main.py @@ -13,8 +13,8 @@ def __init__(self, func, params=None, data_handle=None, **kwargs): dirs=kwargs.get("dir", None), crash_reports_dir=kwargs.get("crash_reports_dir", None), regression=kwargs.get("regression", False), - max_rows_size=kwargs.get("max_rows_size", 100_000), - max_cols_size=kwargs.get("max_cols_size", 1000), + max_rows_size=kwargs.get("max_rows_size", 100), + max_cols_size=kwargs.get("max_cols_size", 50), runs=kwargs.get("runs", -1), max_string_length=kwargs.get("max_string_length", None), params=params, diff --git a/python/cudf/cudf/_fuzz_testing/parquet.py b/python/cudf/cudf/_fuzz_testing/parquet.py index 5b00f96d88d..ecad18acc81 100644 --- a/python/cudf/cudf/_fuzz_testing/parquet.py +++ b/python/cudf/cudf/_fuzz_testing/parquet.py @@ -59,6 +59,7 @@ def generate_input(self): - {"uint32"} | {"list", "decimal64"} ) + dtypes_list = ["struct"] dtypes_meta, num_rows, num_cols = _generate_rand_meta( self, dtypes_list ) @@ -80,6 +81,9 @@ def generate_input(self): # https://issues.apache.org/jira/browse/ARROW-10123 # file = io.BytesIO() + import pdb + + pdb.set_trace() df.to_parquet("temp_file") # file.seek(0) # self._current_buffer = copy.copy(file.read()) diff --git a/python/cudf/cudf/_fuzz_testing/utils.py b/python/cudf/cudf/_fuzz_testing/utils.py index 0e68f1c71cc..9949d1068c7 100644 --- a/python/cudf/cudf/_fuzz_testing/utils.py +++ b/python/cudf/cudf/_fuzz_testing/utils.py @@ -114,6 +114,26 @@ def _generate_rand_meta(obj, dtypes_list, null_frequency_override=None): meta["value_type"] = random.choice( list(cudf.utils.dtypes.ALL_TYPES - {"category"}) ) + elif dtype == "struct": + if obj._max_lists_nesting_depth is None: + meta["nesting_max_depth"] = np.random.randint(2, 10) + else: + meta["nesting_max_depth"] = obj._max_lists_nesting_depth + + if obj._max_struct_null_frequency is None: + meta["max_null_frequency"] = random.uniform(0, 1) + else: + meta["max_null_frequency"] = obj._max_struct_null_frequency + + if obj._max_struct_types_at_each_level is None: + meta["max_types_at_each_level"] = np.random.randint( + low=1, high=10 + ) + else: + meta[ + "max_types_at_each_level" + ] = obj._max_struct_types_at_each_level + elif dtype == "decimal64": meta["max_precision"] = cudf.Decimal64Dtype.MAX_PRECISION diff --git a/python/cudf/cudf/testing/dataset_generator.py b/python/cudf/cudf/testing/dataset_generator.py index 080faea71d1..5557d9d64b9 100644 --- a/python/cudf/cudf/testing/dataset_generator.py +++ b/python/cudf/cudf/testing/dataset_generator.py @@ -133,7 +133,25 @@ def _generate_column(column_params, num_rows): else: arrow_type = None - if not isinstance(arrow_type, pa.lib.Decimal128Type): + if isinstance(column_params.dtype, cudf.StructDtype): + vals = pa.StructArray.from_arrays( + column_params.generator, + names=column_params.dtype.fields.keys(), + mask=pa.array( + np.random.choice( + [True, False], + size=num_rows, + p=[ + column_params.null_frequency, + 1 - column_params.null_frequency, + ], + ) + ) + if column_params.null_frequency > 0.0 + else None, + ) + return vals + elif not isinstance(arrow_type, pa.lib.Decimal128Type): vals = pa.array( column_params.generator, size=column_params.cardinality, @@ -354,23 +372,28 @@ def rand_dataframe( ) elif dtype == "struct": nesting_max_depth = meta["nesting_max_depth"] + max_types_at_each_level = meta["max_types_at_each_level"] + max_null_frequency = meta["max_null_frequency"] nesting_depth = np.random.randint(1, nesting_max_depth) + structDtype = create_nested_struct_type( + max_types_at_each_level=max_types_at_each_level, + nesting_level=nesting_depth, + ) - # TODO: Fix me - # column_params.append( - # ColumnParameters( - # cardinality=cardinality, - # null_frequency=null_frequency, - # generator=list_generator( - # dtype=value_type, - # size=cardinality, - # nesting_depth=nesting_depth, - # lists_max_length=lists_max_length, - # ), - # is_sorted=False, - # dtype=dtype, - # ) - # ) + column_params.append( + ColumnParameters( + cardinality=cardinality, + null_frequency=null_frequency, + generator=struct_generator( + dtype=structDtype, + cardinality=cardinality, + size=rows, + max_null_frequency=max_null_frequency, + ), + is_sorted=False, + dtype=structDtype, + ) + ) elif dtype == "decimal64": max_precision = meta.get( "max_precision", cudf.Decimal64Dtype.MAX_PRECISION @@ -482,7 +505,7 @@ def rand_dataframe( df = get_dataframe( Parameters(num_rows=rows, column_parameters=column_params, seed=seed,), - use_threads=use_threads, + use_threads=False, ) return df @@ -618,16 +641,30 @@ def make_lists(dtype, lists_max_length, nesting_depth, top_level_list): return top_level_list -def make_array_for_struct(dtype, size): +def make_array_for_struct(dtype, cardinality, size, max_null_frequency): """ Helper to create a pa.array with `size` and `dtype` for a `StructArray`. """ + null_frequency = np.random.uniform(low=0, high=max_null_frequency) + local_cardinality = max(np.random.randint(low=0, high=cardinality), 1) data = get_values_for_nested_data( - dtype=dtype.type.to_pandas_dtype(), size=size + dtype=dtype.type.to_pandas_dtype(), size=local_cardinality + ) + vals = np.random.choice(data, size=size) + + return pa.array( + vals, + mask=np.random.choice( + [True, False], size=size, p=[null_frequency, 1 - null_frequency], + ) + if null_frequency > 0.0 + else None, + size=size, + safe=False, + type=dtype.type, ) - return pa.array(data, type=dtype.type) def get_nested_lists(dtype, size, nesting_depth, lists_max_length): @@ -650,7 +687,7 @@ def get_nested_lists(dtype, size, nesting_depth, lists_max_length): return list_of_lists -def get_nested_structs(dtype, size): +def get_nested_structs(dtype, cardinality, size, max_null_frequency): """ Returns a list of arrays with random data corresponding to the dtype provided. @@ -659,9 +696,21 @@ def get_nested_structs(dtype, size): list_of_arrays = [] for name, col_dtype in dtype.fields.items(): - list_of_arrays.append( - make_array_for_struct(dtype=dtype._typ[name], size=size) - ) + if isinstance(col_dtype, cudf.StructDtype): + result_arrays = get_nested_structs( + col_dtype, cardinality, size, max_null_frequency + ) + result_arrays = pa.StructArray.from_arrays( + result_arrays, names=col_dtype.fields.keys() + ) + else: + result_arrays = make_array_for_struct( + dtype=dtype._typ[name], + cardinality=cardinality, + size=size, + max_null_frequency=max_null_frequency, + ) + list_of_arrays.append(result_arrays) return list_of_arrays @@ -678,8 +727,30 @@ def list_generator(dtype, size, nesting_depth, lists_max_length): ) -def struct_generator(dtype, size): +def struct_generator(dtype, cardinality, size, max_null_frequency): """ Generator for struct data """ - return lambda: get_nested_structs(dtype=dtype, size=size,) + return lambda: get_nested_structs( + dtype=dtype, + cardinality=cardinality, + size=size, + max_null_frequency=max_null_frequency, + ) + + +def create_nested_struct_type(max_types_at_each_level, nesting_level): + dtypes_list = cudf.utils.dtypes.ALL_TYPES - { + "category", + "datetime64[ns]", + } - cudf.utils.dtypes.TIMEDELTA_TYPES - {"uint32"} | {"struct"} + picked_types = np.random.choice(list(dtypes_list), max_types_at_each_level) + type_dict = {} + for name, type_ in enumerate(picked_types): + if type_ == "struct": + type_dict[str(name)] = create_nested_struct_type( + max_types_at_each_level, nesting_level - 1 + ) + else: + type_dict[str(name)] = cudf.dtype(type_) + return cudf.StructDtype(type_dict) From 7b94eca9f6d9b89386cf396b06439cb92fdaa4f6 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 5 Oct 2021 18:06:57 -0700 Subject: [PATCH 3/7] add struct support --- python/cudf/cudf/_fuzz_testing/orc.py | 7 +- .../cudf/_fuzz_testing/tests/fuzz_test_orc.py | 9 ++- python/cudf/cudf/_fuzz_testing/utils.py | 72 +++++++++++++++---- python/cudf/cudf/core/column/struct.py | 12 ++++ python/cudf/cudf/testing/dataset_generator.py | 7 +- python/cudf/cudf/tests/test_orc.py | 2 +- 6 files changed, 91 insertions(+), 18 deletions(-) diff --git a/python/cudf/cudf/_fuzz_testing/orc.py b/python/cudf/cudf/_fuzz_testing/orc.py index 2aa01eb3967..d9b8173fcb0 100644 --- a/python/cudf/cudf/_fuzz_testing/orc.py +++ b/python/cudf/cudf/_fuzz_testing/orc.py @@ -63,7 +63,7 @@ def generate_input(self): - cudf.utils.dtypes.UNSIGNED_TYPES - {"datetime64[ns]"} ) - + dtypes_list = ["struct"] dtypes_meta, num_rows, num_cols = _generate_rand_meta( self, dtypes_list ) @@ -83,7 +83,10 @@ def generate_input(self): self._df = df file_obj = io.BytesIO() pandas_to_orc( - df, file_io_obj=file_obj, stripe_size=self._rand(len(df)) + df, + file_io_obj=file_obj, + stripe_size=self._rand(len(df)), + arrow_table_schema=table.schema, ) file_obj.seek(0) buf = file_obj.read() diff --git a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py index b3fd7e8c5a7..ad2943512fb 100644 --- a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py +++ b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py @@ -18,7 +18,7 @@ data_handle=OrcReader, params={ "columns": ALL_POSSIBLE_VALUES, - "skiprows": ALL_POSSIBLE_VALUES, + "skiprows": [None], "num_rows": ALL_POSSIBLE_VALUES, "use_index": ALL_POSSIBLE_VALUES, }, @@ -44,8 +44,13 @@ def orc_reader_test(input_tuple, columns, skiprows, num_rows, use_index): num_rows=num_rows, use_index=use_index, ) + try: + compare_dataframe(expected_pdf, gdf) + except AssertionError: + import pdb - compare_dataframe(expected_pdf, gdf) + pdb.set_trace() + print("abc") @pythonfuzz( diff --git a/python/cudf/cudf/_fuzz_testing/utils.py b/python/cudf/cudf/_fuzz_testing/utils.py index 9949d1068c7..1151919f0fd 100644 --- a/python/cudf/cudf/_fuzz_testing/utils.py +++ b/python/cudf/cudf/_fuzz_testing/utils.py @@ -6,6 +6,7 @@ import fastavro import numpy as np import pandas as pd +import pyarrow as pa import pyorc import cudf @@ -179,6 +180,8 @@ def pyarrow_to_pandas(table): df[column._name] = pd.Series( column, dtype=pyarrow_dtypes_to_pandas_dtypes[column.type] ) + elif isinstance(column.type, pa.StructType): + df[column._name] = column.to_pandas(integer_object_nulls=True) else: df[column._name] = column.to_pandas() @@ -208,12 +211,21 @@ def get_orc_dtype_info(dtype): if dtype in PANDAS_TO_ORC_TYPES: return PANDAS_TO_ORC_TYPES[dtype] else: + # import pdb;pdb.set_trace() raise TypeError( f"Unsupported dtype({dtype}) according to orc spec:" f" https://orc.apache.org/specification/" ) +def get_arrow_dtype_info_for_pyorc(dtype): + if isinstance(dtype, pa.StructType): + return get_orc_schema(df=None, arrow_table_schema=dtype) + else: + pd_dtype = cudf.dtype(dtype.to_pandas_dtype()) + return get_orc_dtype_info(pd_dtype) + + def get_avro_schema(df): fields = [ {"name": col_name, "type": get_avro_dtype_info(col_dtype)} @@ -223,11 +235,17 @@ def get_avro_schema(df): return schema -def get_orc_schema(df): - ordered_dict = OrderedDict( - (col_name, get_orc_dtype_info(col_dtype)) - for col_name, col_dtype in df.dtypes.items() - ) +def get_orc_schema(df, arrow_table_schema=None): + if arrow_table_schema is None: + ordered_dict = OrderedDict( + (col_name, get_orc_dtype_info(col_dtype)) + for col_name, col_dtype in df.dtypes.items() + ) + else: + ordered_dict = OrderedDict( + (field.name, get_arrow_dtype_info_for_pyorc(field.type)) + for field in arrow_table_schema + ) schema = pyorc.Struct(**ordered_dict) return schema @@ -273,13 +291,25 @@ def pandas_to_avro(df, file_name=None, file_io_obj=None): fastavro.writer(file_io_obj, avro_schema, records) -def _preprocess_to_orc_tuple(df): +def _preprocess_to_orc_tuple(df, arrow_table_schema): def _null_to_None(value): if value is pd.NA or value is pd.NaT: return None else: return value + def sanitize(value, struct_type): + if value is None: + return None + # import pdb;pdb.set_trace() + values_list = [] + for name, sub_type in struct_type.fields.items(): + if isinstance(sub_type, cudf.StructDtype): + values_list.append(sanitize(value[name], sub_type)) + else: + values_list.append(value[name]) + return tuple(values_list) + has_nulls_or_nullable_dtype = any( [ True @@ -289,20 +319,38 @@ def _null_to_None(value): for col in df.columns ] ) + pdf = df.copy(deep=True) + for field in arrow_table_schema: + if isinstance(field.type, pa.StructType): + # import pdb;pdb.set_trace() + pdf[field.name] = pdf[field.name].apply( + sanitize, args=(cudf.StructDtype.from_arrow(field.type),) + ) + else: + pdf[field.name] = pdf[field.name] tuple_list = [ tuple(map(_null_to_None, tup)) if has_nulls_or_nullable_dtype else tup - for tup in df.itertuples(index=False, name=None) + for tup in pdf.itertuples(index=False, name=None) ] - return tuple_list - + return tuple_list, pdf, df -def pandas_to_orc(df, file_name=None, file_io_obj=None, stripe_size=67108864): - schema = get_orc_schema(df) - tuple_list = _preprocess_to_orc_tuple(df) +def pandas_to_orc( + df, + file_name=None, + file_io_obj=None, + stripe_size=67108864, + arrow_table_schema=None, +): + # import pdb;pdb.set_trace() + schema = get_orc_schema(df, arrow_table_schema=arrow_table_schema) + tuple_list, pdf, df = _preprocess_to_orc_tuple( + df, arrow_table_schema=arrow_table_schema + ) + # import pdb;pdb.set_trace() if file_name is not None: with open(file_name, "wb") as data: with pyorc.Writer(data, schema, stripe_size=stripe_size) as writer: diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py index 7167918d14d..2680c241b6a 100644 --- a/python/cudf/cudf/core/column/struct.py +++ b/python/cudf/cudf/core/column/struct.py @@ -1,6 +1,7 @@ # Copyright (c) 2020, NVIDIA CORPORATION. from __future__ import annotations +import pandas as pd import pyarrow as pa import cudf @@ -80,6 +81,17 @@ def to_arrow(self): pa_type, len(self), buffers, children=children ) + def to_pandas(self, index: pd.Index = None, **kwargs) -> "pd.Series": + nullable = kwargs.get("nullable", False) + if nullable: + kwargs["integer_object_nulls"] = True + # pd_series = pd.Series(self.to_arrow().tolist()) + pd_series = self.to_arrow().to_pandas(kwargs) + + if index is not None: + pd_series.index = index + return pd_series + def __getitem__(self, args): result = super().__getitem__(args) if isinstance(result, dict): diff --git a/python/cudf/cudf/testing/dataset_generator.py b/python/cudf/cudf/testing/dataset_generator.py index bed0c554b90..c14694b8a86 100644 --- a/python/cudf/cudf/testing/dataset_generator.py +++ b/python/cudf/cudf/testing/dataset_generator.py @@ -776,7 +776,12 @@ def create_nested_struct_type(max_types_at_each_level, nesting_level): dtypes_list = cudf.utils.dtypes.ALL_TYPES - { "category", "datetime64[ns]", - } - cudf.utils.dtypes.TIMEDELTA_TYPES - {"uint32"} | {"struct"} + "str", + } - cudf.utils.dtypes.TIMEDELTA_TYPES - { + "uint32" + } - cudf.utils.dtypes.UNSIGNED_TYPES | { + "struct" + } picked_types = np.random.choice(list(dtypes_list), max_types_at_each_level) type_dict = {} for name, type_ in enumerate(picked_types): diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index 1230b4b35f3..66e04d1d945 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -941,7 +941,7 @@ def generate_list_struct_buff(size=100_000): "struct_nests_list": struct_nests_list, } ) - + # import pdb;pdb.set_trace() writer = po.Writer(buff, schema, stripe_size=1024) tuples = list( map( From 5f3e1f8f006971bd8d5568cafa837432304024df Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 19 Jan 2022 09:07:22 -0800 Subject: [PATCH 4/7] add struct.to_pandas() --- python/cudf/cudf/core/column/struct.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py index 2680c241b6a..a09f9296135 100644 --- a/python/cudf/cudf/core/column/struct.py +++ b/python/cudf/cudf/core/column/struct.py @@ -82,12 +82,7 @@ def to_arrow(self): ) def to_pandas(self, index: pd.Index = None, **kwargs) -> "pd.Series": - nullable = kwargs.get("nullable", False) - if nullable: - kwargs["integer_object_nulls"] = True - # pd_series = pd.Series(self.to_arrow().tolist()) - pd_series = self.to_arrow().to_pandas(kwargs) - + pd_series = pd.Series(self.to_arrow().tolist(), dtype="object") if index is not None: pd_series.index = index return pd_series From 9708092f1618f5c9b6c656617c63b3179ec64ec8 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 19 Jan 2022 09:11:44 -0800 Subject: [PATCH 5/7] resolve conflicts --- python/cudf/cudf/core/column/struct.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py index a09f9296135..f0d02a706e2 100644 --- a/python/cudf/cudf/core/column/struct.py +++ b/python/cudf/cudf/core/column/struct.py @@ -82,7 +82,11 @@ def to_arrow(self): ) def to_pandas(self, index: pd.Index = None, **kwargs) -> "pd.Series": + # We cannot go via Arrow's `to_pandas` because of the following issue: + # https://issues.apache.org/jira/browse/ARROW-12680 + pd_series = pd.Series(self.to_arrow().tolist(), dtype="object") + if index is not None: pd_series.index = index return pd_series From 3c1447dfff87bda158c74ba1e4cbb3e53f443150 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 19 Jan 2022 09:16:49 -0800 Subject: [PATCH 6/7] cleanup --- python/cudf/cudf/_fuzz_testing/main.py | 4 ++-- python/cudf/cudf/_fuzz_testing/orc.py | 2 +- python/cudf/cudf/_fuzz_testing/parquet.py | 4 +--- python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py | 7 +------ python/cudf/cudf/_fuzz_testing/utils.py | 7 ++----- python/cudf/cudf/testing/dataset_generator.py | 2 +- python/cudf/cudf/tests/test_orc.py | 2 +- 7 files changed, 9 insertions(+), 19 deletions(-) diff --git a/python/cudf/cudf/_fuzz_testing/main.py b/python/cudf/cudf/_fuzz_testing/main.py index e7c0c326676..7b28a4c4970 100644 --- a/python/cudf/cudf/_fuzz_testing/main.py +++ b/python/cudf/cudf/_fuzz_testing/main.py @@ -13,8 +13,8 @@ def __init__(self, func, params=None, data_handle=None, **kwargs): dirs=kwargs.get("dir", None), crash_reports_dir=kwargs.get("crash_reports_dir", None), regression=kwargs.get("regression", False), - max_rows_size=kwargs.get("max_rows_size", 100), - max_cols_size=kwargs.get("max_cols_size", 50), + max_rows_size=kwargs.get("max_rows_size", 100_000), + max_cols_size=kwargs.get("max_cols_size", 1000), runs=kwargs.get("runs", -1), max_string_length=kwargs.get("max_string_length", None), params=params, diff --git a/python/cudf/cudf/_fuzz_testing/orc.py b/python/cudf/cudf/_fuzz_testing/orc.py index d9b8173fcb0..78e01fb76a4 100644 --- a/python/cudf/cudf/_fuzz_testing/orc.py +++ b/python/cudf/cudf/_fuzz_testing/orc.py @@ -63,7 +63,7 @@ def generate_input(self): - cudf.utils.dtypes.UNSIGNED_TYPES - {"datetime64[ns]"} ) - dtypes_list = ["struct"] + dtypes_meta, num_rows, num_cols = _generate_rand_meta( self, dtypes_list ) diff --git a/python/cudf/cudf/_fuzz_testing/parquet.py b/python/cudf/cudf/_fuzz_testing/parquet.py index ecad18acc81..859d09b407f 100644 --- a/python/cudf/cudf/_fuzz_testing/parquet.py +++ b/python/cudf/cudf/_fuzz_testing/parquet.py @@ -59,7 +59,7 @@ def generate_input(self): - {"uint32"} | {"list", "decimal64"} ) - dtypes_list = ["struct"] + dtypes_meta, num_rows, num_cols = _generate_rand_meta( self, dtypes_list ) @@ -81,9 +81,7 @@ def generate_input(self): # https://issues.apache.org/jira/browse/ARROW-10123 # file = io.BytesIO() - import pdb - pdb.set_trace() df.to_parquet("temp_file") # file.seek(0) # self._current_buffer = copy.copy(file.read()) diff --git a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py index ad2943512fb..9089a40b89e 100644 --- a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py +++ b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py @@ -44,13 +44,8 @@ def orc_reader_test(input_tuple, columns, skiprows, num_rows, use_index): num_rows=num_rows, use_index=use_index, ) - try: - compare_dataframe(expected_pdf, gdf) - except AssertionError: - import pdb - pdb.set_trace() - print("abc") + compare_dataframe(expected_pdf, gdf) @pythonfuzz( diff --git a/python/cudf/cudf/_fuzz_testing/utils.py b/python/cudf/cudf/_fuzz_testing/utils.py index 5480781d9c4..87a8fc46374 100644 --- a/python/cudf/cudf/_fuzz_testing/utils.py +++ b/python/cudf/cudf/_fuzz_testing/utils.py @@ -213,7 +213,6 @@ def get_orc_dtype_info(dtype): if dtype in PANDAS_TO_ORC_TYPES: return PANDAS_TO_ORC_TYPES[dtype] else: - # import pdb;pdb.set_trace() raise TypeError( f"Unsupported dtype({dtype}) according to orc spec:" f" https://orc.apache.org/specification/" @@ -303,7 +302,7 @@ def _null_to_None(value): def sanitize(value, struct_type): if value is None: return None - # import pdb;pdb.set_trace() + values_list = [] for name, sub_type in struct_type.fields.items(): if isinstance(sub_type, cudf.StructDtype): @@ -324,7 +323,6 @@ def sanitize(value, struct_type): pdf = df.copy(deep=True) for field in arrow_table_schema: if isinstance(field.type, pa.StructType): - # import pdb;pdb.set_trace() pdf[field.name] = pdf[field.name].apply( sanitize, args=(cudf.StructDtype.from_arrow(field.type),) ) @@ -346,13 +344,12 @@ def pandas_to_orc( stripe_size=67108864, arrow_table_schema=None, ): - # import pdb;pdb.set_trace() schema = get_orc_schema(df, arrow_table_schema=arrow_table_schema) tuple_list, pdf, df = _preprocess_to_orc_tuple( df, arrow_table_schema=arrow_table_schema ) - # import pdb;pdb.set_trace() + if file_name is not None: with open(file_name, "wb") as data: with pyorc.Writer(data, schema, stripe_size=stripe_size) as writer: diff --git a/python/cudf/cudf/testing/dataset_generator.py b/python/cudf/cudf/testing/dataset_generator.py index 2aa92322f19..682c4429c7d 100644 --- a/python/cudf/cudf/testing/dataset_generator.py +++ b/python/cudf/cudf/testing/dataset_generator.py @@ -551,7 +551,7 @@ def rand_dataframe( df = get_dataframe( Parameters(num_rows=rows, column_parameters=column_params, seed=seed,), - use_threads=False, + use_threads=use_threads, ) return df diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index 2c297476e91..44812f5aba4 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -945,7 +945,7 @@ def generate_list_struct_buff(size=100_000): "struct_nests_list": struct_nests_list, } ) - # import pdb;pdb.set_trace() + writer = po.Writer(buff, schema, stripe_size=1024) tuples = list( map( From a65d4be131e2cf0f83626579847373216a99effe Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 19 Jan 2022 09:18:31 -0800 Subject: [PATCH 7/7] cleanup --- python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py | 2 +- python/cudf/cudf/testing/dataset_generator.py | 10 +--------- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py index 9089a40b89e..b3fd7e8c5a7 100644 --- a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py +++ b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py @@ -18,7 +18,7 @@ data_handle=OrcReader, params={ "columns": ALL_POSSIBLE_VALUES, - "skiprows": [None], + "skiprows": ALL_POSSIBLE_VALUES, "num_rows": ALL_POSSIBLE_VALUES, "use_index": ALL_POSSIBLE_VALUES, }, diff --git a/python/cudf/cudf/testing/dataset_generator.py b/python/cudf/cudf/testing/dataset_generator.py index 682c4429c7d..e1c7b42c7a3 100644 --- a/python/cudf/cudf/testing/dataset_generator.py +++ b/python/cudf/cudf/testing/dataset_generator.py @@ -805,15 +805,7 @@ def struct_generator(dtype, cardinality, size, max_null_frequency): def create_nested_struct_type(max_types_at_each_level, nesting_level): - dtypes_list = cudf.utils.dtypes.ALL_TYPES - { - "category", - "datetime64[ns]", - "str", - } - cudf.utils.dtypes.TIMEDELTA_TYPES - { - "uint32" - } - cudf.utils.dtypes.UNSIGNED_TYPES | { - "struct" - } + dtypes_list = cudf.utils.dtypes.ALL_TYPES picked_types = np.random.choice(list(dtypes_list), max_types_at_each_level) type_dict = {} for name, type_ in enumerate(picked_types):