From 604c16d4b5e6c8e37389d83a2304db2ea0f3df9d Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Mon, 24 Jun 2024 16:57:29 +0000 Subject: [PATCH] address more comments --- python/cudf/cudf/_lib/pylibcudf/io/types.pyx | 15 +++-- .../cudf/cudf/pylibcudf_tests/common/utils.py | 66 ++++++++++--------- .../cudf/cudf/pylibcudf_tests/test_copying.py | 61 ++++++----------- 3 files changed, 64 insertions(+), 78 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/io/types.pyx b/python/cudf/cudf/_lib/pylibcudf/io/types.pyx index 6ddaba4ceae..f68628d244f 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/types.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/io/types.pyx @@ -178,9 +178,7 @@ cdef class SinkInfo: Parameters ---------- - sinks : List[Union[str, os.PathLike, - io.BytesIO, io.IOBase, - io.StringIO, io.TextIOBase]] + sinks : list of str, PathLike, BytesIO, StringIO A homogeneous list of sinks (this can be a string filename, bytes, or one of the Python I/O classes) to read from. @@ -231,10 +229,17 @@ cdef class SinkInfo: ) data_sinks.push_back(self.sink_storage.back().get()) self.c_obj = sink_info(data_sinks) - elif isinstance(sinks[0], (basestring, os.PathLike)): + elif isinstance(sinks[0], str): paths.reserve(len(sinks)) for s in sinks: - if not isinstance(s, (basestring, os.PathLike)): + if not isinstance(s, str): + raise ValueError("All sinks must be of the same type!") + paths.push_back( s.encode()) + self.c_obj = sink_info(move(paths)) + elif isinstance(sinks[0], os.PathLike): + paths.reserve(len(sinks)) + for s in sinks: + if not isinstance(s, os.PathLike): raise ValueError("All sinks must be of the same type!") paths.push_back( os.path.expanduser(s).encode()) self.c_obj = sink_info(move(paths)) diff --git a/python/cudf/cudf/pylibcudf_tests/common/utils.py b/python/cudf/cudf/pylibcudf_tests/common/utils.py index adc2d1fb8f8..d8e8a398b28 100644 --- a/python/cudf/cudf/pylibcudf_tests/common/utils.py +++ b/python/cudf/cudf/pylibcudf_tests/common/utils.py @@ -10,18 +10,22 @@ from cudf._lib import pylibcudf as plc -def metadata_from_arrow_array( - pa_array: pa.Array, +def metadata_from_arrow_type( + pa_type: pa.Array, + name: str = "", ) -> plc.interop.ColumnMetadata | None: - metadata = None - if pa.types.is_list(dtype := pa_array.type) or pa.types.is_struct(dtype): + metadata = plc.interop.ColumnMetadata(name) # None + if pa.types.is_list(pa_type) or pa.types.is_struct(pa_type): + child_meta = [] + for i in range(pa_type.num_fields): + field_meta = metadata_from_arrow_type( + pa_type.field(i).type, pa_type.field(i).name + ) + child_meta.append(field_meta) metadata = plc.interop.ColumnMetadata( - "", + name, # libcudf does not store field names, so just match pyarrow's. - [ - plc.interop.ColumnMetadata(pa_array.type.field(i).name) - for i in range(pa_array.type.num_fields) - ], + child_meta, ) return metadata @@ -35,13 +39,13 @@ def assert_column_eq( rhs, plc.Column ): rhs = plc.interop.to_arrow( - rhs, metadata=metadata_from_arrow_array(lhs) + rhs, metadata=metadata_from_arrow_type(lhs.type) ) elif isinstance(lhs, plc.Column) and isinstance( rhs, (pa.Array, pa.ChunkedArray) ): lhs = plc.interop.to_arrow( - lhs, metadata=metadata_from_arrow_array(rhs) + lhs, metadata=metadata_from_arrow_type(rhs.type) ) else: raise ValueError( @@ -97,21 +101,16 @@ def is_signed_integer(plc_dtype: plc.DataType): ) -def is_unsigned_integer(plc_dtype: plc.DataType): - return plc_dtype.id() in ( - plc.TypeId.UINT8, - plc.TypeId.UINT16, - plc.TypeId.UINT32, - plc.TypeId.UINT64, - ) - - def is_integer(plc_dtype: plc.DataType): return plc_dtype.id() in ( plc.TypeId.INT8, plc.TypeId.INT16, plc.TypeId.INT32, plc.TypeId.INT64, + plc.TypeId.UINT8, + plc.TypeId.UINT16, + plc.TypeId.UINT32, + plc.TypeId.UINT64, ) @@ -133,24 +132,29 @@ def is_string(plc_dtype: plc.DataType): def is_fixed_width(plc_dtype: plc.DataType): return ( is_integer(plc_dtype) - or is_unsigned_integer(plc_dtype) or is_floating(plc_dtype) or is_boolean(plc_dtype) ) -def is_nested_struct(pa_type: pa.DataType): - if isinstance(pa_type, pa.StructType): - for i in range(pa_type.num_fields): - if isinstance(pa_type[i].type, pa.StructType): - return True - return False +def nesting(typ) -> tuple[int, int]: + """Return list and struct nesting of a pyarrow type.""" + if isinstance(typ, pa.ListType): + list_, struct = nesting(typ.value_type) + return list_ + 1, struct + elif isinstance(typ, pa.StructType): + lists, structs = map(max, zip(*(nesting(t.type) for t in typ))) + return lists, structs + 1 + else: + return 0, 0 + + +def is_nested_struct(typ): + return nesting(typ)[1] > 1 -def is_nested_list(pa_type: pa.DataType): - if isinstance(pa_type, pa.ListType): - return isinstance(pa_type.value_type, pa.ListType) - return False +def is_nested_list(typ): + return nesting(typ)[0] > 1 def sink_to_str(sink): diff --git a/python/cudf/cudf/pylibcudf_tests/test_copying.py b/python/cudf/cudf/pylibcudf_tests/test_copying.py index fbe757c1d52..741d6515c47 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_copying.py +++ b/python/cudf/cudf/pylibcudf_tests/test_copying.py @@ -14,7 +14,7 @@ is_nested_list, is_nested_struct, is_string, - metadata_from_arrow_array, + metadata_from_arrow_type, ) from cudf._lib import pylibcudf as plc @@ -32,7 +32,7 @@ def nested_struct_xfail(request): request.applymarker( pytest.mark.xfail( condition=any( - [is_nested_struct(col.type) for col in pa_table.columns] + is_nested_struct(col.type) for col in pa_table.columns ), reason="pylibcudf interop fails with nested struct", ) @@ -42,9 +42,9 @@ def nested_struct_xfail(request): or "input_column" in request.fixturenames ): # Return value is tuple of (engine, precision) - try: + if "target_column" in request.fixturenames: pa_col, _ = request.getfixturevalue("target_column") - except pytest.FixtureLookupError: + else: pa_col, _ = request.getfixturevalue("input_column") request.applymarker( pytest.mark.xfail( @@ -63,7 +63,14 @@ def nested_list_skip(request): """ if "target_table" in request.fixturenames: pa_table, _ = request.getfixturevalue("target_table") - if any([is_nested_list(col.type) for col in pa_table.columns]): + if any(is_nested_list(col.type) for col in pa_table.columns): + pytest.skip(reason="pylibcudf interop fails with nested list") + elif "target_column" or "input_column" in request.fixturenames: + if "target_column" in request.fixturenames: + pa_col, _ = request.getfixturevalue("target_column") + else: + pa_col, _ = request.getfixturevalue("input_column") + if is_nested_list(pa_col.type): pytest.skip(reason="pylibcudf interop fails with nested list") @@ -221,7 +228,6 @@ def mask(target_column): @skip_nested_list -@xfail_nested_struct def test_gather(target_table, index_column): pa_target_table, plc_target_table = target_table pa_index_column, plc_index_column = index_column @@ -402,7 +408,6 @@ def test_scatter_table_type_mismatch(source_table, index_column, target_table): @skip_nested_list -@xfail_nested_struct def test_scatter_scalars( source_scalar, index_column, @@ -680,16 +685,12 @@ def test_shift_type_mismatch(target_column): plc.copying.shift(plc_target_column, 2, fill_value) -@xfail_nested_struct +@skip_nested_list def test_slice_column(target_column): pa_target_column, plc_target_column = target_column bounds = list(range(6)) upper_bounds = bounds[1::2] lower_bounds = bounds[::2] - if is_nested_list(pa_target_column.type): - pytest.skip( - reason="pylibcudf interop fails with memoryerror/segfault", - ) result = plc.copying.slice(plc_target_column, bounds) for lb, ub, slice_ in zip(lower_bounds, upper_bounds, result): assert_column_eq(pa_target_column[lb:ub], slice_) @@ -714,7 +715,6 @@ def test_slice_column_out_of_bounds(target_column): @skip_nested_list -@xfail_nested_struct def test_slice_table(target_table): pa_target_table, plc_target_table = target_table bounds = list(range(6)) @@ -725,15 +725,11 @@ def test_slice_table(target_table): assert_table_eq(pa_target_table[lb:ub], slice_) -@xfail_nested_struct +@skip_nested_list def test_split_column(target_column): upper_bounds = [1, 3, 5] lower_bounds = [0] + upper_bounds[:-1] pa_target_column, plc_target_column = target_column - if is_nested_list(pa_target_column.type): - pytest.skip( - reason="pylibcudf interop fails with memoryerror/segfault", - ) result = plc.copying.split(plc_target_column, upper_bounds) for lb, ub, split in zip(lower_bounds, upper_bounds, result): assert_column_eq(pa_target_column[lb:ub], split) @@ -752,7 +748,6 @@ def test_split_column_out_of_bounds(target_column): @skip_nested_list -@xfail_nested_struct def test_split_table(target_table): pa_target_table, plc_target_table = target_table @@ -763,7 +758,7 @@ def test_split_table(target_table): assert_table_eq(pa_target_table[lb:ub], split) -@xfail_nested_struct +@skip_nested_list def test_copy_if_else_column_column(target_column, mask, source_scalar): pa_target_column, plc_target_column = target_column pa_source_scalar, _ = source_scalar @@ -774,11 +769,6 @@ def test_copy_if_else_column_column(target_column, mask, source_scalar): ) plc_other_column = plc.interop.from_arrow(pa_other_column) - if is_nested_list(pa_target_column.type): - pytest.skip( - reason="pylibcudf interop fails with memoryerror/segfault", - ) - result = plc.copying.copy_if_else( plc_target_column, plc_other_column, @@ -843,7 +833,7 @@ def test_copy_if_else_wrong_size_mask(target_column): ) -@xfail_nested_struct +@skip_nested_list @pytest.mark.parametrize("array_left", [True, False]) def test_copy_if_else_column_scalar( target_column, @@ -855,11 +845,6 @@ def test_copy_if_else_column_scalar( pa_source_scalar, plc_source_scalar = source_scalar pa_mask, plc_mask = mask - if is_nested_list(pa_target_column.type): - pytest.skip( - reason="pylibcudf interop fails with memoryerror/segfault", - ) - args = ( (plc_target_column, plc_source_scalar) if array_left @@ -1001,7 +986,7 @@ def test_boolean_mask_scatter_from_wrong_mask_type(source_table, target_table): ) -@xfail_nested_struct +@skip_nested_list def test_boolean_mask_scatter_from_scalars( source_scalar, target_table, @@ -1010,10 +995,6 @@ def test_boolean_mask_scatter_from_scalars( pa_source_scalar, plc_source_scalar = source_scalar pa_target_table, plc_target_table = target_table pa_mask, plc_mask = mask - if any([is_nested_list(col.type) for col in pa_target_table.columns]): - pytest.skip( - reason="pylibcudf interop fails with memoryerror/segfault", - ) result = plc.copying.boolean_mask_scatter( [plc_source_scalar] * 3, plc_target_table, @@ -1029,19 +1010,15 @@ def test_boolean_mask_scatter_from_scalars( assert_table_eq(expected, result) -@xfail_nested_struct +@skip_nested_list def test_get_element(input_column): index = 1 pa_input_column, plc_input_column = input_column - if is_nested_list(pa_input_column.type): - pytest.skip( - reason="pylibcudf interop fails with memoryerror/segfault", - ) result = plc.copying.get_element(plc_input_column, index) assert ( plc.interop.to_arrow( - result, metadata_from_arrow_array(pa_input_column) + result, metadata_from_arrow_type(pa_input_column.type) ).as_py() == pa_input_column[index].as_py() )