diff --git a/python/cudf/cudf/pylibcudf_tests/test_copying.py b/python/cudf/cudf/pylibcudf_tests/test_copying.py index cd70ce4abf5..da3ca3a6d1e 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_copying.py +++ b/python/cudf/cudf/pylibcudf_tests/test_copying.py @@ -20,121 +20,104 @@ # TODO: Test nullable data @pytest.fixture(scope="module") -def pa_input_column(pa_type): +def input_column(pa_type): if pa.types.is_integer(pa_type) or pa.types.is_floating(pa_type): - return pa.array([1, 2, 3], type=pa_type) + pa_array = pa.array([1, 2, 3], type=pa_type) elif pa.types.is_string(pa_type): - return pa.array(["a", "b", "c"], type=pa_type) + pa_array = pa.array(["a", "b", "c"], type=pa_type) elif pa.types.is_boolean(pa_type): - return pa.array([True, True, False], type=pa_type) + pa_array = pa.array([True, True, False], type=pa_type) elif pa.types.is_list(pa_type): # TODO: Add heterogenous sizes - return pa.array([[1], [2], [3]], type=pa_type) + pa_array = pa.array([[1], [2], [3]], type=pa_type) elif pa.types.is_struct(pa_type): - return pa.array([{"v": 1}, {"v": 2}, {"v": 3}], type=pa_type) - raise ValueError("Unsupported type") - - -@pytest.fixture(scope="module") -def input_column(pa_input_column): - return plc.interop.from_arrow(pa_input_column) + pa_array = pa.array([{"v": 1}, {"v": 2}, {"v": 3}], type=pa_type) + else: + raise ValueError("Unsupported type") + return pa_array, plc.interop.from_arrow(pa_array) @pytest.fixture(scope="module") -def pa_index_column(): +def index_column(): # Index column for testing gather/scatter, always integral. 
- return pa.array([1, 2, 3]) - - -@pytest.fixture(scope="module") -def index_column(pa_index_column): - return plc.interop.from_arrow(pa_index_column) + pa_array = pa.array([1, 2, 3]) + return pa_array, plc.interop.from_arrow(pa_array) @pytest.fixture(scope="module") -def pa_target_column(pa_type): +def target_column(pa_type): if pa.types.is_integer(pa_type) or pa.types.is_floating(pa_type): - return pa.array([4, 5, 6, 7, 8, 9], type=pa_type) + pa_array = pa.array([4, 5, 6, 7, 8, 9], type=pa_type) elif pa.types.is_string(pa_type): - return pa.array(["d", "e", "f", "g", "h", "i"], type=pa_type) + pa_array = pa.array(["d", "e", "f", "g", "h", "i"], type=pa_type) elif pa.types.is_boolean(pa_type): - return pa.array([False, True, True, False, True, False], type=pa_type) + pa_array = pa.array( + [False, True, True, False, True, False], type=pa_type + ) elif pa.types.is_list(pa_type): # TODO: Add heterogenous sizes - return pa.array([[4], [5], [6], [7], [8], [9]], type=pa_type) + pa_array = pa.array([[4], [5], [6], [7], [8], [9]], type=pa_type) elif pa.types.is_struct(pa_type): - return pa.array( + pa_array = pa.array( [{"v": 4}, {"v": 5}, {"v": 6}, {"v": 7}, {"v": 8}, {"v": 9}], type=pa_type, ) - raise ValueError("Unsupported type") - - -@pytest.fixture(scope="module") -def target_column(pa_target_column): - return plc.interop.from_arrow(pa_target_column) + else: + raise ValueError("Unsupported type") + return pa_array, plc.interop.from_arrow(pa_array) @pytest.fixture def mutable_target_column(target_column): - return target_column.copy() + _, plc_target_column = target_column + return plc_target_column.copy() @pytest.fixture(scope="module") -def pa_source_table(pa_input_column): - return pa.table([pa_input_column] * 3, [""] * 3) +def source_table(input_column): + pa_input_column, _ = input_column + pa_table = pa.table([pa_input_column] * 3, [""] * 3) + return pa_table, plc.interop.from_arrow(pa_table) @pytest.fixture(scope="module") -def source_table(pa_source_table): 
- return plc.interop.from_arrow(pa_source_table) +def target_table(target_column): + pa_target_column, _ = target_column + pa_table = pa.table([pa_target_column] * 3, [""] * 3) + return pa_table, plc.interop.from_arrow(pa_table) @pytest.fixture(scope="module") -def pa_target_table(pa_target_column): - return pa.table([pa_target_column] * 3, [""] * 3) - - -@pytest.fixture(scope="module") -def target_table(pa_target_table): - return plc.interop.from_arrow(pa_target_table) - - -@pytest.fixture(scope="module") -def pa_source_scalar(pa_type): +def source_scalar(pa_type): if pa.types.is_integer(pa_type) or pa.types.is_floating(pa_type): - return pa.scalar(1, type=pa_type) + pa_scalar = pa.scalar(1, type=pa_type) elif pa.types.is_string(pa_type): - return pa.scalar("a", type=pa_type) + pa_scalar = pa.scalar("a", type=pa_type) elif pa.types.is_boolean(pa_type): - return pa.scalar(False, type=pa_type) + pa_scalar = pa.scalar(False, type=pa_type) elif pa.types.is_list(pa_type): # TODO: Longer list? 
- return pa.scalar([1], type=pa_type) + pa_scalar = pa.scalar([1], type=pa_type) elif pa.types.is_struct(pa_type): - return pa.scalar({"v": 1}, type=pa_type) - raise ValueError("Unsupported type") - - -@pytest.fixture(scope="module") -def source_scalar(pa_source_scalar): - return plc.interop.from_arrow(pa_source_scalar) - - -@pytest.fixture(scope="module") -def pa_mask(pa_target_column): - return pa.array([True, False] * (len(pa_target_column) // 2)) + pa_scalar = pa.scalar({"v": 1}, type=pa_type) + else: + raise ValueError("Unsupported type") + return pa_scalar, plc.interop.from_arrow(pa_scalar) @pytest.fixture(scope="module") -def mask(pa_mask): - return plc.interop.from_arrow(pa_mask) +def mask(target_column): + pa_target_column, _ = target_column + pa_mask = pa.array([True, False] * (len(pa_target_column) // 2)) + return pa_mask, plc.interop.from_arrow(pa_mask) -def test_gather(target_table, pa_target_table, index_column, pa_index_column): +def test_gather(target_table, index_column): + pa_target_table, plc_target_table = target_table + pa_index_column, plc_index_column = index_column result = plc.copying.gather( - target_table, - index_column, + plc_target_table, + plc_index_column, plc.copying.OutOfBoundsPolicy.DONT_CHECK, ) expected = pa_target_table.take(pa_index_column) @@ -142,10 +125,11 @@ def test_gather(target_table, pa_target_table, index_column, pa_index_column): def test_gather_map_has_nulls(target_table): + _, plc_target_table = target_table gather_map = plc.interop.from_arrow(pa.array([0, 1, None])) with cudf_raises(ValueError): plc.copying.gather( - target_table, + plc_target_table, gather_map, plc.copying.OutOfBoundsPolicy.DONT_CHECK, ) @@ -185,16 +169,16 @@ def _pyarrow_boolean_mask_scatter_table(source, mask, target_table): def test_scatter_table( source_table, - pa_source_table, index_column, - pa_index_column, target_table, - pa_target_table, ): + pa_source_table, plc_source_table = source_table + pa_index_column, plc_index_column = 
index_column + pa_target_table, plc_target_table = target_table result = plc.copying.scatter( - source_table, - index_column, - target_table, + plc_source_table, + plc_index_column, + plc_target_table, ) if pa.types.is_list( @@ -247,68 +231,80 @@ def test_scatter_table_num_col_mismatch( source_table, index_column, target_table ): # Number of columns in source and target must match. + _, plc_source_table = source_table + _, plc_index_column = index_column + _, plc_target_table = target_table with cudf_raises(ValueError): plc.copying.scatter( - plc.Table(source_table.columns()[:2]), - index_column, - target_table, + plc.Table(plc_source_table.columns()[:2]), + plc_index_column, + plc_target_table, ) def test_scatter_table_num_row_mismatch(source_table, target_table): # Number of rows in source and scatter map must match. + _, plc_source_table = source_table + _, plc_target_table = target_table with cudf_raises(ValueError): plc.copying.scatter( - source_table, + plc_source_table, plc.interop.from_arrow( - pa.array(range(source_table.num_rows() * 2)) + pa.array(range(plc_source_table.num_rows() * 2)) ), - target_table, + plc_target_table, ) def test_scatter_table_map_has_nulls(source_table, target_table): + _, plc_source_table = source_table + _, plc_target_table = target_table with cudf_raises(ValueError): plc.copying.scatter( - source_table, - plc.interop.from_arrow(pa.array([None] * source_table.num_rows())), - target_table, + plc_source_table, + plc.interop.from_arrow( + pa.array([None] * plc_source_table.num_rows()) + ), + plc_target_table, ) def test_scatter_table_type_mismatch(source_table, index_column, target_table): + _, plc_source_table = source_table + _, plc_index_column = index_column + _, plc_target_table = target_table with cudf_raises(TypeError): if is_integer( - dtype := target_table.columns()[0].type() + dtype := plc_target_table.columns()[0].type() ) or is_floating(dtype): - pa_array = pa.array([True] * source_table.num_rows()) + pa_array = 
pa.array([True] * plc_source_table.num_rows()) else: - pa_array = pa.array([1] * source_table.num_rows()) - ncol = source_table.num_columns() + pa_array = pa.array([1] * plc_source_table.num_rows()) + ncol = plc_source_table.num_columns() pa_table = pa.table([pa_array] * ncol, [""] * ncol) plc.copying.scatter( plc.interop.from_arrow(pa_table), - index_column, - target_table, + plc_index_column, + plc_target_table, ) def test_scatter_scalars( source_scalar, - pa_source_scalar, index_column, - pa_index_column, target_table, - pa_target_table, ): + pa_source_scalar, plc_source_scalar = source_scalar + pa_index_column, plc_index_column = index_column + pa_target_table, plc_target_table = target_table result = plc.copying.scatter( - [source_scalar] * target_table.num_columns(), - index_column, - target_table, + [plc_source_scalar] * plc_target_table.num_columns(), + plc_index_column, + plc_target_table, ) expected = _pyarrow_boolean_mask_scatter_table( - [pa_source_scalar] * target_table.num_columns(), + [pa_source_scalar] * plc_target_table.num_columns(), pc.invert( _pyarrow_index_to_mask(pa_index_column, pa_target_table.num_rows) ), @@ -321,85 +317,103 @@ def test_scatter_scalars( def test_scatter_scalars_num_scalars_mismatch( source_scalar, index_column, target_table ): + _, plc_source_scalar = source_scalar + _, plc_index_column = index_column + _, plc_target_table = target_table with cudf_raises(ValueError): plc.copying.scatter( - [source_scalar] * (target_table.num_columns() - 1), - index_column, - target_table, + [plc_source_scalar] * (plc_target_table.num_columns() - 1), + plc_index_column, + plc_target_table, ) def test_scatter_scalars_map_has_nulls(source_scalar, target_table): + _, plc_source_scalar = source_scalar + _, plc_target_table = target_table with cudf_raises(ValueError): plc.copying.scatter( - [source_scalar] * target_table.num_columns(), + [plc_source_scalar] * plc_target_table.num_columns(), plc.interop.from_arrow(pa.array([None, None])), - 
target_table, + plc_target_table, ) def test_scatter_scalars_type_mismatch(index_column, target_table): + _, plc_index_column = index_column + _, plc_target_table = target_table with cudf_raises(TypeError): if is_integer( - dtype := target_table.columns()[0].type() + dtype := plc_target_table.columns()[0].type() ) or is_floating(dtype): - source_scalar = [plc.interop.from_arrow(pa.scalar(True))] + plc_source_scalar = [plc.interop.from_arrow(pa.scalar(True))] else: - source_scalar = [plc.interop.from_arrow(pa.scalar(1))] + plc_source_scalar = [plc.interop.from_arrow(pa.scalar(1))] plc.copying.scatter( - source_scalar * target_table.num_columns(), - index_column, - target_table, + plc_source_scalar * plc_target_table.num_columns(), + plc_index_column, + plc_target_table, ) def test_empty_like_column(input_column): - result = plc.copying.empty_like(input_column) - assert result.type() == input_column.type() + _, plc_input_column = input_column + result = plc.copying.empty_like(plc_input_column) + assert result.type() == plc_input_column.type() def test_empty_like_table(source_table): - result = plc.copying.empty_like(source_table) - assert result.num_columns() == source_table.num_columns() - for icol, rcol in zip(source_table.columns(), result.columns()): + _, plc_source_table = source_table + result = plc.copying.empty_like(plc_source_table) + assert result.num_columns() == plc_source_table.num_columns() + for icol, rcol in zip(plc_source_table.columns(), result.columns()): assert rcol.type() == icol.type() @pytest.mark.parametrize("size", [None, 10]) def test_allocate_like(input_column, size): - if is_fixed_width(input_column.type()): + _, plc_input_column = input_column + if is_fixed_width(plc_input_column.type()): result = plc.copying.allocate_like( - input_column, plc.copying.MaskAllocationPolicy.RETAIN, size=size + plc_input_column, + plc.copying.MaskAllocationPolicy.RETAIN, + size=size, + ) + assert result.type() == plc_input_column.type() + assert 
result.size() == ( + plc_input_column.size() if size is None else size ) - assert result.type() == input_column.type() - assert result.size() == (input_column.size() if size is None else size) else: with pytest.raises(TypeError): plc.copying.allocate_like( - input_column, + plc_input_column, plc.copying.MaskAllocationPolicy.RETAIN, size=size, ) def test_copy_range_in_place( - input_column, pa_input_column, mutable_target_column, pa_target_column + input_column, mutable_target_column, target_column ): + pa_input_column, plc_input_column = input_column + + pa_target_column, _ = target_column + if not is_fixed_width(mutable_target_column.type()): with pytest.raises(TypeError): plc.copying.copy_range_in_place( - input_column, + plc_input_column, mutable_target_column, 0, - input_column.size(), + plc_input_column.size(), 0, ) else: plc.copying.copy_range_in_place( - input_column, + plc_input_column, mutable_target_column, 0, - input_column.size(), + plc_input_column.size(), 0, ) expected = _pyarrow_boolean_mask_scatter_column( @@ -415,36 +429,40 @@ def test_copy_range_in_place( def test_copy_range_in_place_out_of_bounds( input_column, mutable_target_column ): + _, plc_input_column = input_column + if is_fixed_width(mutable_target_column.type()): with cudf_raises(IndexError): plc.copying.copy_range_in_place( - input_column, + plc_input_column, mutable_target_column, 5, - 5 + input_column.size(), + 5 + plc_input_column.size(), 0, ) def test_copy_range_in_place_different_types(mutable_target_column): if is_integer(dtype := mutable_target_column.type()) or is_floating(dtype): - input_column = plc.interop.from_arrow(pa.array(["a", "b", "c"])) + plc_input_column = plc.interop.from_arrow(pa.array(["a", "b", "c"])) else: - input_column = plc.interop.from_arrow(pa.array([1, 2, 3])) + plc_input_column = plc.interop.from_arrow(pa.array([1, 2, 3])) with cudf_raises(TypeError): plc.copying.copy_range_in_place( - input_column, + plc_input_column, mutable_target_column, 0, - 
input_column.size(), + plc_input_column.size(), 0, ) def test_copy_range_in_place_null_mismatch( - pa_input_column, mutable_target_column + input_column, mutable_target_column ): + pa_input_column, _ = input_column + if is_fixed_width(mutable_target_column.type()): pa_input_column = pc.if_else( _pyarrow_index_to_mask([0], len(pa_input_column)), @@ -462,15 +480,15 @@ def test_copy_range_in_place_null_mismatch( ) -def test_copy_range( - input_column, pa_input_column, target_column, pa_target_column -): - if is_fixed_width(dtype := target_column.type()) or is_string(dtype): +def test_copy_range(input_column, target_column): + pa_input_column, plc_input_column = input_column + pa_target_column, plc_target_column = target_column + if is_fixed_width(dtype := plc_target_column.type()) or is_string(dtype): result = plc.copying.copy_range( - input_column, - target_column, + plc_input_column, + plc_target_column, 0, - input_column.size(), + plc_input_column.size(), 0, ) expected = _pyarrow_boolean_mask_scatter_column( @@ -484,137 +502,152 @@ def test_copy_range( else: with pytest.raises(TypeError): plc.copying.copy_range( - input_column, - target_column, + plc_input_column, + plc_target_column, 0, - input_column.size(), + plc_input_column.size(), 0, ) def test_copy_range_out_of_bounds(input_column, target_column): + _, plc_input_column = input_column + _, plc_target_column = target_column with cudf_raises(IndexError): plc.copying.copy_range( - input_column, - target_column, + plc_input_column, + plc_target_column, 5, - 5 + input_column.size(), + 5 + plc_input_column.size(), 0, ) def test_copy_range_different_types(target_column): - if is_integer(dtype := target_column.type()) or is_floating(dtype): - input_column = plc.interop.from_arrow(pa.array(["a", "b", "c"])) + _, plc_target_column = target_column + if is_integer(dtype := plc_target_column.type()) or is_floating(dtype): + plc_input_column = plc.interop.from_arrow(pa.array(["a", "b", "c"])) else: - input_column = 
plc.interop.from_arrow(pa.array([1, 2, 3])) + plc_input_column = plc.interop.from_arrow(pa.array([1, 2, 3])) with cudf_raises(TypeError): plc.copying.copy_range( - input_column, - target_column, + plc_input_column, + plc_target_column, 0, - input_column.size(), + plc_input_column.size(), 0, ) -def test_shift( - target_column, pa_target_column, source_scalar, pa_source_scalar -): +def test_shift(target_column, source_scalar): + pa_source_scalar, plc_source_scalar = source_scalar + pa_target_column, plc_target_column = target_column shift = 2 - if is_fixed_width(dtype := target_column.type()) or is_string(dtype): - result = plc.copying.shift(target_column, shift, source_scalar) + if is_fixed_width(dtype := plc_target_column.type()) or is_string(dtype): + result = plc.copying.shift(plc_target_column, shift, plc_source_scalar) expected = pa.concat_arrays( [pa.array([pa_source_scalar] * shift), pa_target_column[:-shift]] ) assert_column_eq(expected, result) else: with pytest.raises(TypeError): - plc.copying.shift(target_column, shift, source_scalar) + plc.copying.shift(plc_target_column, shift, plc_source_scalar) def test_shift_type_mismatch(target_column): - if is_integer(dtype := target_column.type()) or is_floating(dtype): + _, plc_target_column = target_column + if is_integer(dtype := plc_target_column.type()) or is_floating(dtype): fill_value = plc.interop.from_arrow(pa.scalar("a")) else: fill_value = plc.interop.from_arrow(pa.scalar(1)) with cudf_raises(TypeError): - plc.copying.shift(target_column, 2, fill_value) + plc.copying.shift(plc_target_column, 2, fill_value) -def test_slice_column(target_column, pa_target_column): +def test_slice_column(target_column): + pa_target_column, plc_target_column = target_column bounds = list(range(6)) upper_bounds = bounds[1::2] lower_bounds = bounds[::2] - result = plc.copying.slice(target_column, bounds) + result = plc.copying.slice(plc_target_column, bounds) for lb, ub, slice_ in zip(lower_bounds, upper_bounds, result): 
assert_column_eq(pa_target_column[lb:ub], slice_) def test_slice_column_wrong_length(target_column): + _, plc_target_column = target_column with cudf_raises(ValueError): - plc.copying.slice(target_column, list(range(5))) + plc.copying.slice(plc_target_column, list(range(5))) def test_slice_column_decreasing(target_column): + _, plc_target_column = target_column with cudf_raises(ValueError): - plc.copying.slice(target_column, list(range(5, -1, -1))) + plc.copying.slice(plc_target_column, list(range(5, -1, -1))) def test_slice_column_out_of_bounds(target_column): + _, plc_target_column = target_column with cudf_raises(IndexError): - plc.copying.slice(target_column, list(range(2, 8))) + plc.copying.slice(plc_target_column, list(range(2, 8))) -def test_slice_table(target_table, pa_target_table): +def test_slice_table(target_table): + pa_target_table, plc_target_table = target_table bounds = list(range(6)) upper_bounds = bounds[1::2] lower_bounds = bounds[::2] - result = plc.copying.slice(target_table, bounds) + result = plc.copying.slice(plc_target_table, bounds) for lb, ub, slice_ in zip(lower_bounds, upper_bounds, result): assert_table_eq(pa_target_table[lb:ub], slice_) -def test_split_column(target_column, pa_target_column): +def test_split_column(target_column): upper_bounds = [1, 3, 5] lower_bounds = [0] + upper_bounds[:-1] - result = plc.copying.split(target_column, upper_bounds) + pa_target_column, plc_target_column = target_column + result = plc.copying.split(plc_target_column, upper_bounds) for lb, ub, split in zip(lower_bounds, upper_bounds, result): assert_column_eq(pa_target_column[lb:ub], split) def test_split_column_decreasing(target_column): + _, plc_target_column = target_column with cudf_raises(ValueError): - plc.copying.split(target_column, list(range(5, -1, -1))) + plc.copying.split(plc_target_column, list(range(5, -1, -1))) def test_split_column_out_of_bounds(target_column): + _, plc_target_column = target_column with cudf_raises(IndexError): - 
plc.copying.split(target_column, list(range(5, 8))) + plc.copying.split(plc_target_column, list(range(5, 8))) -def test_split_table(target_table, pa_target_table): +def test_split_table(target_table): + pa_target_table, plc_target_table = target_table upper_bounds = [1, 3, 5] lower_bounds = [0] + upper_bounds[:-1] - result = plc.copying.split(target_table, upper_bounds) + result = plc.copying.split(plc_target_table, upper_bounds) for lb, ub, split in zip(lower_bounds, upper_bounds, result): assert_table_eq(pa_target_table[lb:ub], split) -def test_copy_if_else_column_column( - target_column, pa_target_column, pa_source_scalar, mask, pa_mask -): +def test_copy_if_else_column_column(target_column, mask, source_scalar): + pa_target_column, plc_target_column = target_column + pa_source_scalar, _ = source_scalar + pa_mask, plc_mask = mask + pa_other_column = pa.concat_arrays( [pa.array([pa_source_scalar] * 2), pa_target_column[:-2]] ) - other_column = plc.interop.from_arrow(pa_other_column) + plc_other_column = plc.interop.from_arrow(pa_other_column) result = plc.copying.copy_if_else( - target_column, - other_column, - mask, + plc_target_column, + plc_other_column, + plc_mask, ) expected = pc.if_else( @@ -626,46 +659,51 @@ def test_copy_if_else_column_column( def test_copy_if_else_wrong_type(target_column, mask): - if is_integer(dtype := target_column.type()) or is_floating(dtype): - input_column = plc.interop.from_arrow( - pa.array(["a"] * target_column.size()) + _, plc_target_column = target_column + _, plc_mask = mask + if is_integer(dtype := plc_target_column.type()) or is_floating(dtype): + plc_input_column = plc.interop.from_arrow( + pa.array(["a"] * plc_target_column.size()) ) else: - input_column = plc.interop.from_arrow( - pa.array([1] * target_column.size()) + plc_input_column = plc.interop.from_arrow( + pa.array([1] * plc_target_column.size()) ) with cudf_raises(TypeError): - plc.copying.copy_if_else(input_column, target_column, mask) + 
plc.copying.copy_if_else(plc_input_column, plc_target_column, plc_mask) def test_copy_if_else_wrong_type_mask(target_column): + _, plc_target_column = target_column with cudf_raises(TypeError): plc.copying.copy_if_else( - target_column, - target_column, + plc_target_column, + plc_target_column, plc.interop.from_arrow( - pa.array([1.0, 2.0] * (target_column.size() // 2)) + pa.array([1.0, 2.0] * (plc_target_column.size() // 2)) ), ) def test_copy_if_else_wrong_size(target_column): + _, plc_target_column = target_column with cudf_raises(ValueError): plc.copying.copy_if_else( plc.interop.from_arrow(pa.array([1])), - target_column, + plc_target_column, plc.interop.from_arrow( - pa.array([True, False] * (target_column.size() // 2)) + pa.array([True, False] * (plc_target_column.size() // 2)) ), ) def test_copy_if_else_wrong_size_mask(target_column): + _, plc_target_column = target_column with cudf_raises(ValueError): plc.copying.copy_if_else( - target_column, - target_column, + plc_target_column, + plc_target_column, plc.interop.from_arrow(pa.array([True])), ) @@ -673,21 +711,21 @@ def test_copy_if_else_wrong_size_mask(target_column): @pytest.mark.parametrize("array_left", [True, False]) def test_copy_if_else_column_scalar( target_column, - pa_target_column, source_scalar, - pa_source_scalar, array_left, mask, - pa_mask, ): + pa_target_column, plc_target_column = target_column + pa_source_scalar, plc_source_scalar = source_scalar + pa_mask, plc_mask = mask args = ( - (target_column, source_scalar) + (plc_target_column, plc_source_scalar) if array_left - else (source_scalar, target_column) + else (plc_source_scalar, plc_target_column) ) result = plc.copying.copy_if_else( *args, - mask, + plc_mask, ) pa_args = ( @@ -704,16 +742,17 @@ def test_copy_if_else_column_scalar( def test_boolean_mask_scatter_from_table( source_table, - pa_source_table, target_table, - pa_target_table, mask, - pa_mask, ): + pa_source_table, plc_source_table = source_table + pa_target_table, 
plc_target_table = target_table + pa_mask, plc_mask = mask + result = plc.copying.boolean_mask_scatter( - source_table, - target_table, - mask, + plc_source_table, + plc_target_table, + plc_mask, ) if pa.types.is_list( @@ -757,28 +796,34 @@ def test_boolean_mask_scatter_from_table( def test_boolean_mask_scatter_from_wrong_num_cols(source_table, target_table): + _, plc_source_table = source_table + _, plc_target_table = target_table with cudf_raises(ValueError): plc.copying.boolean_mask_scatter( - plc.Table(source_table.columns()[:2]), - target_table, + plc.Table(plc_source_table.columns()[:2]), + plc_target_table, plc.interop.from_arrow(pa.array([True, False] * 3)), ) def test_boolean_mask_scatter_from_wrong_mask_size(source_table, target_table): + _, plc_source_table = source_table + _, plc_target_table = target_table with cudf_raises(ValueError): plc.copying.boolean_mask_scatter( - source_table, - target_table, + plc_source_table, + plc_target_table, plc.interop.from_arrow(pa.array([True, False] * 2)), ) def test_boolean_mask_scatter_from_wrong_num_true(source_table, target_table): + _, plc_source_table = source_table + _, plc_target_table = target_table with cudf_raises(ValueError): plc.copying.boolean_mask_scatter( - plc.Table(source_table.columns()[:2]), - target_table, + plc.Table(plc_source_table.columns()[:2]), + plc_target_table, plc.interop.from_arrow( pa.array([True, False] * 2 + [False, False]) ), @@ -786,44 +831,48 @@ def test_boolean_mask_scatter_from_wrong_num_true(source_table, target_table): def test_boolean_mask_scatter_from_wrong_col_type(target_table, mask): - if is_integer(dtype := target_table.columns()[0].type()) or is_floating( - dtype - ): + _, plc_target_table = target_table + _, plc_mask = mask + if is_integer( + dtype := plc_target_table.columns()[0].type() + ) or is_floating(dtype): input_column = plc.interop.from_arrow(pa.array(["a", "b", "c"])) else: input_column = plc.interop.from_arrow(pa.array([1, 2, 3])) with 
cudf_raises(TypeError): plc.copying.boolean_mask_scatter( - plc.Table([input_column] * 3), target_table, mask + plc.Table([input_column] * 3), plc_target_table, plc_mask ) def test_boolean_mask_scatter_from_wrong_mask_type(source_table, target_table): + _, plc_source_table = source_table + _, plc_target_table = target_table with cudf_raises(TypeError): plc.copying.boolean_mask_scatter( - source_table, - target_table, + plc_source_table, + plc_target_table, plc.interop.from_arrow(pa.array([1.0, 2.0] * 3)), ) def test_boolean_mask_scatter_from_scalars( source_scalar, - pa_source_scalar, target_table, - pa_target_table, mask, - pa_mask, ): + pa_source_scalar, plc_source_scalar = source_scalar + pa_target_table, plc_target_table = target_table + pa_mask, plc_mask = mask result = plc.copying.boolean_mask_scatter( - [source_scalar] * 3, - target_table, - mask, + [plc_source_scalar] * 3, + plc_target_table, + plc_mask, ) expected = _pyarrow_boolean_mask_scatter_table( - [pa_source_scalar] * target_table.num_columns(), + [pa_source_scalar] * plc_target_table.num_columns(), pc.invert(pa_mask), pa_target_table, ) @@ -831,9 +880,10 @@ def test_boolean_mask_scatter_from_scalars( assert_table_eq(expected, result) -def test_get_element(input_column, pa_input_column): +def test_get_element(input_column): index = 1 - result = plc.copying.get_element(input_column, index) + pa_input_column, plc_input_column = input_column + result = plc.copying.get_element(plc_input_column, index) assert ( plc.interop.to_arrow( @@ -844,5 +894,6 @@ def test_get_element(input_column, pa_input_column): def test_get_element_out_of_bounds(input_column): + _, plc_input_column = input_column with cudf_raises(IndexError): - plc.copying.get_element(input_column, 100) + plc.copying.get_element(plc_input_column, 100) diff --git a/python/cudf/cudf/pylibcudf_tests/test_quantiles.py b/python/cudf/cudf/pylibcudf_tests/test_quantiles.py index a5d332a7795..13f3b037606 100644 --- 
a/python/cudf/cudf/pylibcudf_tests/test_quantiles.py +++ b/python/cudf/cudf/pylibcudf_tests/test_quantiles.py @@ -19,13 +19,9 @@ @pytest.fixture(scope="module", params=[[1, 2, 3, 4, 5], [5, 4, 3, 2, 1]]) -def pa_col_data(request, numeric_pa_type): - return pa.array(request.param, type=numeric_pa_type) - - -@pytest.fixture(scope="module") -def plc_col_data(pa_col_data): - return plc.interop.from_arrow(pa_col_data) +def col_data(request, numeric_pa_type): + pa_array = pa.array(request.param, type=numeric_pa_type) + return pa_array, plc.interop.from_arrow(pa_array) @pytest.fixture( @@ -60,7 +56,8 @@ def plc_tbl_data(request): @pytest.mark.parametrize("q", [[], [0], [0.5], [0.1, 0.5, 0.7, 0.9]]) @pytest.mark.parametrize("exact", [True, False]) -def test_quantile(pa_col_data, plc_col_data, interp_opt, q, exact): +def test_quantile(col_data, interp_opt, q, exact): + pa_col_data, plc_col_data = col_data ordered_indices = plc.interop.from_arrow( pc.cast(pc.sort_indices(pa_col_data), pa.int32()) ) @@ -210,7 +207,8 @@ def test_quantiles_invalid_interp(plc_tbl_data, invalid_interp): "q", [[0.1], (0.1,), np.array([0.1])], ) -def test_quantile_q_array_like(pa_col_data, plc_col_data, q): +def test_quantile_q_array_like(col_data, q): + pa_col_data, plc_col_data = col_data ordered_indices = plc.interop.from_arrow( pc.cast(pc.sort_indices(pa_col_data), pa.int32()) ) diff --git a/python/cudf/cudf/pylibcudf_tests/test_reshape.py b/python/cudf/cudf/pylibcudf_tests/test_reshape.py index 32d79257f4f..da1157e5832 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_reshape.py +++ b/python/cudf/cudf/pylibcudf_tests/test_reshape.py @@ -10,20 +10,15 @@ @pytest.fixture(scope="module") def reshape_data(): data = [[1, 2, 3], [4, 5, 6]] - return data + arrow_tbl = pa.Table.from_arrays(data, names=["a", "b"]) + return data, plc.interop.from_arrow(arrow_tbl) -@pytest.fixture(scope="module") -def reshape_plc_tbl(reshape_data): - arrow_tbl = pa.Table.from_arrays(reshape_data, names=["a", "b"]) - 
plc_tbl = plc.interop.from_arrow(arrow_tbl) - return plc_tbl - - -def test_interleave_columns(reshape_data, reshape_plc_tbl): +def test_interleave_columns(reshape_data): + raw_data, reshape_plc_tbl = reshape_data res = plc.reshape.interleave_columns(reshape_plc_tbl) - interleaved_data = [pa.array(pair) for pair in zip(*reshape_data)] + interleaved_data = [pa.array(pair) for pair in zip(*raw_data)] expect = pa.concat_arrays(interleaved_data) @@ -31,10 +26,11 @@ def test_interleave_columns(reshape_data, reshape_plc_tbl): @pytest.mark.parametrize("cnt", [0, 1, 3]) -def test_tile(reshape_data, reshape_plc_tbl, cnt): +def test_tile(reshape_data, cnt): + raw_data, reshape_plc_tbl = reshape_data res = plc.reshape.tile(reshape_plc_tbl, cnt) - tiled_data = [pa.array(col * cnt) for col in reshape_data] + tiled_data = [pa.array(col * cnt) for col in raw_data] expect = pa.Table.from_arrays( tiled_data, schema=plc.interop.to_arrow(reshape_plc_tbl).schema diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_capitalize.py b/python/cudf/cudf/pylibcudf_tests/test_string_capitalize.py index 818d6e6e72a..c4e437fe5d9 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_string_capitalize.py +++ b/python/cudf/cudf/pylibcudf_tests/test_string_capitalize.py @@ -8,39 +8,38 @@ @pytest.fixture(scope="module") -def pa_data(): - data = [ - "leopard", - "Golden Eagle", - "SNAKE", - "", - "!A", - "hello World", - "A B C", - "#", - "AƻB", - "Ⓑⓖ", - "Art of War", - "The quick bRoWn fox juMps over the laze DOG", - '123nr98nv9rev!$#INF4390v03n1243<>?}{:-"', - "accénted", - None, - ] - return pa.array(data) - - -@pytest.fixture(scope="module") -def plc_data(pa_data): - return plc.interop.from_arrow(pa_data) +def str_data(): + pa_data = pa.array( + [ + "leopard", + "Golden Eagle", + "SNAKE", + "", + "!A", + "hello World", + "A B C", + "#", + "AƻB", + "Ⓑⓖ", + "Art of War", + "The quick bRoWn fox juMps over the laze DOG", + '123nr98nv9rev!$#INF4390v03n1243<>?}{:-"', + "accénted", + None, + ] + ) + 
return pa_data, plc.interop.from_arrow(pa_data) -def test_capitalize(plc_data, pa_data): +def test_capitalize(str_data): + pa_data, plc_data = str_data got = plc.strings.capitalize.capitalize(plc_data) expected = pa.compute.utf8_capitalize(pa_data) assert_column_eq(expected, got) -def test_title(plc_data, pa_data): +def test_title(str_data): + pa_data, plc_data = str_data got = plc.strings.capitalize.title( plc_data, plc.strings.char_types.StringCharacterTypes.CASE_TYPES ) @@ -48,7 +47,8 @@ def test_title(plc_data, pa_data): assert_column_eq(expected, got) -def test_is_title(plc_data, pa_data): +def test_is_title(str_data): + pa_data, plc_data = str_data got = plc.strings.capitalize.is_title(plc_data) expected = pa.compute.utf8_is_title(pa_data) assert_column_eq(expected, got) diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_contains.py b/python/cudf/cudf/pylibcudf_tests/test_string_contains.py index 8cdb6f7c521..fc8c6656b5d 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_string_contains.py +++ b/python/cudf/cudf/pylibcudf_tests/test_string_contains.py @@ -8,15 +8,11 @@ @pytest.fixture(scope="module") -def pa_target_col(): - return pa.array( +def target_col(): + pa_array = pa.array( ["AbC", "de", "FGHI", "j", "kLm", "nOPq", None, "RsT", None, "uVw"] ) - - -@pytest.fixture(scope="module") -def plc_target_col(pa_target_col): - return plc.interop.from_arrow(pa_target_col) + return pa_array, plc.interop.from_arrow(pa_array) @pytest.fixture( @@ -45,9 +41,8 @@ def plc_target_pat(pa_target_scalar): return prog -def test_contains_re( - pa_target_col, plc_target_col, pa_target_scalar, plc_target_pat -): +def test_contains_re(target_col, pa_target_scalar, plc_target_pat): + pa_target_col, plc_target_col = target_col got = plc.strings.contains.contains_re(plc_target_col, plc_target_pat) expected = pa.compute.match_substring_regex( pa_target_col, pa_target_scalar.as_py() diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_find.py 
b/python/cudf/cudf/pylibcudf_tests/test_string_find.py index 44900044184..95a1a3cf731 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_string_find.py +++ b/python/cudf/cudf/pylibcudf_tests/test_string_find.py @@ -8,8 +8,8 @@ @pytest.fixture(scope="module") -def pa_data_col(): - return pa.array( +def data_col(): + pa_array = pa.array( [ "abc123", "ABC123", @@ -53,16 +53,12 @@ def pa_data_col(): None, ] ) + return pa_array, plc.interop.from_arrow(pa_array) @pytest.fixture(scope="module") -def plc_data_col(pa_data_col): - return plc.interop.from_arrow(pa_data_col) - - -@pytest.fixture(scope="module") -def pa_target_col(): - return pa.array( +def target_col(): + pa_array = pa.array( [ "a", "B", @@ -106,24 +102,18 @@ def pa_target_col(): None, # ends_with ] ) - - -@pytest.fixture(scope="module") -def plc_target_col(pa_target_col): - return plc.interop.from_arrow(pa_target_col) + return pa_array, plc.interop.from_arrow(pa_array) @pytest.fixture(params=["a", " ", "A", "Ab", "23"], scope="module") -def pa_target_scalar(request): - return pa.scalar(request.param, type=pa.string()) - - -@pytest.fixture(scope="module") -def plc_target_scalar(pa_target_scalar): - return plc.interop.from_arrow(pa_target_scalar) +def target_scalar(request): + pa_scalar = pa.scalar(request.param, type=pa.string()) + return pa_scalar, plc.interop.from_arrow(pa_scalar) -def test_find(pa_data_col, plc_data_col, pa_target_scalar, plc_target_scalar): +def test_find(data_col, target_scalar): + pa_data_col, plc_data_col = data_col + pa_target_scalar, plc_target_scalar = target_scalar got = plc.strings.find.find(plc_data_col, plc_target_scalar, 0, -1) expected = pa.array( @@ -161,7 +151,9 @@ def handle_none(st, target): return expected -def test_find_column(pa_data_col, pa_target_col, plc_data_col, plc_target_col): +def test_find_column(data_col, target_col): + pa_data_col, plc_data_col = data_col + pa_target_col, plc_target_col = target_col expected = pa.array( [ elem.find(target) if not (elem is None 
or target is None) else None @@ -177,7 +169,9 @@ def test_find_column(pa_data_col, pa_target_col, plc_data_col, plc_target_col): assert_column_eq(expected, got) -def test_rfind(pa_data_col, plc_data_col, pa_target_scalar, plc_target_scalar): +def test_rfind(data_col, target_scalar): + pa_data_col, plc_data_col = data_col + pa_target_scalar, plc_target_scalar = target_scalar py_target = pa_target_scalar.as_py() got = plc.strings.find.rfind(plc_data_col, plc_target_scalar, 0, -1) @@ -195,9 +189,9 @@ def test_rfind(pa_data_col, plc_data_col, pa_target_scalar, plc_target_scalar): assert_column_eq(expected, got) -def test_contains( - pa_data_col, plc_data_col, pa_target_scalar, plc_target_scalar -): +def test_contains(data_col, target_scalar): + pa_data_col, plc_data_col = data_col + pa_target_scalar, plc_target_scalar = target_scalar py_target = pa_target_scalar.as_py() got = plc.strings.find.contains(plc_data_col, plc_target_scalar) @@ -214,9 +208,9 @@ def test_contains( assert_column_eq(expected, got) -def test_contains_column( - pa_data_col, pa_target_col, plc_data_col, plc_target_col -): +def test_contains_column(data_col, target_col): + pa_data_col, plc_data_col = data_col + pa_target_col, plc_target_col = target_col expected = colwise_apply( pa_data_col, pa_target_col, lambda st, target: target in st ) @@ -224,18 +218,18 @@ def test_contains_column( assert_column_eq(expected, got) -def test_starts_with( - pa_data_col, plc_data_col, pa_target_scalar, plc_target_scalar -): +def test_starts_with(data_col, target_scalar): + pa_data_col, plc_data_col = data_col + pa_target_scalar, plc_target_scalar = target_scalar py_target = pa_target_scalar.as_py() got = plc.strings.find.starts_with(plc_data_col, plc_target_scalar) expected = pa.compute.starts_with(pa_data_col, py_target) assert_column_eq(expected, got) -def test_starts_with_column( - pa_data_col, pa_target_col, plc_data_col, plc_target_col -): +def test_starts_with_column(data_col, target_col): + pa_data_col, 
plc_data_col = data_col + pa_target_col, plc_target_col = target_col expected = colwise_apply( pa_data_col, pa_target_col, lambda st, target: st.startswith(target) ) @@ -243,18 +237,18 @@ def test_starts_with_column( assert_column_eq(expected, got) -def test_ends_with( - pa_data_col, plc_data_col, pa_target_scalar, plc_target_scalar -): +def test_ends_with(data_col, target_scalar): + pa_data_col, plc_data_col = data_col + pa_target_scalar, plc_target_scalar = target_scalar py_target = pa_target_scalar.as_py() got = plc.strings.find.ends_with(plc_data_col, plc_target_scalar) expected = pa.compute.ends_with(pa_data_col, py_target) assert_column_eq(expected, got) -def test_ends_with_column( - pa_data_col, pa_target_col, plc_data_col, plc_target_col -): +def test_ends_with_column(data_col, target_col): + pa_data_col, plc_data_col = data_col + pa_target_col, plc_target_col = target_col expected = colwise_apply( pa_data_col, pa_target_col, lambda st, target: st.endswith(target) )