diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index 70acb7f917b..5d352f2c8eb 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -280,15 +280,30 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, for rg in row_groups[i]: filtered_idx.append( - cudf.RangeIndex( - start=row_groups_i[rg][0], - stop=row_groups_i[rg][1], - step=range_index_meta['step'] + ( + row_groups_i[rg][0], + row_groups_i[rg][1] ) ) - if len(filtered_idx) > 0: - idx = cudf.concat(filtered_idx) + step = range_index_meta['step'] + if len(filtered_idx) == 1: + start, stop = filtered_idx[0] + idx = cudf.RangeIndex( + start=start, stop=stop, step=step + ) + elif len(filtered_idx) > 1: + idx = cudf.Index( + data=[ + n + for start, stop in filtered_idx + for n in range( + start, + stop, + step + ) + ] + ) else: idx = cudf.Index(cudf.core.column.column_empty(0)) else: diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 0b44ab58f30..99286779b0d 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -121,6 +121,8 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): Parameters ---------- objs : list or dictionary of DataFrame, Series, or Index + .. deprecated:: 24.06 + Concatenating indices is deprecated and will be removed in a future version of cudf. axis : {0/'index', 1/'columns'}, default 0 The axis to concatenate along. `axis=1` must be passed if a dictionary is passed. 
@@ -285,10 +287,16 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): ) if any(isinstance(o, cudf.BaseIndex) for o in objs): + warnings.warn( + "index concatenation will be deprecated in a future release", + FutureWarning, + ) if not all(isinstance(o, cudf.BaseIndex) for o in objs): raise TypeError( "when concatenating indices you must provide ONLY indices" ) + if axis == 1: + raise ValueError("cannot concatenate indices across axis 1") only_series = all(isinstance(o, cudf.Series) for o in objs) diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py index 4b43a33c8c8..fdfb18cafb2 100644 --- a/python/cudf/cudf/tests/test_concat.py +++ b/python/cudf/cudf/tests/test_concat.py @@ -1,7 +1,7 @@ # Copyright (c) 2018-2024, NVIDIA CORPORATION. import warnings -from contextlib import contextmanager +from contextlib import contextmanager, nullcontext from decimal import Decimal import numpy as np @@ -104,7 +104,11 @@ def test_concat_dataframe(index, nulls, axis): ) # Index - res = cudf.concat([gdf.index, gdf2.index], axis=axis).to_pandas() + with pytest.warns( + FutureWarning, + match="index concatenation will be deprecated in a future release", + ): + res = cudf.concat([gdf.index, gdf2.index], axis=axis).to_pandas() sol = df.index.append(df2.index) assert_eq(res, sol, check_names=False, check_categorical=False) @@ -151,12 +155,16 @@ def test_concat_errors(): ) # Mismatched types - assert_exceptions_equal( - lfunc=pd.concat, - rfunc=cudf.concat, - lfunc_args_and_kwargs=([], {"objs": [df, df.index, df.x]}), - rfunc_args_and_kwargs=([], {"objs": [gdf, gdf.index, gdf.x]}), - ) + with pytest.warns( + FutureWarning, + match="index concatenation will be deprecated in a future release", + ): + assert_exceptions_equal( + lfunc=pd.concat, + rfunc=cudf.concat, + lfunc_args_and_kwargs=([], {"objs": [df, df.index, df.x]}), + rfunc_args_and_kwargs=([], {"objs": [gdf, gdf.index, gdf.x]}), + ) # Unknown type assert_exceptions_equal( 
@@ -1997,3 +2005,52 @@ def test_concat_dict_incorrect_type_index(d): match="cannot concatenate a dictionary containing indices", ): cudf.concat(d, axis=1) + + +@pytest.mark.parametrize( + "axis,exception", + [ + (0, nullcontext()), + ( + 1, + pytest.raises( + ValueError, match="cannot concatenate indices across axis 1" + ), + ), + ], +) +@pytest.mark.parametrize( + "idx", + [ + [(cudf.Index, {"data": [1, 2, 3]})], + [(cudf.Index, {"data": [1, 2, 3]}), (cudf.Index, {"data": [4, 5, 6]})], + [ + ( + cudf.MultiIndex, + { + "levels": [[1, 2], ["blue", "red"]], + "codes": [[0, 0, 1, 1], [1, 0, 1, 0]], + }, + ) + ], + [(cudf.CategoricalIndex, {"data": [1, 2, 3]})], + [ + (cudf.RangeIndex, {"start": 2, "stop": 4, "step": 1}), + (cudf.RangeIndex, {"start": 2, "stop": 9, "step": 3}), + ], + ], +) +def test_concat_index(idx, axis, exception): + idx = [c(**d) for c, d in idx] + with pytest.warns( + FutureWarning, + match="index concatenation will be deprecated in a future release", + ): + with exception as e: + result = cudf.concat(idx, axis=axis) + if not e: + assert isinstance(result, cudf.Index) + with pytest.raises( + TypeError, match="only Series and DataFrame objs are valid" + ): + pd.concat([i.to_pandas() for i in idx], axis=axis) diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 8e7532d044d..8947df6d035 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -2283,7 +2283,11 @@ def test_get_indexer_invalid(idx1, idx2): def test_range_index_concat(objs): cudf_objs = [cudf.from_pandas(obj) for obj in objs] - actual = cudf.concat(cudf_objs) + with pytest.warns( + FutureWarning, + match="index concatenation will be deprecated in a future release", + ): + actual = cudf.concat(cudf_objs) expected = objs[0] for obj in objs[1:]: diff --git a/python/dask_cudf/dask_cudf/tests/test_accessor.py b/python/dask_cudf/dask_cudf/tests/test_accessor.py index 58d28f0597e..164f40b86e0 100644 --- 
a/python/dask_cudf/dask_cudf/tests/test_accessor.py +++ b/python/dask_cudf/dask_cudf/tests/test_accessor.py @@ -276,7 +276,11 @@ def test_categorical_categories(): def test_categorical_as_known(): df = dask_cudf.from_cudf(DataFrame({"col_1": [0, 1, 2, 3]}), npartitions=2) df["col_1"] = df["col_1"].astype("category") - actual = df["col_1"].cat.as_known() + with pytest.warns( + FutureWarning, + match="index concatenation will be deprecated in a future release", + ): + actual = df["col_1"].cat.as_known() pdf = dd.from_pandas(pd.DataFrame({"col_1": [0, 1, 2, 3]}), npartitions=2) pdf["col_1"] = pdf["col_1"].astype("category") diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py index 981c2c369f1..5692877ddf3 100644 --- a/python/dask_cudf/dask_cudf/tests/test_core.py +++ b/python/dask_cudf/dask_cudf/tests/test_core.py @@ -13,6 +13,7 @@ from dask.utils import M import cudf +from cudf import BaseIndex import dask_cudf from dask_cudf.tests.utils import skip_dask_expr, xfail_dask_expr @@ -148,7 +149,11 @@ def test_from_pandas_with_generic_idx(): ddf = dask_cudf.from_cudf(cdf, npartitions=2) - assert isinstance(ddf.index.compute(), cudf.RangeIndex) + with pytest.warns( + FutureWarning, + match="index concatenation will be deprecated in a future release", + ): + assert isinstance(ddf.index.compute(), cudf.RangeIndex) dd.assert_eq(ddf.loc[1:2, ["a"]], cdf.loc[1:2, ["a"]]) @@ -610,7 +615,14 @@ def test_unary_ops(func, gdf, gddf): p = func(gdf) g = func(gddf) - dd.assert_eq(p, g, check_names=False) + if isinstance(p, BaseIndex): + with pytest.warns( + FutureWarning, + match="index concatenation will be deprecated in a future release", + ): + dd.assert_eq(p, g, check_names=False) + else: + dd.assert_eq(p, g, check_names=False) @pytest.mark.parametrize("series", [True, False])