Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

More explicit index concat checking #15650

Draft
wants to merge 22 commits into
base: branch-24.06
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 21 additions & 6 deletions python/cudf/cudf/_lib/parquet.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -280,15 +280,30 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None,

for rg in row_groups[i]:
filtered_idx.append(
cudf.RangeIndex(
start=row_groups_i[rg][0],
stop=row_groups_i[rg][1],
step=range_index_meta['step']
(
row_groups_i[rg][0],
row_groups_i[rg][1]
)
)

if len(filtered_idx) > 0:
idx = cudf.concat(filtered_idx)
step = range_index_meta['step']
if len(filtered_idx) == 1:
start, stop = filtered_idx[0]
idx = cudf.RangeIndex(
start=start, stop=stop, step=step
)
elif len(filtered_idx) > 1:
idx = cudf.Index(
data=[
n
for start, stop in filtered_idx
for n in range(
start,
stop,
step
)
]
)
else:
idx = cudf.Index(cudf.core.column.column_empty(0))
else:
Expand Down
8 changes: 8 additions & 0 deletions python/cudf/cudf/core/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,8 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
Parameters
----------
objs : list or dictionary of DataFrame, Series, or Index
.. deprecated:: 24.06
   Concatenating indices is deprecated and will be removed in a
   future version of cudf.
axis : {0/'index', 1/'columns'}, default 0
The axis to concatenate along.
`axis=1` must be passed if a dictionary is passed.
Expand Down Expand Up @@ -285,10 +287,16 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
)

if any(isinstance(o, cudf.BaseIndex) for o in objs):
warnings.warn(
"index concatenation will be deprecated in a future release",
FutureWarning,
)
if not all(isinstance(o, cudf.BaseIndex) for o in objs):
raise TypeError(
"when concatenating indices you must provide ONLY indices"
)
if axis == 1:
raise ValueError("cannot concatenate indices across axis 1")
er-eis marked this conversation as resolved.
Show resolved Hide resolved

only_series = all(isinstance(o, cudf.Series) for o in objs)

Expand Down
73 changes: 65 additions & 8 deletions python/cudf/cudf/tests/test_concat.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Copyright (c) 2018-2024, NVIDIA CORPORATION.

import warnings
from contextlib import contextmanager
from contextlib import contextmanager, nullcontext
from decimal import Decimal

import numpy as np
Expand Down Expand Up @@ -104,7 +104,11 @@ def test_concat_dataframe(index, nulls, axis):
)

# Index
res = cudf.concat([gdf.index, gdf2.index], axis=axis).to_pandas()
with pytest.warns(
FutureWarning,
match="index concatenation will be deprecated in a future release",
):
res = cudf.concat([gdf.index, gdf2.index], axis=axis).to_pandas()
sol = df.index.append(df2.index)
assert_eq(res, sol, check_names=False, check_categorical=False)

Expand Down Expand Up @@ -151,12 +155,16 @@ def test_concat_errors():
)

# Mismatched types
assert_exceptions_equal(
lfunc=pd.concat,
rfunc=cudf.concat,
lfunc_args_and_kwargs=([], {"objs": [df, df.index, df.x]}),
rfunc_args_and_kwargs=([], {"objs": [gdf, gdf.index, gdf.x]}),
)
with pytest.warns(
FutureWarning,
match="index concatenation will be deprecated in a future release",
):
assert_exceptions_equal(
lfunc=pd.concat,
rfunc=cudf.concat,
lfunc_args_and_kwargs=([], {"objs": [df, df.index, df.x]}),
rfunc_args_and_kwargs=([], {"objs": [gdf, gdf.index, gdf.x]}),
)

# Unknown type
assert_exceptions_equal(
Expand Down Expand Up @@ -1997,3 +2005,52 @@ def test_concat_dict_incorrect_type_index(d):
match="cannot concatenate a dictionary containing indices",
):
cudf.concat(d, axis=1)


@pytest.mark.parametrize(
    "axis,exception",
    [
        (0, nullcontext()),
        (
            1,
            pytest.raises(
                ValueError, match="cannot concatenate indices across axis 1"
            ),
        ),
    ],
)
@pytest.mark.parametrize(
    "idx",
    [
        [(cudf.Index, {"data": [1, 2, 3]})],
        [(cudf.Index, {"data": [1, 2, 3]}), (cudf.Index, {"data": [4, 5, 6]})],
        [
            (
                cudf.MultiIndex,
                {
                    "levels": [[1, 2], ["blue", "red"]],
                    "codes": [[0, 0, 1, 1], [1, 0, 1, 0]],
                },
            )
        ],
        [(cudf.CategoricalIndex, {"data": [1, 2, 3]})],
        [
            (cudf.RangeIndex, {"start": 2, "stop": 4, "step": 1}),
            (cudf.RangeIndex, {"start": 2, "stop": 9, "step": 3}),
        ],
    ],
)
def test_concat_index(idx, axis, exception):
    """Check ``cudf.concat`` on lists made up solely of index objects.

    Parameters
    ----------
    idx : list of (index class, constructor kwargs) pairs
        Specifications of the indices to build and concatenate.
    axis : int
        Axis handed to ``concat``.
    exception : context manager
        ``nullcontext()`` when the call is expected to succeed, otherwise
        a ``pytest.raises`` context describing the expected failure.
    """
    # The parameters carry (class, kwargs) specs; the index objects are
    # only instantiated here, inside the test body.
    # NOTE(review): presumably this keeps cudf object construction out of
    # test collection -- confirm.
    indices = [index_cls(**kwargs) for index_cls, kwargs in idx]

    # The FutureWarning is required for both axis values.
    with pytest.warns(
        FutureWarning,
        match="index concatenation will be deprecated in a future release",
    ), exception as caught:
        result = cudf.concat(indices, axis=axis)

    # ``nullcontext()`` binds None, so a falsy value marks the success path.
    if not caught:
        assert isinstance(result, cudf.Index)

    # The same call through pandas is expected to raise a TypeError.
    with pytest.raises(
        TypeError, match="only Series and DataFrame objs are valid"
    ):
        pd.concat([index.to_pandas() for index in indices], axis=axis)
6 changes: 5 additions & 1 deletion python/cudf/cudf/tests/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -2283,7 +2283,11 @@ def test_get_indexer_invalid(idx1, idx2):
def test_range_index_concat(objs):
cudf_objs = [cudf.from_pandas(obj) for obj in objs]

actual = cudf.concat(cudf_objs)
with pytest.warns(
FutureWarning,
match="index concatenation will be deprecated in a future release",
):
actual = cudf.concat(cudf_objs)

expected = objs[0]
for obj in objs[1:]:
Expand Down
6 changes: 5 additions & 1 deletion python/dask_cudf/dask_cudf/tests/test_accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -276,7 +276,11 @@ def test_categorical_categories():
def test_categorical_as_known():
df = dask_cudf.from_cudf(DataFrame({"col_1": [0, 1, 2, 3]}), npartitions=2)
df["col_1"] = df["col_1"].astype("category")
actual = df["col_1"].cat.as_known()
with pytest.warns(
FutureWarning,
match="index concatenation will be deprecated in a future release",
):
actual = df["col_1"].cat.as_known()

pdf = dd.from_pandas(pd.DataFrame({"col_1": [0, 1, 2, 3]}), npartitions=2)
pdf["col_1"] = pdf["col_1"].astype("category")
Expand Down
16 changes: 14 additions & 2 deletions python/dask_cudf/dask_cudf/tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from dask.utils import M

import cudf
from cudf import BaseIndex

import dask_cudf
from dask_cudf.tests.utils import skip_dask_expr, xfail_dask_expr
Expand Down Expand Up @@ -148,7 +149,11 @@ def test_from_pandas_with_generic_idx():

ddf = dask_cudf.from_cudf(cdf, npartitions=2)

assert isinstance(ddf.index.compute(), cudf.RangeIndex)
with pytest.warns(
FutureWarning,
match="index concatenation will be deprecated in a future release",
):
assert isinstance(ddf.index.compute(), cudf.RangeIndex)
dd.assert_eq(ddf.loc[1:2, ["a"]], cdf.loc[1:2, ["a"]])


Expand Down Expand Up @@ -610,7 +615,14 @@ def test_unary_ops(func, gdf, gddf):
p = func(gdf)
g = func(gddf)

dd.assert_eq(p, g, check_names=False)
if isinstance(p, BaseIndex):
with pytest.warns(
FutureWarning,
match="index concatenation will be deprecated in a future release",
):
dd.assert_eq(p, g, check_names=False)
else:
dd.assert_eq(p, g, check_names=False)


@pytest.mark.parametrize("series", [True, False])
Expand Down
Loading