From f7c15bbffe7f50f22341bbe48bb4ae609402df5f Mon Sep 17 00:00:00 2001 From: er-eis Date: Sat, 4 May 2024 00:34:47 -0400 Subject: [PATCH 01/19] More explicit index checking --- python/cudf/cudf/core/reshape.py | 2 ++ python/cudf/cudf/tests/test_concat.py | 33 +++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 26d91bed173..d3bc847fdd3 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -291,6 +291,8 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): raise TypeError( "when concatenating indices you must provide ONLY indices" ) + if axis == 1: + raise ValueError("cannot concatenate indices across axis 1") only_series = all(isinstance(o, cudf.Series) for o in objs) diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py index 4b43a33c8c8..8908a1dc984 100644 --- a/python/cudf/cudf/tests/test_concat.py +++ b/python/cudf/cudf/tests/test_concat.py @@ -1997,3 +1997,36 @@ def test_concat_dict_incorrect_type_index(d): match="cannot concatenate a dictionary containing indices", ): cudf.concat(d, axis=1) + + +@pytest.mark.parametrize( + "axis", + [0, 1], +) +@pytest.mark.parametrize( + "idx", + [ + [cudf.Index([1, 2, 3])], + [cudf.Index([1, 2, 3]), cudf.Index([4, 5, 6])], + [ + cudf.MultiIndex( + levels=[[1, 2], ["blue", "red"]], + codes=[[0, 0, 1, 1], [1, 0, 1, 0]], + ) + ], + [cudf.CategoricalIndex([1, 2, 3])], + ], +) +def test_concat_index(idx, axis): + if axis == 1: + with pytest.raises( + ValueError, match="cannot concatenate indices across axis 1" + ): + cudf.concat(idx, axis=axis) + else: + result = cudf.concat(idx, axis=axis) + assert isinstance(result, cudf.Index) + with pytest.raises( + TypeError, match="only Series and DataFrame objs are valid" + ): + pd.concat([i.to_pandas() for i in idx], axis=axis) From e72469636e03343d9c1b8e73bd37ec85452aaf9d Mon Sep 17 00:00:00 2001 From: er-eis Date: Sat, 4 May 2024 00:43:40 -0400 Subject: [PATCH 02/19] Simplify test --- python/cudf/cudf/tests/test_concat.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py index 8908a1dc984..fc7969ed56d 100644 --- a/python/cudf/cudf/tests/test_concat.py +++ b/python/cudf/cudf/tests/test_concat.py @@ -2001,7 +2001,15 @@ def test_concat_dict_incorrect_type_index(d): @pytest.mark.parametrize( "axis", - [0, 1], + [ + 0, + pytest.param( + 1, + marks=pytest.mark.xfail( + reason=("cannot concatenate indices across axis 1") + ), + ), + ], ) @pytest.mark.parametrize( "idx", @@ -2018,14 +2026,8 @@ def test_concat_dict_incorrect_type_index(d): ], ) def test_concat_index(idx, axis): - if axis == 1: - with pytest.raises( - ValueError, match="cannot concatenate indices across axis 1" - ): - cudf.concat(idx, axis=axis) - else: - result = cudf.concat(idx, axis=axis) - assert isinstance(result, cudf.Index) + result = cudf.concat(idx, axis=axis) + assert isinstance(result, cudf.Index) with pytest.raises( TypeError, match="only Series and DataFrame objs are valid" ): From 56390abbaa907d8c1ee65644128626fd41158075 Mon Sep 17 00:00:00 2001 From: er-eis Date: Tue, 7 May 2024 13:05:48 -0400 Subject: [PATCH 03/19] Add deprecation warning --- python/cudf/cudf/core/reshape.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index d3bc847fdd3..c872ee90e23 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -287,6 +287,10 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): ) if any(isinstance(o, cudf.BaseIndex) for o in objs): + warnings.warn( + "index concatenation will be deprecated in a future release", + FutureWarning, + ) if not all(isinstance(o, cudf.BaseIndex) for o in objs): raise TypeError( "when concatenating indices you must provide ONLY indices" From ede2f641a6ec66f2a16a30a1816d3677fd9f00d9 Mon Sep 17 00:00:00 2001 From: er-eis Date: Tue, 7 May 2024 13:23:45 -0400 Subject: [PATCH 04/19] Pytest warns --- python/cudf/cudf/tests/test_concat.py | 46 +++++++++++++++++---------- 1 file changed, 30 insertions(+), 16 deletions(-) diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py index fc7969ed56d..371d756a80e 100644 --- a/python/cudf/cudf/tests/test_concat.py +++ b/python/cudf/cudf/tests/test_concat.py @@ -1,7 +1,7 @@ # Copyright (c) 2018-2024, NVIDIA CORPORATION. import warnings -from contextlib import contextmanager +from contextlib import contextmanager, nullcontext from decimal import Decimal import numpy as np @@ -104,7 +104,11 @@ def test_concat_dataframe(index, nulls, axis): ) # Index - res = cudf.concat([gdf.index, gdf2.index], axis=axis).to_pandas() + with pytest.warns( + FutureWarning, + match="index concatenation will be deprecated in a future release", + ): + res = cudf.concat([gdf.index, gdf2.index], axis=axis).to_pandas() sol = df.index.append(df2.index) assert_eq(res, sol, check_names=False, check_categorical=False) @@ -151,12 +155,16 @@ def test_concat_errors(): ) # Mismatched types - assert_exceptions_equal( - lfunc=pd.concat, - rfunc=cudf.concat, - lfunc_args_and_kwargs=([], {"objs": [df, df.index, df.x]}), - rfunc_args_and_kwargs=([], {"objs": [gdf, gdf.index, gdf.x]}), - ) + with pytest.warns( + FutureWarning, + match="index concatenation will be deprecated in a future release", + ): + assert_exceptions_equal( + lfunc=pd.concat, + rfunc=cudf.concat, + lfunc_args_and_kwargs=([], {"objs": [df, df.index, df.x]}), + rfunc_args_and_kwargs=([], {"objs": [gdf, gdf.index, gdf.x]}), + ) # Unknown type assert_exceptions_equal( @@ -2000,13 +2008,13 @@ def test_concat_dict_incorrect_type_index(d): @pytest.mark.parametrize( - "axis", + "axis,exception", [ - 0, - pytest.param( + (0, nullcontext()), + ( 1, - marks=pytest.mark.xfail( - reason=("cannot concatenate indices across axis 1") + pytest.raises( + ValueError, match="cannot concatenate indices across axis 1" ), ), ], @@ -2025,9 +2033,15 @@ def test_concat_dict_incorrect_type_index(d): [cudf.CategoricalIndex([1, 2, 3])], ], ) -def test_concat_index(idx, axis): - result = cudf.concat(idx, axis=axis) - assert isinstance(result, cudf.Index) +def test_concat_index(idx, axis, exception): + with pytest.warns( + FutureWarning, + match="index concatenation will be deprecated in a future release", + ): + with exception as e: + result = cudf.concat(idx, axis=axis) + if not e: + assert isinstance(result, cudf.Index) with pytest.raises( TypeError, match="only Series and DataFrame objs are valid" ): From eb71a00e2c6270eab4811b0a9ce31a230328ce2d Mon Sep 17 00:00:00 2001 From: er-eis Date: Tue, 7 May 2024 13:31:13 -0400 Subject: [PATCH 05/19] Pytest warns --- python/cudf/cudf/tests/test_index.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 104a5fc0ffa..e20de9b7d69 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -2268,7 +2268,11 @@ def test_get_indexer_invalid(idx1, idx2): def test_range_index_concat(objs): cudf_objs = [cudf.from_pandas(obj) for obj in objs] - actual = cudf.concat(cudf_objs) + with pytest.warns( + FutureWarning, + match="index concatenation will be deprecated in a future release", + ): + actual = cudf.concat(cudf_objs) expected = objs[0] for obj in objs[1:]: From 438d6f4b74acf36e0cd8008e6c3d344c064c3b74 Mon Sep 17 00:00:00 2001 From: er-eis Date: Tue, 7 May 2024 13:34:28 -0400 Subject: [PATCH 06/19] Pytest warns --- python/cudf/cudf/io/parquet.py | 17 +++++++++++------ python/cudf/cudf/tests/test_parquet.py | 12 ++++++++++-- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index e7f1ad0751f..4d38334dbd3 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -15,6 +15,7 @@ import numpy as np import pandas as pd +import pytest from pyarrow import dataset as ds import cudf @@ -856,12 +857,16 @@ def _read_parquet( "cudf engine doesn't support the " f"following positional arguments: {list(args)}" ) - return libparquet.read_parquet( - filepaths_or_buffers, - columns=columns, - row_groups=row_groups, - use_pandas_metadata=use_pandas_metadata, - ) + with pytest.warns( + FutureWarning, + match="index concatenation will be deprecated in a future release", + ): + return libparquet.read_parquet( + filepaths_or_buffers, + columns=columns, + row_groups=row_groups, + use_pandas_metadata=use_pandas_metadata, + ) else: if ( isinstance(filepaths_or_buffers, list) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 6fb1d3d8ba5..b691e34807c 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -553,7 +553,11 @@ def test_parquet_read_filtered_complex_predicate( df.to_parquet(fname, row_group_size=2) # Check filters - df_filtered = cudf.read_parquet(fname, filters=predicate) + with pytest.warns( + FutureWarning, + match="index concatenation will be deprecated in a future release", + ): + df_filtered = cudf.read_parquet(fname, filters=predicate) assert_eq(cudf.io.read_parquet_metadata(fname)[1], 10 / 2) assert_eq(len(df_filtered), expected_len) @@ -2155,7 +2159,11 @@ def test_read_parquet_partitioned_filtered( row_groups = None # Filter on partitioned columns - expect = pd.read_parquet(read_path, filters=pfilters) + with pytest.warns( + FutureWarning, + match="index concatenation will be deprecated in a future release", + ): + expect = pd.read_parquet(read_path, filters=pfilters) got = cudf.read_parquet( read_path, filters=pfilters, From 95422b6bc47002fa998c9f5c2a6c00b77d102318 Mon Sep 17 00:00:00 2001 From: er-eis Date: Tue, 7 May 2024 13:48:23 -0400 Subject: [PATCH 07/19] Avoid instantiate class --- python/cudf/cudf/tests/test_concat.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py index 371d756a80e..8a4b6684518 100644 --- a/python/cudf/cudf/tests/test_concat.py +++ b/python/cudf/cudf/tests/test_concat.py @@ -2022,18 +2022,22 @@ def test_concat_dict_incorrect_type_index(d): @pytest.mark.parametrize( "idx", [ - [cudf.Index([1, 2, 3])], - [cudf.Index([1, 2, 3]), cudf.Index([4, 5, 6])], + [(cudf.Index, {"data": [1, 2, 3]})], + [(cudf.Index, {"data": [1, 2, 3]}), (cudf.Index, {"data": [4, 5, 6]})], [ - cudf.MultiIndex( - levels=[[1, 2], ["blue", "red"]], - codes=[[0, 0, 1, 1], [1, 0, 1, 0]], + ( + cudf.MultiIndex, + { + "levels": [[1, 2], ["blue", "red"]], + "codes": [[0, 0, 1, 1], [1, 0, 1, 0]], + }, ) ], - [cudf.CategoricalIndex([1, 2, 3])], + [(cudf.CategoricalIndex, {"data": [1, 2, 3]})], ], ) def test_concat_index(idx, axis, exception): + idx = [c(**d) for c, d in idx] with pytest.warns( FutureWarning, match="index concatenation will be deprecated in a future release", From 94c5c447541eba5b7f217d6dd55be8a5d466b72c Mon Sep 17 00:00:00 2001 From: er-eis Date: Tue, 7 May 2024 13:50:55 -0400 Subject: [PATCH 08/19] Add deprecated docstring --- python/cudf/cudf/core/reshape.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index c872ee90e23..a74ca098eb0 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -123,6 +123,8 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): Parameters ---------- objs : list or dictionary of DataFrame, Series, or Index + deprecated:: 24.06 + concatenating indices is deprecated and will be removed in a future version of cudf. axis : {0/'index', 1/'columns'}, default 0 The axis to concatenate along. `axis=1` must be passed if a dictionary is passed. From dbb96d9c752eaaaa100bc404c742ee5dd5981984 Mon Sep 17 00:00:00 2001 From: er-eis Date: Tue, 7 May 2024 15:42:52 -0400 Subject: [PATCH 09/19] Pytest warns --- python/cudf/cudf/io/parquet.py | 17 ++++------ python/cudf/cudf/tests/test_parquet.py | 47 +++++++++++++++++--------- 2 files changed, 37 insertions(+), 27 deletions(-) diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index 4d38334dbd3..e7f1ad0751f 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -15,7 +15,6 @@ import numpy as np import pandas as pd -import pytest from pyarrow import dataset as ds import cudf @@ -857,16 +856,12 @@ def _read_parquet( "cudf engine doesn't support the " f"following positional arguments: {list(args)}" ) - with pytest.warns( - FutureWarning, - match="index concatenation will be deprecated in a future release", - ): - return libparquet.read_parquet( - filepaths_or_buffers, - columns=columns, - row_groups=row_groups, - use_pandas_metadata=use_pandas_metadata, - ) + return libparquet.read_parquet( + filepaths_or_buffers, + columns=columns, + row_groups=row_groups, + use_pandas_metadata=use_pandas_metadata, + ) else: if ( isinstance(filepaths_or_buffers, list) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 09dab7173a4..f9f999fc5f8 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -511,9 +511,13 @@ def test_parquet_read_filtered_multiple_files(tmpdir): df.to_parquet(fname_2, row_group_size=2) # Check filter - filtered_df = cudf.read_parquet( - [fname_0, fname_1, fname_2], filters=[("x", "==", 2)] - ) + with pytest.warns( + FutureWarning, + match="index concatenation will be deprecated in a future release", + ): + filtered_df = cudf.read_parquet( + [fname_0, fname_1, fname_2], filters=[("x", "==", 2)] + ) assert_eq( filtered_df, cudf.DataFrame({"x": [2, 2], "y": list("bc")}, index=[2, 2]), @@ -554,10 +558,13 @@ def test_parquet_read_filtered_complex_predicate( df.to_parquet(fname, row_group_size=2) # Check filters - with pytest.warns( - FutureWarning, - match="index concatenation will be deprecated in a future release", - ): + if expected_len != 0: + with pytest.warns( + FutureWarning, + match="index concatenation will be deprecated in a future release", + ): + df_filtered = cudf.read_parquet(fname, filters=predicate) + else: df_filtered = cudf.read_parquet(fname, filters=predicate) assert_eq(cudf.io.read_parquet_metadata(fname)[1], 10 / 2) assert_eq(len(df_filtered), expected_len) @@ -2160,17 +2167,17 @@ def test_read_parquet_partitioned_filtered( row_groups = None # Filter on partitioned columns + expect = pd.read_parquet(read_path, filters=pfilters) with pytest.warns( FutureWarning, match="index concatenation will be deprecated in a future release", ): - expect = pd.read_parquet(read_path, filters=pfilters) - got = cudf.read_parquet( - read_path, - filters=pfilters, - row_groups=row_groups, - categorical_partitions=use_cat, - ) + got = cudf.read_parquet( + read_path, + filters=pfilters, + row_groups=row_groups, + categorical_partitions=use_cat, + ) expect["b"] = expect["b"].astype(str) expect["c"] = expect["c"].astype(int) if use_cat: @@ -2188,12 +2195,20 @@ def test_read_parquet_partitioned_filtered( # Filter on non-partitioned column filters = [("a", "==", 10)] - got = cudf.read_parquet(read_path, filters=filters) + with pytest.warns( + FutureWarning, + match="index concatenation will be deprecated in a future release", + ): + got = cudf.read_parquet(read_path, filters=filters) expect = pd.read_parquet(read_path, filters=filters) # Filter on both kinds of columns filters = [[("a", "==", 10)], [("c", "==", 1)]] - got = cudf.read_parquet(read_path, filters=filters) + with pytest.warns( + FutureWarning, + match="index concatenation will be deprecated in a future release", + ): + got = cudf.read_parquet(read_path, filters=filters) expect = pd.read_parquet(read_path, filters=filters) # Work-around for pandas bug: From 0b9f04df47db891aef3a00a050ed726079b05ea5 Mon Sep 17 00:00:00 2001 From: er-eis Date: Tue, 7 May 2024 21:28:02 -0400 Subject: [PATCH 10/19] More pytest warns --- .../dask_cudf/dask_cudf/io/tests/test_parquet.py | 12 ++++++++++-- .../dask_cudf/dask_cudf/tests/test_accessor.py | 6 +++++- python/dask_cudf/dask_cudf/tests/test_core.py | 16 ++++++++++++++-- 3 files changed, 29 insertions(+), 5 deletions(-) diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py index 6f4737db5be..e4658c75ca8 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py @@ -570,7 +570,11 @@ def test_nullable_schema_mismatch(tmpdir): [path0, path1], split_row_groups=2, aggregate_files=True ) expect = pd.read_parquet([path0, path1]) - dd.assert_eq(ddf, expect, check_index=False) + with pytest.warns( + FutureWarning, + match="index concatenation will be deprecated in a future release", + ): + dd.assert_eq(ddf, expect, check_index=False) def test_parquet_read_filter_and_project(tmpdir): @@ -597,4 +601,8 @@ def test_parquet_read_filter_and_project(tmpdir): # Check result expected = df[(df.a == 5) & (df.c > 20)][columns].reset_index(drop=True) - dd.assert_eq(got, expected) + with pytest.warns( + FutureWarning, + match="index concatenation will be deprecated in a future release", + ): + dd.assert_eq(got, expected) diff --git a/python/dask_cudf/dask_cudf/tests/test_accessor.py b/python/dask_cudf/dask_cudf/tests/test_accessor.py index 035b73094e7..d15d054bafe 100644 --- a/python/dask_cudf/dask_cudf/tests/test_accessor.py +++ b/python/dask_cudf/dask_cudf/tests/test_accessor.py @@ -276,7 +276,11 @@ def test_categorical_categories(): def test_categorical_as_known(): df = dask_cudf.from_cudf(DataFrame({"col_1": [0, 1, 2, 3]}), npartitions=2) df["col_1"] = df["col_1"].astype("category") - actual = df["col_1"].cat.as_known() + with pytest.warns( + FutureWarning, + match="index concatenation will be deprecated in a future release", + ): + actual = df["col_1"].cat.as_known() pdf = dd.from_pandas(pd.DataFrame({"col_1": [0, 1, 2, 3]}), npartitions=2) pdf["col_1"] = pdf["col_1"].astype("category") diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py index 4878d44d636..f6c8336beb5 100644 --- a/python/dask_cudf/dask_cudf/tests/test_core.py +++ b/python/dask_cudf/dask_cudf/tests/test_core.py @@ -13,6 +13,7 @@ from dask.utils import M import cudf +from cudf import BaseIndex import dask_cudf from dask_cudf.tests.utils import skip_dask_expr, xfail_dask_expr @@ -148,7 +149,11 @@ def test_from_pandas_with_generic_idx(): ddf = dask_cudf.from_cudf(cdf, npartitions=2) - assert isinstance(ddf.index.compute(), cudf.RangeIndex) + with pytest.warns( + FutureWarning, + match="index concatenation will be deprecated in a future release", + ): + assert isinstance(ddf.index.compute(), cudf.RangeIndex) dd.assert_eq(ddf.loc[1:2, ["a"]], cdf.loc[1:2, ["a"]]) @@ -610,7 +615,14 @@ def test_unary_ops(func, gdf, gddf): p = func(gdf) g = func(gddf) - dd.assert_eq(p, g, check_names=False) + if isinstance(p, BaseIndex): + with pytest.warns( + FutureWarning, + match="index concatenation will be deprecated in a future release", + ): + dd.assert_eq(p, g, check_names=False) + else: + dd.assert_eq(p, g, check_names=False) @pytest.mark.parametrize("series", [True, False]) From ed2a5144fa09c408f3d7993b80bc9119a0586431 Mon Sep 17 00:00:00 2001 From: er-eis Date: Fri, 17 May 2024 15:01:40 -0400 Subject: [PATCH 11/19] Resolve parquet index concat? --- python/cudf/cudf/_lib/parquet.pyx | 22 ++++++++++++++++------ python/cudf/cudf/tests/test_concat.py | 4 ++++ python/cudf/cudf/tests/test_parquet.py | 10 +++------- 3 files changed, 23 insertions(+), 13 deletions(-) diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index 70acb7f917b..c911773472b 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -280,15 +280,25 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, for rg in row_groups[i]: filtered_idx.append( - cudf.RangeIndex( - start=row_groups_i[rg][0], - stop=row_groups_i[rg][1], - step=range_index_meta['step'] - ) + { + "start": row_groups_i[rg][0], + "stop": row_groups_i[rg][1], + "step": range_index_meta['step'] + } ) if len(filtered_idx) > 0: - idx = cudf.concat(filtered_idx) + idx = cudf.Index( + "data": [ + n + for f_idx in filtered_idx + for n in range( + f_idx["start"], + f_idx["stop"], + f_idx["step"] + ) + ] + ) else: idx = cudf.Index(cudf.core.column.column_empty(0)) else: diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py index 8a4b6684518..fdfb18cafb2 100644 --- a/python/cudf/cudf/tests/test_concat.py +++ b/python/cudf/cudf/tests/test_concat.py @@ -2034,6 +2034,10 @@ def test_concat_dict_incorrect_type_index(d): ) ], [(cudf.CategoricalIndex, {"data": [1, 2, 3]})], + [ + (cudf.RangeIndex, {"start": 2, "stop": 4, "step": 1}), + (cudf.RangeIndex, {"start": 2, "stop": 9, "step": 3}), + ], ], ) def test_concat_index(idx, axis, exception): diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 4f05c38f235..c154d670915 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -509,13 +509,9 @@ def test_parquet_read_filtered_multiple_files(tmpdir): df.to_parquet(fname_2, row_group_size=2) # Check filter - with pytest.warns( - FutureWarning, - match="index concatenation will be deprecated in a future release", - ): - filtered_df = cudf.read_parquet( - [fname_0, fname_1, fname_2], filters=[("x", "==", 2)] - ) + filtered_df = cudf.read_parquet( + [fname_0, fname_1, fname_2], filters=[("x", "==", 2)] + ) assert_eq( filtered_df, cudf.DataFrame({"x": [2, 2], "y": list("bc")}, index=[2, 2]), From 1a95b08c049d3eec937099e9f7487ab4d8a09176 Mon Sep 17 00:00:00 2001 From: er-eis Date: Fri, 17 May 2024 15:04:19 -0400 Subject: [PATCH 12/19] Simplify --- python/cudf/cudf/_lib/parquet.pyx | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index c911773472b..35a299af6b5 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -282,8 +282,7 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, filtered_idx.append( { "start": row_groups_i[rg][0], - "stop": row_groups_i[rg][1], - "step": range_index_meta['step'] + "stop": row_groups_i[rg][1] } ) @@ -295,7 +294,7 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, for n in range( f_idx["start"], f_idx["stop"], - f_idx["step"] + range_index_meta['step'] ) ] ) From f2f4777bf97a2c84afdab9822907dc5e1dfa3b85 Mon Sep 17 00:00:00 2001 From: er-eis Date: Fri, 17 May 2024 15:16:36 -0400 Subject: [PATCH 13/19] Proper arg --- python/cudf/cudf/_lib/parquet.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index 35a299af6b5..cd27569f77c 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -288,7 +288,7 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, if len(filtered_idx) > 0: idx = cudf.Index( - "data": [ + data=[ n for f_idx in filtered_idx for n in range( From 3bd8ea8b1dbc1cfa3d4bb2b48d2063d4413d616c Mon Sep 17 00:00:00 2001 From: er-eis Date: Fri, 17 May 2024 15:31:47 -0400 Subject: [PATCH 14/19] Parquet tests working --- python/cudf/cudf/tests/test_parquet.py | 34 +++++++------------------- 1 file changed, 9 insertions(+), 25 deletions(-) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index c154d670915..43b0b5b7e66 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -553,11 +553,7 @@ def test_parquet_read_filtered_complex_predicate( # Check filters if expected_len != 0: - with pytest.warns( - FutureWarning, - match="index concatenation will be deprecated in a future release", - ): - df_filtered = cudf.read_parquet(fname, filters=predicate) + df_filtered = cudf.read_parquet(fname, filters=predicate) else: df_filtered = cudf.read_parquet(fname, filters=predicate) assert_eq(cudf.io.read_parquet_metadata(fname)[1], 10 / 2) @@ -2208,16 +2204,12 @@ def test_read_parquet_partitioned_filtered( # Filter on partitioned columns expect = pd.read_parquet(read_path, filters=pfilters) - with pytest.warns( - FutureWarning, - match="index concatenation will be deprecated in a future release", - ): - got = cudf.read_parquet( - read_path, - filters=pfilters, - row_groups=row_groups, - categorical_partitions=use_cat, - ) + got = cudf.read_parquet( + read_path, + filters=pfilters, + row_groups=row_groups, + categorical_partitions=use_cat, + ) expect["b"] = expect["b"].astype(str) expect["c"] = expect["c"].astype(int) if use_cat: @@ -2235,20 +2227,12 @@ def test_read_parquet_partitioned_filtered( # Filter on non-partitioned column filters = [("a", "==", 10)] - with pytest.warns( - FutureWarning, - match="index concatenation will be deprecated in a future release", - ): - got = cudf.read_parquet(read_path, filters=filters) + got = cudf.read_parquet(read_path, filters=filters) expect = pd.read_parquet(read_path, filters=filters) # Filter on both kinds of columns filters = [[("a", "==", 10)], [("c", "==", 1)]] - with pytest.warns( - FutureWarning, - match="index concatenation will be deprecated in a future release", - ): - got = cudf.read_parquet(read_path, filters=filters) + got = cudf.read_parquet(read_path, filters=filters) expect = pd.read_parquet(read_path, filters=filters) # Work-around for pandas bug: From 488ab3382ef20f21b061b0053e81344eb1d99fc4 Mon Sep 17 00:00:00 2001 From: er-eis Date: Fri, 17 May 2024 15:32:48 -0400 Subject: [PATCH 15/19] Parquet tests working --- python/cudf/cudf/tests/test_parquet.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 43b0b5b7e66..b2896d55b80 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -552,10 +552,7 @@ def test_parquet_read_filtered_complex_predicate( df.to_parquet(fname, row_group_size=2) # Check filters - if expected_len != 0: - df_filtered = cudf.read_parquet(fname, filters=predicate) - else: - df_filtered = cudf.read_parquet(fname, filters=predicate) + df_filtered = cudf.read_parquet(fname, filters=predicate) assert_eq(cudf.io.read_parquet_metadata(fname)[1], 10 / 2) assert_eq(len(df_filtered), expected_len) From e5901f8dc345f65a9e73aed65801fcc4d55e4d1c Mon Sep 17 00:00:00 2001 From: er-eis Date: Fri, 17 May 2024 15:39:51 -0400 Subject: [PATCH 16/19] Parquet tests working --- python/dask_cudf/dask_cudf/io/tests/test_parquet.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py index dc40143e0ef..39800145585 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py @@ -568,11 +568,7 @@ def test_nullable_schema_mismatch(tmpdir): [path0, path1], split_row_groups=2, aggregate_files=True ) expect = pd.read_parquet([path0, path1]) - with pytest.warns( - FutureWarning, - match="index concatenation will be deprecated in a future release", - ): - dd.assert_eq(ddf, expect, check_index=False) + dd.assert_eq(ddf, expect, check_index=False) def test_parquet_read_filter_and_project(tmpdir): @@ -599,8 +595,4 @@ def test_parquet_read_filter_and_project(tmpdir): # Check result expected = df[(df.a == 5) & (df.c > 20)][columns].reset_index(drop=True) - with pytest.warns( - FutureWarning, - match="index concatenation will be deprecated in a future release", - ): - dd.assert_eq(got, expected) + dd.assert_eq(got, expected) From 87288b813df3f808fbdbcc2b66837e94242e7d3e Mon Sep 17 00:00:00 2001 From: er-eis Date: Fri, 17 May 2024 15:44:24 -0400 Subject: [PATCH 17/19] Simplify pyx --- python/cudf/cudf/_lib/parquet.pyx | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index cd27569f77c..bad81374add 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -280,23 +280,23 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, for rg in row_groups[i]: filtered_idx.append( - { - "start": row_groups_i[rg][0], - "stop": row_groups_i[rg][1] - } + tuple( + n for n in + range( + row_groups_i[rg][0], + row_groups_i[rg][1], + range_index_meta['step'] + ) + ) ) if len(filtered_idx) > 0: idx = cudf.Index( data=[ n - for f_idx in filtered_idx - for n in range( - f_idx["start"], - f_idx["stop"], - range_index_meta['step'] - ) - ] + for nums in filtered_idx + for n in nums + ] ) else: idx = cudf.Index(cudf.core.column.column_empty(0)) From d38d612981c485a2a6ccf1a45f2408d11f1fb84e Mon Sep 17 00:00:00 2001 From: er-eis Date: Fri, 17 May 2024 15:51:43 -0400 Subject: [PATCH 18/19] Simplify pyx --- python/cudf/cudf/_lib/parquet.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index bad81374add..44c0a5245ce 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -280,14 +280,14 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, for rg in row_groups[i]: filtered_idx.append( - tuple( + [ n for n in range( row_groups_i[rg][0], row_groups_i[rg][1], range_index_meta['step'] ) - ) + ] ) if len(filtered_idx) > 0: From a6164e3624b13c5d68b71d44d657d488d3c35ef4 Mon Sep 17 00:00:00 2001 From: er-eis Date: Sat, 18 May 2024 12:46:11 -0400 Subject: [PATCH 19/19] Account for RangeIndex in pyx --- python/cudf/cudf/_lib/parquet.pyx | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index 44c0a5245ce..5d352f2c8eb 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -280,22 +280,28 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, for rg in row_groups[i]: filtered_idx.append( - [ - n for n in - range( - row_groups_i[rg][0], - row_groups_i[rg][1], - range_index_meta['step'] - ) - ] + ( + row_groups_i[rg][0], + row_groups_i[rg][1] + ) ) - if len(filtered_idx) > 0: + step = range_index_meta['step'] + if len(filtered_idx) == 1: + start, stop = filtered_idx[0] + idx = cudf.RangeIndex( + start=start, stop=stop, step=step + ) + elif len(filtered_idx) > 1: idx = cudf.Index( data=[ n - for nums in filtered_idx - for n in nums + for start, stop in filtered_idx + for n in range( + start, + stop, + step + ) ] ) else: