Skip to content

Commit

Permalink
Optimize cudf.concat for axis=0 (#9222)
Browse files Browse the repository at this point in the history
This PR optimizes `cudf.concat` when `axis=0` by not materializing the `RangeIndex` objects that serve as the index of the `DataFrame` objects.

Partially addresses #9200. This is 1/2 of the full set of optimizations; a follow-up PR to optimize `axis=1` will be opened, as it involves multiple large changes.

Here is a benchmark:
On `branch-21.10`:

```ipython
IPython 7.27.0 -- An enhanced Interactive Python. Type '?' for help.

In [1]: import cudf

In [2]: df = cudf.DataFrame({'a':[1, 2, 3]*100})

In [3]: df2 = cudf.DataFrame({'a':[1, 2, 3]*100}, index=cudf.RangeIndex(300, 600))

In [4]: %timeit cudf.concat([df, df2])
806 µs ± 8.02 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
```

This PR:

```ipython
IPython 7.27.0 -- An enhanced Interactive Python. Type '?' for help.

In [1]: import cudf

In [2]: df = cudf.DataFrame({'a':[1, 2, 3]*100})

In [3]: df2 = cudf.DataFrame({'a':[1, 2, 3]*100}, index=cudf.RangeIndex(300, 600))

In [4]: %timeit cudf.concat([df, df2])
434 µs ± 4.35 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
```

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Ashwin Srinath (https://github.com/shwina)

URL: #9222
  • Loading branch information
galipremsagar authored Sep 14, 2021
1 parent eb09d14 commit cc13060
Show file tree
Hide file tree
Showing 6 changed files with 127 additions and 40 deletions.
20 changes: 18 additions & 2 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1058,6 +1058,10 @@ def _concat(
# the number of empty input frames
num_empty_input_frames = 0

# flag to indicate if all DataFrame's have
# RangeIndex as their index
are_all_range_index = False

for i, obj in enumerate(objs):
# shallow-copy the input DFs in case the same DF instance
# is concatenated with itself
Expand All @@ -1076,6 +1080,10 @@ def _concat(
result_index_length += len(obj)
empty_has_index = empty_has_index or len(obj) > 0

are_all_range_index = (
True if i == 0 else are_all_range_index
) and isinstance(obj.index, cudf.RangeIndex)

if join == "inner":
sets_of_column_names = [set(obj._column_names) for obj in objs]

Expand Down Expand Up @@ -1150,7 +1158,8 @@ def _concat(
columns = [
(
[]
if (ignore_index and not empty_has_index)
if are_all_range_index
or (ignore_index and not empty_has_index)
else list(f._index._data.columns)
)
+ [f._data[name] if name in f._data else None for name in names]
Expand Down Expand Up @@ -1205,14 +1214,21 @@ def _concat(

# Concatenate the Tables
out = cls._from_data(
*libcudf.concat.concat_tables(tables, ignore_index)
*libcudf.concat.concat_tables(
tables, ignore_index=ignore_index or are_all_range_index
)
)

# If ignore_index is True, all input frames are empty, and at
# least one input frame has an index, assign a new RangeIndex
# to the result frame.
if empty_has_index and num_empty_input_frames == len(objs):
out._index = cudf.RangeIndex(result_index_length)
elif are_all_range_index and not ignore_index:
out._index = cudf.core.index.GenericIndex._concat(
[o._index for o in objs]
)

# Reassign the categories for any categorical table cols
_reassign_categories(
categories, out._data, indices[first_data_column_position:]
Expand Down
83 changes: 63 additions & 20 deletions python/cudf/cudf/tests/test_concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,13 +64,25 @@ def test_concat_dataframe(index, nulls, axis):
# DataFrame
res = gd.concat([gdf, gdf2, gdf, gdf_empty1], axis=axis).to_pandas()
sol = pd.concat([df, df2, df, df_empty1], axis=axis)
assert_eq(res, sol, check_names=False, check_categorical=False)
assert_eq(
res,
sol,
check_names=False,
check_categorical=False,
check_index_type=True,
)

# Series
for c in [i for i in ("x", "y", "z") if i != index]:
res = gd.concat([gdf[c], gdf2[c], gdf[c]], axis=axis).to_pandas()
sol = pd.concat([df[c], df2[c], df[c]], axis=axis)
assert_eq(res, sol, check_names=False, check_categorical=False)
assert_eq(
res,
sol,
check_names=False,
check_categorical=False,
check_index_type=True,
)

# Index
res = gd.concat([gdf.index, gdf2.index], axis=axis).to_pandas()
Expand All @@ -91,7 +103,13 @@ def test_concat_all_nulls(values):
gb = gd.Series([None])
gs = gd.concat([ga, gb])

assert_eq(ps, gs, check_dtype=False, check_categorical=False)
assert_eq(
ps,
gs,
check_dtype=False,
check_categorical=False,
check_index_type=True,
)


def test_concat_errors():
Expand Down Expand Up @@ -167,7 +185,13 @@ def test_concat_misordered_columns():
res = gd.concat([gdf, gdf2]).to_pandas()
sol = pd.concat([df, df2], sort=False)

assert_eq(res, sol, check_names=False, check_categorical=False)
assert_eq(
res,
sol,
check_names=False,
check_categorical=False,
check_index_type=True,
)


@pytest.mark.parametrize("axis", [1, "columns"])
Expand All @@ -182,7 +206,7 @@ def test_concat_columns(axis):
expect = pd.concat([pdf1, pdf2], axis=axis)
got = gd.concat([gdf1, gdf2], axis=axis)

assert_eq(expect, got)
assert_eq(expect, got, check_index_type=True)


def test_concat_multiindex_dataframe():
Expand All @@ -201,7 +225,9 @@ def test_concat_multiindex_dataframe():
gdg1 = gd.from_pandas(pdg1)
gdg2 = gd.from_pandas(pdg2)
assert_eq(
gd.concat([gdg1, gdg2]).astype("float64"), pd.concat([pdg1, pdg2])
gd.concat([gdg1, gdg2]).astype("float64"),
pd.concat([pdg1, pdg2]),
check_index_type=True,
)
assert_eq(gd.concat([gdg1, gdg2], axis=1), pd.concat([pdg1, pdg2], axis=1))

Expand All @@ -221,7 +247,9 @@ def test_concat_multiindex_series():
pdg2 = pdg["z"]
gdg1 = gd.from_pandas(pdg1)
gdg2 = gd.from_pandas(pdg2)
assert_eq(gd.concat([gdg1, gdg2]), pd.concat([pdg1, pdg2]))
assert_eq(
gd.concat([gdg1, gdg2]), pd.concat([pdg1, pdg2]), check_index_type=True
)
assert_eq(gd.concat([gdg1, gdg2], axis=1), pd.concat([pdg1, pdg2], axis=1))


Expand Down Expand Up @@ -363,10 +391,19 @@ def test_concat_mixed_input():
assert_eq(
pd.concat([pdf1, None, pdf2, None]),
gd.concat([gdf1, None, gdf2, None]),
check_index_type=True,
)
assert_eq(
pd.concat([pdf1, None]), gd.concat([gdf1, None]), check_index_type=True
)
assert_eq(
pd.concat([None, pdf2]), gd.concat([None, gdf2]), check_index_type=True
)
assert_eq(
pd.concat([None, pdf2, pdf1]),
gd.concat([None, gdf2, gdf1]),
check_index_type=True,
)
assert_eq(pd.concat([pdf1, None]), gd.concat([gdf1, None]))
assert_eq(pd.concat([None, pdf2]), gd.concat([None, gdf2]))
assert_eq(pd.concat([None, pdf2, pdf1]), gd.concat([None, gdf2, gdf1]))


@pytest.mark.parametrize(
Expand Down Expand Up @@ -540,7 +577,7 @@ def test_concat_empty_dataframes(df, other, ignore_index):
else:
expected[key] = expected[key].fillna(-1)
actual[key] = col.fillna(-1)
assert_eq(expected, actual, check_dtype=False)
assert_eq(expected, actual, check_dtype=False, check_index_type=True)
else:
assert_eq(
expected, actual, check_index_type=False if gdf.empty else True
Expand All @@ -564,7 +601,7 @@ def test_concat_empty_and_nonempty_series(ignore_index, data, axis):
got = gd.concat([s1, s2], axis=axis, ignore_index=ignore_index)
expect = pd.concat([ps1, ps2], axis=axis, ignore_index=ignore_index)

assert_eq(got, expect)
assert_eq(got, expect, check_index_type=True)


@pytest.mark.parametrize("ignore_index", [True, False])
Expand All @@ -577,7 +614,7 @@ def test_concat_two_empty_series(ignore_index, axis):
got = gd.concat([s1, s2], axis=axis, ignore_index=ignore_index)
expect = pd.concat([ps1, ps2], axis=axis, ignore_index=ignore_index)

assert_eq(got, expect)
assert_eq(got, expect, check_index_type=True)


@pytest.mark.parametrize(
Expand Down Expand Up @@ -670,6 +707,7 @@ def test_concat_join(objs, ignore_index, sort, join, axis):
ignore_index=ignore_index,
axis=axis,
),
check_index_type=True,
)


Expand Down Expand Up @@ -1247,6 +1285,7 @@ def test_concat_preserve_order():
assert_eq(
pd.concat(dfs, join="inner"),
gd.concat([gd.DataFrame(df) for df in dfs], join="inner"),
check_index_type=True,
)


Expand All @@ -1255,7 +1294,11 @@ def test_concat_preserve_order():
def test_concat_single_object(ignore_index, typ):
"""Ensure that concat on a single object does not change it."""
obj = typ([1, 2, 3])
assert_eq(gd.concat([obj], ignore_index=ignore_index, axis=0), obj)
assert_eq(
gd.concat([obj], ignore_index=ignore_index, axis=0),
obj,
check_index_type=True,
)


@pytest.mark.parametrize("ltype", [Decimal64Dtype(3, 1), Decimal64Dtype(7, 2)])
Expand All @@ -1277,7 +1320,7 @@ def test_concat_decimal_dataframe(ltype, rtype):
got = gd.concat([gdf1, gdf2])
expected = pd.concat([pdf1, pdf2])

assert_eq(expected, got)
assert_eq(expected, got, check_index_type=True)


@pytest.mark.parametrize("ltype", [Decimal64Dtype(4, 1), Decimal64Dtype(8, 2)])
Expand All @@ -1294,7 +1337,7 @@ def test_concat_decimal_series(ltype, rtype):
got = gd.concat([gs1, gs2])
expected = pd.concat([ps1, ps2])

assert_eq(expected, got)
assert_eq(expected, got, check_index_type=True)


@pytest.mark.parametrize(
Expand Down Expand Up @@ -1395,7 +1438,7 @@ def test_concat_decimal_series(ltype, rtype):
)
def test_concat_decimal_numeric_dataframe(df1, df2, df3, expected):
df = gd.concat([df1, df2, df3])
assert_eq(df, expected)
assert_eq(df, expected, check_index_type=True)
assert_eq(df.val.dtype, expected.val.dtype)


Expand Down Expand Up @@ -1487,7 +1530,7 @@ def test_concat_decimal_numeric_dataframe(df1, df2, df3, expected):
)
def test_concat_decimal_numeric_series(s1, s2, s3, expected):
s = gd.concat([s1, s2, s3])
assert_eq(s, expected)
assert_eq(s, expected, check_index_type=True)


@pytest.mark.parametrize(
Expand Down Expand Up @@ -1558,7 +1601,7 @@ def test_concat_decimal_numeric_series(s1, s2, s3, expected):
)
def test_concat_decimal_non_numeric(s1, s2, expected):
s = gd.concat([s1, s2])
assert_eq(s, expected)
assert_eq(s, expected, check_index_type=True)


@pytest.mark.parametrize(
Expand All @@ -1581,4 +1624,4 @@ def test_concat_decimal_non_numeric(s1, s2, expected):
)
def test_concat_struct_column(s1, s2, expected):
s = gd.concat([s1, s2])
assert_eq(s, expected)
assert_eq(s, expected, check_index_type=True)
2 changes: 1 addition & 1 deletion python/cudf/cudf/tests/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -1487,7 +1487,7 @@ def test_csv_writer_file_append(tmpdir):

result = cudf.read_csv(gdf_df_fname)
expected = cudf.concat([gdf1, gdf2], ignore_index=True)
assert_eq(result, expected)
assert_eq(result, expected, check_index_type=True)


def test_csv_writer_buffer(tmpdir):
Expand Down
Loading

0 comments on commit cc13060

Please sign in to comment.