Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] Optimize cudf.concat for axis=0 #9222

Merged
merged 2 commits into from
Sep 14, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 18 additions & 2 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1058,6 +1058,10 @@ def _concat(
# the number of empty input frames
num_empty_input_frames = 0

# flag to indicate if all DataFrame's have
# RangeIndex as their index
are_all_range_index = False

for i, obj in enumerate(objs):
# shallow-copy the input DFs in case the same DF instance
# is concatenated with itself
Expand All @@ -1076,6 +1080,10 @@ def _concat(
result_index_length += len(obj)
empty_has_index = empty_has_index or len(obj) > 0

are_all_range_index = (
True if i == 0 else are_all_range_index
) and isinstance(obj.index, cudf.RangeIndex)

if join == "inner":
sets_of_column_names = [set(obj._column_names) for obj in objs]

Expand Down Expand Up @@ -1150,7 +1158,8 @@ def _concat(
columns = [
(
[]
if (ignore_index and not empty_has_index)
if are_all_range_index
or (ignore_index and not empty_has_index)
else list(f._index._data.columns)
)
+ [f._data[name] if name in f._data else None for name in names]
Expand Down Expand Up @@ -1205,14 +1214,21 @@ def _concat(

# Concatenate the Tables
out = cls._from_data(
*libcudf.concat.concat_tables(tables, ignore_index)
*libcudf.concat.concat_tables(
tables, ignore_index=ignore_index or are_all_range_index
)
)

# If ignore_index is True, all input frames are empty, and at
# least one input frame has an index, assign a new RangeIndex
# to the result frame.
if empty_has_index and num_empty_input_frames == len(objs):
out._index = cudf.RangeIndex(result_index_length)
elif are_all_range_index and not ignore_index:
out._index = cudf.core.index.GenericIndex._concat(
[o._index for o in objs]
)
Comment on lines +1227 to +1230
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this case not included in line 1218?

Copy link
Contributor Author

@galipremsagar galipremsagar Sep 13, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nope, that line's expectation is to have the index columns materialized. Whereas we don't want to materialize and hit the specialized concat rangeindex logic already present in index.py:

https://github.com/rapidsai/cudf/blob/branch-21.10/python/cudf/cudf/core/index.py#L688-L702

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Especially we want to hit _concat_range_index in this case:

def _concat_range_index(indexes: List[RangeIndex]) -> BaseIndex:


# Reassign the categories for any categorical table cols
_reassign_categories(
categories, out._data, indices[first_data_column_position:]
Expand Down
83 changes: 63 additions & 20 deletions python/cudf/cudf/tests/test_concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,13 +64,25 @@ def test_concat_dataframe(index, nulls, axis):
# DataFrame
res = gd.concat([gdf, gdf2, gdf, gdf_empty1], axis=axis).to_pandas()
sol = pd.concat([df, df2, df, df_empty1], axis=axis)
assert_eq(res, sol, check_names=False, check_categorical=False)
assert_eq(
res,
sol,
check_names=False,
check_categorical=False,
check_index_type=True,
)

# Series
for c in [i for i in ("x", "y", "z") if i != index]:
res = gd.concat([gdf[c], gdf2[c], gdf[c]], axis=axis).to_pandas()
sol = pd.concat([df[c], df2[c], df[c]], axis=axis)
assert_eq(res, sol, check_names=False, check_categorical=False)
assert_eq(
res,
sol,
check_names=False,
check_categorical=False,
check_index_type=True,
)

# Index
res = gd.concat([gdf.index, gdf2.index], axis=axis).to_pandas()
Expand All @@ -91,7 +103,13 @@ def test_concat_all_nulls(values):
gb = gd.Series([None])
gs = gd.concat([ga, gb])

assert_eq(ps, gs, check_dtype=False, check_categorical=False)
assert_eq(
ps,
gs,
check_dtype=False,
check_categorical=False,
check_index_type=True,
)


def test_concat_errors():
Expand Down Expand Up @@ -167,7 +185,13 @@ def test_concat_misordered_columns():
res = gd.concat([gdf, gdf2]).to_pandas()
sol = pd.concat([df, df2], sort=False)

assert_eq(res, sol, check_names=False, check_categorical=False)
assert_eq(
res,
sol,
check_names=False,
check_categorical=False,
check_index_type=True,
)


@pytest.mark.parametrize("axis", [1, "columns"])
Expand All @@ -182,7 +206,7 @@ def test_concat_columns(axis):
expect = pd.concat([pdf1, pdf2], axis=axis)
got = gd.concat([gdf1, gdf2], axis=axis)

assert_eq(expect, got)
assert_eq(expect, got, check_index_type=True)


def test_concat_multiindex_dataframe():
Expand All @@ -201,7 +225,9 @@ def test_concat_multiindex_dataframe():
gdg1 = gd.from_pandas(pdg1)
gdg2 = gd.from_pandas(pdg2)
assert_eq(
gd.concat([gdg1, gdg2]).astype("float64"), pd.concat([pdg1, pdg2])
gd.concat([gdg1, gdg2]).astype("float64"),
pd.concat([pdg1, pdg2]),
check_index_type=True,
)
assert_eq(gd.concat([gdg1, gdg2], axis=1), pd.concat([pdg1, pdg2], axis=1))

Expand All @@ -221,7 +247,9 @@ def test_concat_multiindex_series():
pdg2 = pdg["z"]
gdg1 = gd.from_pandas(pdg1)
gdg2 = gd.from_pandas(pdg2)
assert_eq(gd.concat([gdg1, gdg2]), pd.concat([pdg1, pdg2]))
assert_eq(
gd.concat([gdg1, gdg2]), pd.concat([pdg1, pdg2]), check_index_type=True
)
assert_eq(gd.concat([gdg1, gdg2], axis=1), pd.concat([pdg1, pdg2], axis=1))


Expand Down Expand Up @@ -363,10 +391,19 @@ def test_concat_mixed_input():
assert_eq(
pd.concat([pdf1, None, pdf2, None]),
gd.concat([gdf1, None, gdf2, None]),
check_index_type=True,
)
assert_eq(
pd.concat([pdf1, None]), gd.concat([gdf1, None]), check_index_type=True
)
assert_eq(
pd.concat([None, pdf2]), gd.concat([None, gdf2]), check_index_type=True
)
assert_eq(
pd.concat([None, pdf2, pdf1]),
gd.concat([None, gdf2, gdf1]),
check_index_type=True,
)
assert_eq(pd.concat([pdf1, None]), gd.concat([gdf1, None]))
assert_eq(pd.concat([None, pdf2]), gd.concat([None, gdf2]))
assert_eq(pd.concat([None, pdf2, pdf1]), gd.concat([None, gdf2, gdf1]))


@pytest.mark.parametrize(
Expand Down Expand Up @@ -540,7 +577,7 @@ def test_concat_empty_dataframes(df, other, ignore_index):
else:
expected[key] = expected[key].fillna(-1)
actual[key] = col.fillna(-1)
assert_eq(expected, actual, check_dtype=False)
assert_eq(expected, actual, check_dtype=False, check_index_type=True)
else:
assert_eq(
expected, actual, check_index_type=False if gdf.empty else True
Expand All @@ -564,7 +601,7 @@ def test_concat_empty_and_nonempty_series(ignore_index, data, axis):
got = gd.concat([s1, s2], axis=axis, ignore_index=ignore_index)
expect = pd.concat([ps1, ps2], axis=axis, ignore_index=ignore_index)

assert_eq(got, expect)
assert_eq(got, expect, check_index_type=True)


@pytest.mark.parametrize("ignore_index", [True, False])
Expand All @@ -577,7 +614,7 @@ def test_concat_two_empty_series(ignore_index, axis):
got = gd.concat([s1, s2], axis=axis, ignore_index=ignore_index)
expect = pd.concat([ps1, ps2], axis=axis, ignore_index=ignore_index)

assert_eq(got, expect)
assert_eq(got, expect, check_index_type=True)


@pytest.mark.parametrize(
Expand Down Expand Up @@ -670,6 +707,7 @@ def test_concat_join(objs, ignore_index, sort, join, axis):
ignore_index=ignore_index,
axis=axis,
),
check_index_type=True,
)


Expand Down Expand Up @@ -1247,6 +1285,7 @@ def test_concat_preserve_order():
assert_eq(
pd.concat(dfs, join="inner"),
gd.concat([gd.DataFrame(df) for df in dfs], join="inner"),
check_index_type=True,
)


Expand All @@ -1255,7 +1294,11 @@ def test_concat_preserve_order():
def test_concat_single_object(ignore_index, typ):
"""Ensure that concat on a single object does not change it."""
obj = typ([1, 2, 3])
assert_eq(gd.concat([obj], ignore_index=ignore_index, axis=0), obj)
assert_eq(
gd.concat([obj], ignore_index=ignore_index, axis=0),
obj,
check_index_type=True,
)


@pytest.mark.parametrize("ltype", [Decimal64Dtype(3, 1), Decimal64Dtype(7, 2)])
Expand All @@ -1277,7 +1320,7 @@ def test_concat_decimal_dataframe(ltype, rtype):
got = gd.concat([gdf1, gdf2])
expected = pd.concat([pdf1, pdf2])

assert_eq(expected, got)
assert_eq(expected, got, check_index_type=True)


@pytest.mark.parametrize("ltype", [Decimal64Dtype(4, 1), Decimal64Dtype(8, 2)])
Expand All @@ -1294,7 +1337,7 @@ def test_concat_decimal_series(ltype, rtype):
got = gd.concat([gs1, gs2])
expected = pd.concat([ps1, ps2])

assert_eq(expected, got)
assert_eq(expected, got, check_index_type=True)


@pytest.mark.parametrize(
Expand Down Expand Up @@ -1395,7 +1438,7 @@ def test_concat_decimal_series(ltype, rtype):
)
def test_concat_decimal_numeric_dataframe(df1, df2, df3, expected):
df = gd.concat([df1, df2, df3])
assert_eq(df, expected)
assert_eq(df, expected, check_index_type=True)
assert_eq(df.val.dtype, expected.val.dtype)


Expand Down Expand Up @@ -1487,7 +1530,7 @@ def test_concat_decimal_numeric_dataframe(df1, df2, df3, expected):
)
def test_concat_decimal_numeric_series(s1, s2, s3, expected):
s = gd.concat([s1, s2, s3])
assert_eq(s, expected)
assert_eq(s, expected, check_index_type=True)


@pytest.mark.parametrize(
Expand Down Expand Up @@ -1558,7 +1601,7 @@ def test_concat_decimal_numeric_series(s1, s2, s3, expected):
)
def test_concat_decimal_non_numeric(s1, s2, expected):
s = gd.concat([s1, s2])
assert_eq(s, expected)
assert_eq(s, expected, check_index_type=True)


@pytest.mark.parametrize(
Expand All @@ -1581,4 +1624,4 @@ def test_concat_decimal_non_numeric(s1, s2, expected):
)
def test_concat_struct_column(s1, s2, expected):
s = gd.concat([s1, s2])
assert_eq(s, expected)
assert_eq(s, expected, check_index_type=True)
2 changes: 1 addition & 1 deletion python/cudf/cudf/tests/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -1487,7 +1487,7 @@ def test_csv_writer_file_append(tmpdir):

result = cudf.read_csv(gdf_df_fname)
expected = cudf.concat([gdf1, gdf2], ignore_index=True)
assert_eq(result, expected)
assert_eq(result, expected, check_index_type=True)


def test_csv_writer_buffer(tmpdir):
Expand Down
Loading