From 08a816c7b1398c9d3c21a97baa668aecaad02021 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 13 Sep 2021 13:06:35 -0700 Subject: [PATCH 1/2] optimize concat --- python/cudf/cudf/core/dataframe.py | 21 +++++- python/cudf/cudf/tests/test_concat.py | 83 ++++++++++++++++++------ python/cudf/cudf/tests/test_csv.py | 2 +- python/cudf/cudf/tests/test_dataframe.py | 54 +++++++++++---- python/cudf/cudf/tests/test_orc.py | 6 +- python/cudf/cudf/tests/test_query.py | 2 +- 6 files changed, 128 insertions(+), 40 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 4239a55118f..4ab9879e9ee 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1058,6 +1058,10 @@ def _concat( # the number of empty input frames num_empty_input_frames = 0 + # flag to indicate if all DataFrames have + # RangeIndex as their index + are_all_range_index = False + for i, obj in enumerate(objs): # shallow-copy the input DFs in case the same DF instance # is concatenated with itself @@ -1076,6 +1080,10 @@ def _concat( result_index_length += len(obj) empty_has_index = empty_has_index or len(obj) > 0 + are_all_range_index = ( + True if i == 0 else are_all_range_index + ) and isinstance(obj.index, cudf.RangeIndex) + if join == "inner": sets_of_column_names = [set(obj._column_names) for obj in objs] @@ -1150,12 +1158,14 @@ def _concat( columns = [ ( [] - if (ignore_index and not empty_has_index) + if are_all_range_index + or (ignore_index and not empty_has_index) else list(f._index._data.columns) ) + [f._data[name] if name in f._data else None for name in names] for f in objs ] + # import pdb;pdb.set_trace() # Get a list of the combined index and table column indices indices = list(range(functools.reduce(max, map(len, columns)))) @@ -1205,7 +1215,9 @@ def _concat( # Concatenate the Tables out = cls._from_data( - *libcudf.concat.concat_tables(tables, ignore_index) + *libcudf.concat.concat_tables( + tables, 
ignore_index=ignore_index or are_all_range_index + ) ) # If ignore_index is True, all input frames are empty, and at @@ -1213,6 +1225,11 @@ def _concat( # to the result frame. if empty_has_index and num_empty_input_frames == len(objs): out._index = cudf.RangeIndex(result_index_length) + elif are_all_range_index and not ignore_index: + out._index = cudf.core.index.GenericIndex._concat( + [o._index for o in objs] + ) + # Reassign the categories for any categorical table cols _reassign_categories( categories, out._data, indices[first_data_column_position:] diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py index 3983e8a5f4a..5f2273156ed 100644 --- a/python/cudf/cudf/tests/test_concat.py +++ b/python/cudf/cudf/tests/test_concat.py @@ -64,13 +64,25 @@ def test_concat_dataframe(index, nulls, axis): # DataFrame res = gd.concat([gdf, gdf2, gdf, gdf_empty1], axis=axis).to_pandas() sol = pd.concat([df, df2, df, df_empty1], axis=axis) - assert_eq(res, sol, check_names=False, check_categorical=False) + assert_eq( + res, + sol, + check_names=False, + check_categorical=False, + check_index_type=True, + ) # Series for c in [i for i in ("x", "y", "z") if i != index]: res = gd.concat([gdf[c], gdf2[c], gdf[c]], axis=axis).to_pandas() sol = pd.concat([df[c], df2[c], df[c]], axis=axis) - assert_eq(res, sol, check_names=False, check_categorical=False) + assert_eq( + res, + sol, + check_names=False, + check_categorical=False, + check_index_type=True, + ) # Index res = gd.concat([gdf.index, gdf2.index], axis=axis).to_pandas() @@ -91,7 +103,13 @@ def test_concat_all_nulls(values): gb = gd.Series([None]) gs = gd.concat([ga, gb]) - assert_eq(ps, gs, check_dtype=False, check_categorical=False) + assert_eq( + ps, + gs, + check_dtype=False, + check_categorical=False, + check_index_type=True, + ) def test_concat_errors(): @@ -167,7 +185,13 @@ def test_concat_misordered_columns(): res = gd.concat([gdf, gdf2]).to_pandas() sol = pd.concat([df, df2], 
sort=False) - assert_eq(res, sol, check_names=False, check_categorical=False) + assert_eq( + res, + sol, + check_names=False, + check_categorical=False, + check_index_type=True, + ) @pytest.mark.parametrize("axis", [1, "columns"]) @@ -182,7 +206,7 @@ def test_concat_columns(axis): expect = pd.concat([pdf1, pdf2], axis=axis) got = gd.concat([gdf1, gdf2], axis=axis) - assert_eq(expect, got) + assert_eq(expect, got, check_index_type=True) def test_concat_multiindex_dataframe(): @@ -201,7 +225,9 @@ def test_concat_multiindex_dataframe(): gdg1 = gd.from_pandas(pdg1) gdg2 = gd.from_pandas(pdg2) assert_eq( - gd.concat([gdg1, gdg2]).astype("float64"), pd.concat([pdg1, pdg2]) + gd.concat([gdg1, gdg2]).astype("float64"), + pd.concat([pdg1, pdg2]), + check_index_type=True, ) assert_eq(gd.concat([gdg1, gdg2], axis=1), pd.concat([pdg1, pdg2], axis=1)) @@ -221,7 +247,9 @@ def test_concat_multiindex_series(): pdg2 = pdg["z"] gdg1 = gd.from_pandas(pdg1) gdg2 = gd.from_pandas(pdg2) - assert_eq(gd.concat([gdg1, gdg2]), pd.concat([pdg1, pdg2])) + assert_eq( + gd.concat([gdg1, gdg2]), pd.concat([pdg1, pdg2]), check_index_type=True + ) assert_eq(gd.concat([gdg1, gdg2], axis=1), pd.concat([pdg1, pdg2], axis=1)) @@ -363,10 +391,19 @@ def test_concat_mixed_input(): assert_eq( pd.concat([pdf1, None, pdf2, None]), gd.concat([gdf1, None, gdf2, None]), + check_index_type=True, + ) + assert_eq( + pd.concat([pdf1, None]), gd.concat([gdf1, None]), check_index_type=True + ) + assert_eq( + pd.concat([None, pdf2]), gd.concat([None, gdf2]), check_index_type=True + ) + assert_eq( + pd.concat([None, pdf2, pdf1]), + gd.concat([None, gdf2, gdf1]), + check_index_type=True, ) - assert_eq(pd.concat([pdf1, None]), gd.concat([gdf1, None])) - assert_eq(pd.concat([None, pdf2]), gd.concat([None, gdf2])) - assert_eq(pd.concat([None, pdf2, pdf1]), gd.concat([None, gdf2, gdf1])) @pytest.mark.parametrize( @@ -540,7 +577,7 @@ def test_concat_empty_dataframes(df, other, ignore_index): else: expected[key] = 
expected[key].fillna(-1) actual[key] = col.fillna(-1) - assert_eq(expected, actual, check_dtype=False) + assert_eq(expected, actual, check_dtype=False, check_index_type=True) else: assert_eq( expected, actual, check_index_type=False if gdf.empty else True @@ -564,7 +601,7 @@ def test_concat_empty_and_nonempty_series(ignore_index, data, axis): got = gd.concat([s1, s2], axis=axis, ignore_index=ignore_index) expect = pd.concat([ps1, ps2], axis=axis, ignore_index=ignore_index) - assert_eq(got, expect) + assert_eq(got, expect, check_index_type=True) @pytest.mark.parametrize("ignore_index", [True, False]) @@ -577,7 +614,7 @@ def test_concat_two_empty_series(ignore_index, axis): got = gd.concat([s1, s2], axis=axis, ignore_index=ignore_index) expect = pd.concat([ps1, ps2], axis=axis, ignore_index=ignore_index) - assert_eq(got, expect) + assert_eq(got, expect, check_index_type=True) @pytest.mark.parametrize( @@ -670,6 +707,7 @@ def test_concat_join(objs, ignore_index, sort, join, axis): ignore_index=ignore_index, axis=axis, ), + check_index_type=True, ) @@ -1247,6 +1285,7 @@ def test_concat_preserve_order(): assert_eq( pd.concat(dfs, join="inner"), gd.concat([gd.DataFrame(df) for df in dfs], join="inner"), + check_index_type=True, ) @@ -1255,7 +1294,11 @@ def test_concat_preserve_order(): def test_concat_single_object(ignore_index, typ): """Ensure that concat on a single object does not change it.""" obj = typ([1, 2, 3]) - assert_eq(gd.concat([obj], ignore_index=ignore_index, axis=0), obj) + assert_eq( + gd.concat([obj], ignore_index=ignore_index, axis=0), + obj, + check_index_type=True, + ) @pytest.mark.parametrize("ltype", [Decimal64Dtype(3, 1), Decimal64Dtype(7, 2)]) @@ -1277,7 +1320,7 @@ def test_concat_decimal_dataframe(ltype, rtype): got = gd.concat([gdf1, gdf2]) expected = pd.concat([pdf1, pdf2]) - assert_eq(expected, got) + assert_eq(expected, got, check_index_type=True) @pytest.mark.parametrize("ltype", [Decimal64Dtype(4, 1), Decimal64Dtype(8, 2)]) @@ -1294,7 
+1337,7 @@ def test_concat_decimal_series(ltype, rtype): got = gd.concat([gs1, gs2]) expected = pd.concat([ps1, ps2]) - assert_eq(expected, got) + assert_eq(expected, got, check_index_type=True) @pytest.mark.parametrize( @@ -1395,7 +1438,7 @@ def test_concat_decimal_series(ltype, rtype): ) def test_concat_decimal_numeric_dataframe(df1, df2, df3, expected): df = gd.concat([df1, df2, df3]) - assert_eq(df, expected) + assert_eq(df, expected, check_index_type=True) assert_eq(df.val.dtype, expected.val.dtype) @@ -1487,7 +1530,7 @@ def test_concat_decimal_numeric_dataframe(df1, df2, df3, expected): ) def test_concat_decimal_numeric_series(s1, s2, s3, expected): s = gd.concat([s1, s2, s3]) - assert_eq(s, expected) + assert_eq(s, expected, check_index_type=True) @pytest.mark.parametrize( @@ -1558,7 +1601,7 @@ def test_concat_decimal_numeric_series(s1, s2, s3, expected): ) def test_concat_decimal_non_numeric(s1, s2, expected): s = gd.concat([s1, s2]) - assert_eq(s, expected) + assert_eq(s, expected, check_index_type=True) @pytest.mark.parametrize( @@ -1581,4 +1624,4 @@ def test_concat_decimal_non_numeric(s1, s2, expected): ) def test_concat_struct_column(s1, s2, expected): s = gd.concat([s1, s2]) - assert_eq(s, expected) + assert_eq(s, expected, check_index_type=True) diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index dca18207e54..2eb59616253 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -1487,7 +1487,7 @@ def test_csv_writer_file_append(tmpdir): result = cudf.read_csv(gdf_df_fname) expected = cudf.concat([gdf1, gdf2], ignore_index=True) - assert_eq(result, expected) + assert_eq(result, expected, check_index_type=True) def test_csv_writer_buffer(tmpdir): diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 3b74fe91e05..03a140c138b 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -1241,7 
+1241,7 @@ def test_dataframe_concat_different_numerical_columns(dtype1, dtype2): else: pres = pd.concat([df1, df2]) gres = cudf.concat([cudf.from_pandas(df1), cudf.from_pandas(df2)]) - assert_eq(pres, gres, check_dtype=False) + assert_eq(pres, gres, check_dtype=False, check_index_type=True) def test_dataframe_concat_different_column_types(): @@ -1269,7 +1269,7 @@ def test_concat_empty_dataframe(df_1, df_2): # ignoring dtypes as pandas upcasts int to float # on concatenation with empty dataframes - assert_eq(got, expect, check_dtype=False) + assert_eq(got, expect, check_dtype=False, check_index_type=True) @pytest.mark.parametrize( @@ -1307,7 +1307,7 @@ def test_concat_different_column_dataframe(df1_d, df2_d): for col in numeric_cols: got[col] = got[col].astype(np.float64).fillna(np.nan) - assert_eq(got, expect, check_dtype=False) + assert_eq(got, expect, check_dtype=False, check_index_type=True) @pytest.mark.parametrize( @@ -1318,7 +1318,7 @@ def test_concat_empty_series(ser_1, ser_2): got = cudf.concat([cudf.Series(ser_1), cudf.Series(ser_2)]) expect = pd.concat([ser_1, ser_2]) - assert_eq(got, expect) + assert_eq(got, expect, check_index_type=True) def test_concat_with_axis(): @@ -1331,7 +1331,7 @@ def test_concat_with_axis(): # concat only dataframes concat_cdf = cudf.concat([cdf1, cdf2], axis=1) - assert_eq(concat_cdf, concat_df) + assert_eq(concat_cdf, concat_df, check_index_type=True) # concat only series concat_s = pd.concat([df1.x, df1.y], axis=1) @@ -1339,7 +1339,7 @@ def test_concat_with_axis(): cs2 = cudf.Series.from_pandas(df1.y) concat_cdf_s = cudf.concat([cs1, cs2], axis=1) - assert_eq(concat_cdf_s, concat_s) + assert_eq(concat_cdf_s, concat_s, check_index_type=True) # concat series and dataframes s3 = pd.Series(np.random.random(5)) @@ -1347,7 +1347,7 @@ def test_concat_with_axis(): concat_cdf_all = cudf.concat([cdf1, cs3, cdf2], axis=1) concat_df_all = pd.concat([df1, s3, df2], axis=1) - assert_eq(concat_cdf_all, concat_df_all) + 
assert_eq(concat_cdf_all, concat_df_all, check_index_type=True) # concat manual multi index midf1 = cudf.from_pandas(df1) @@ -1361,10 +1361,20 @@ def test_concat_with_axis(): mipdf1 = midf1.to_pandas() mipdf2 = midf2.to_pandas() - assert_eq(cudf.concat([midf1, midf2]), pd.concat([mipdf1, mipdf2])) - assert_eq(cudf.concat([midf2, midf1]), pd.concat([mipdf2, mipdf1])) assert_eq( - cudf.concat([midf1, midf2, midf1]), pd.concat([mipdf1, mipdf2, mipdf1]) + cudf.concat([midf1, midf2]), + pd.concat([mipdf1, mipdf2]), + check_index_type=True, + ) + assert_eq( + cudf.concat([midf2, midf1]), + pd.concat([mipdf2, mipdf1]), + check_index_type=True, + ) + assert_eq( + cudf.concat([midf1, midf2, midf1]), + pd.concat([mipdf1, mipdf2, mipdf1]), + check_index_type=True, ) # concat groupby multi index @@ -1382,8 +1392,16 @@ def test_concat_with_axis(): pdg1 = gdg1.to_pandas() pdg2 = gdg2.to_pandas() - assert_eq(cudf.concat([gdg1, gdg2]), pd.concat([pdg1, pdg2])) - assert_eq(cudf.concat([gdg2, gdg1]), pd.concat([pdg2, pdg1])) + assert_eq( + cudf.concat([gdg1, gdg2]), + pd.concat([pdg1, pdg2]), + check_index_type=True, + ) + assert_eq( + cudf.concat([gdg2, gdg1]), + pd.concat([pdg2, pdg1]), + check_index_type=True, + ) # series multi index concat gdgz1 = gdg1.z @@ -1391,8 +1409,16 @@ def test_concat_with_axis(): pdgz1 = gdgz1.to_pandas() pdgz2 = gdgz2.to_pandas() - assert_eq(cudf.concat([gdgz1, gdgz2]), pd.concat([pdgz1, pdgz2])) - assert_eq(cudf.concat([gdgz2, gdgz1]), pd.concat([pdgz2, pdgz1])) + assert_eq( + cudf.concat([gdgz1, gdgz2]), + pd.concat([pdgz1, pdgz2]), + check_index_type=True, + ) + assert_eq( + cudf.concat([gdgz2, gdgz1]), + pd.concat([pdgz2, pdgz1]), + check_index_type=True, + ) @pytest.mark.parametrize("nrows", [0, 3, 10, 100, 1000]) diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index d913722a17e..2d4dc55bd28 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -260,7 +260,7 @@ def 
test_orc_read_stripes(datadir, engine): for i in range(stripes) ] gdf = cudf.concat(gdf).reset_index(drop=True) - assert_eq(pdf, gdf, check_categorical=False) + assert_eq(pdf, gdf, check_categorical=False, check_index_type=True) # Read stripes all at once gdf = cudf.read_orc( @@ -273,7 +273,9 @@ def test_orc_read_stripes(datadir, engine): assert_eq(gdf, pdf.head(25000)) gdf = cudf.read_orc(path, engine=engine, stripes=[[0, stripes - 1]]) assert_eq( - gdf, cudf.concat([pdf.head(15000), pdf.tail(10000)], ignore_index=True) + gdf, + cudf.concat([pdf.head(15000), pdf.tail(10000)], ignore_index=True), + check_index_type=True, ) diff --git a/python/cudf/cudf/tests/test_query.py b/python/cudf/cudf/tests/test_query.py index 07c6cce5cd3..2e8de9b5d50 100644 --- a/python/cudf/cudf/tests/test_query.py +++ b/python/cudf/cudf/tests/test_query.py @@ -141,7 +141,7 @@ def test_query_splitted_combine(): # Should equal to just querying the original GDF expect = gdf.query(expr).to_pandas() - assert_eq(got, expect) + assert_eq(got, expect, check_index_type=True) def test_query_empty_frames(): From f3f134dd6c3406a7e2d6362765e40291319c6965 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 13 Sep 2021 15:09:33 -0500 Subject: [PATCH 2/2] Update dataframe.py --- python/cudf/cudf/core/dataframe.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 4ab9879e9ee..0da31cfe583 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1165,7 +1165,6 @@ def _concat( + [f._data[name] if name in f._data else None for name in names] for f in objs ] - # import pdb;pdb.set_trace() # Get a list of the combined index and table column indices indices = list(range(functools.reduce(max, map(len, columns))))