From 08a816c7b1398c9d3c21a97baa668aecaad02021 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Mon, 13 Sep 2021 13:06:35 -0700
Subject: [PATCH 1/2] Optimize concat when all inputs have a RangeIndex

---
 python/cudf/cudf/core/dataframe.py       | 21 +++++-
 python/cudf/cudf/tests/test_concat.py    | 83 ++++++++++++++++++------
 python/cudf/cudf/tests/test_csv.py       |  2 +-
 python/cudf/cudf/tests/test_dataframe.py | 54 +++++++++++----
 python/cudf/cudf/tests/test_orc.py       |  6 +-
 python/cudf/cudf/tests/test_query.py     |  2 +-
 6 files changed, 128 insertions(+), 40 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 4239a55118f..4ab9879e9ee 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -1058,6 +1058,10 @@ def _concat(
         # the number of empty input frames
         num_empty_input_frames = 0
 
+        # Flag indicating whether every input DataFrame has
+        # a RangeIndex as its index
+        are_all_range_index = False
+
         for i, obj in enumerate(objs):
             # shallow-copy the input DFs in case the same DF instance
             # is concatenated with itself
@@ -1076,6 +1080,10 @@ def _concat(
                 result_index_length += len(obj)
                 empty_has_index = empty_has_index or len(obj) > 0
 
+            are_all_range_index = (
+                True if i == 0 else are_all_range_index
+            ) and isinstance(obj.index, cudf.RangeIndex)
+
         if join == "inner":
             sets_of_column_names = [set(obj._column_names) for obj in objs]
 
@@ -1150,12 +1158,14 @@ def _concat(
         columns = [
             (
                 []
-                if (ignore_index and not empty_has_index)
+                if are_all_range_index
+                or (ignore_index and not empty_has_index)
                 else list(f._index._data.columns)
             )
             + [f._data[name] if name in f._data else None for name in names]
             for f in objs
         ]
+        # import pdb;pdb.set_trace()
 
         # Get a list of the combined index and table column indices
         indices = list(range(functools.reduce(max, map(len, columns))))
@@ -1205,7 +1215,9 @@ def _concat(
 
         # Concatenate the Tables
         out = cls._from_data(
-            *libcudf.concat.concat_tables(tables, ignore_index)
+            *libcudf.concat.concat_tables(
+                tables, ignore_index=ignore_index or are_all_range_index
+            )
         )
 
         # If ignore_index is True, all input frames are empty, and at
@@ -1213,6 +1225,11 @@ def _concat(
         # to the result frame.
         if empty_has_index and num_empty_input_frames == len(objs):
             out._index = cudf.RangeIndex(result_index_length)
+        elif are_all_range_index and not ignore_index:
+            out._index = cudf.core.index.GenericIndex._concat(
+                [o._index for o in objs]
+            )
+
         # Reassign the categories for any categorical table cols
         _reassign_categories(
             categories, out._data, indices[first_data_column_position:]
diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py
index 3983e8a5f4a..5f2273156ed 100644
--- a/python/cudf/cudf/tests/test_concat.py
+++ b/python/cudf/cudf/tests/test_concat.py
@@ -64,13 +64,25 @@ def test_concat_dataframe(index, nulls, axis):
     # DataFrame
     res = gd.concat([gdf, gdf2, gdf, gdf_empty1], axis=axis).to_pandas()
     sol = pd.concat([df, df2, df, df_empty1], axis=axis)
-    assert_eq(res, sol, check_names=False, check_categorical=False)
+    assert_eq(
+        res,
+        sol,
+        check_names=False,
+        check_categorical=False,
+        check_index_type=True,
+    )
 
     # Series
     for c in [i for i in ("x", "y", "z") if i != index]:
         res = gd.concat([gdf[c], gdf2[c], gdf[c]], axis=axis).to_pandas()
         sol = pd.concat([df[c], df2[c], df[c]], axis=axis)
-        assert_eq(res, sol, check_names=False, check_categorical=False)
+        assert_eq(
+            res,
+            sol,
+            check_names=False,
+            check_categorical=False,
+            check_index_type=True,
+        )
 
     # Index
     res = gd.concat([gdf.index, gdf2.index], axis=axis).to_pandas()
@@ -91,7 +103,13 @@ def test_concat_all_nulls(values):
     gb = gd.Series([None])
     gs = gd.concat([ga, gb])
 
-    assert_eq(ps, gs, check_dtype=False, check_categorical=False)
+    assert_eq(
+        ps,
+        gs,
+        check_dtype=False,
+        check_categorical=False,
+        check_index_type=True,
+    )
 
 
 def test_concat_errors():
@@ -167,7 +185,13 @@ def test_concat_misordered_columns():
     res = gd.concat([gdf, gdf2]).to_pandas()
     sol = pd.concat([df, df2], sort=False)
 
-    assert_eq(res, sol, check_names=False, check_categorical=False)
+    assert_eq(
+        res,
+        sol,
+        check_names=False,
+        check_categorical=False,
+        check_index_type=True,
+    )
 
 
 @pytest.mark.parametrize("axis", [1, "columns"])
@@ -182,7 +206,7 @@ def test_concat_columns(axis):
     expect = pd.concat([pdf1, pdf2], axis=axis)
     got = gd.concat([gdf1, gdf2], axis=axis)
 
-    assert_eq(expect, got)
+    assert_eq(expect, got, check_index_type=True)
 
 
 def test_concat_multiindex_dataframe():
@@ -201,7 +225,9 @@ def test_concat_multiindex_dataframe():
     gdg1 = gd.from_pandas(pdg1)
     gdg2 = gd.from_pandas(pdg2)
     assert_eq(
-        gd.concat([gdg1, gdg2]).astype("float64"), pd.concat([pdg1, pdg2])
+        gd.concat([gdg1, gdg2]).astype("float64"),
+        pd.concat([pdg1, pdg2]),
+        check_index_type=True,
     )
     assert_eq(gd.concat([gdg1, gdg2], axis=1), pd.concat([pdg1, pdg2], axis=1))
 
@@ -221,7 +247,9 @@ def test_concat_multiindex_series():
     pdg2 = pdg["z"]
     gdg1 = gd.from_pandas(pdg1)
     gdg2 = gd.from_pandas(pdg2)
-    assert_eq(gd.concat([gdg1, gdg2]), pd.concat([pdg1, pdg2]))
+    assert_eq(
+        gd.concat([gdg1, gdg2]), pd.concat([pdg1, pdg2]), check_index_type=True
+    )
     assert_eq(gd.concat([gdg1, gdg2], axis=1), pd.concat([pdg1, pdg2], axis=1))
 
 
@@ -363,10 +391,19 @@ def test_concat_mixed_input():
     assert_eq(
         pd.concat([pdf1, None, pdf2, None]),
         gd.concat([gdf1, None, gdf2, None]),
+        check_index_type=True,
+    )
+    assert_eq(
+        pd.concat([pdf1, None]), gd.concat([gdf1, None]), check_index_type=True
+    )
+    assert_eq(
+        pd.concat([None, pdf2]), gd.concat([None, gdf2]), check_index_type=True
+    )
+    assert_eq(
+        pd.concat([None, pdf2, pdf1]),
+        gd.concat([None, gdf2, gdf1]),
+        check_index_type=True,
     )
-    assert_eq(pd.concat([pdf1, None]), gd.concat([gdf1, None]))
-    assert_eq(pd.concat([None, pdf2]), gd.concat([None, gdf2]))
-    assert_eq(pd.concat([None, pdf2, pdf1]), gd.concat([None, gdf2, gdf1]))
 
 
 @pytest.mark.parametrize(
@@ -540,7 +577,7 @@ def test_concat_empty_dataframes(df, other, ignore_index):
             else:
                 expected[key] = expected[key].fillna(-1)
                 actual[key] = col.fillna(-1)
-        assert_eq(expected, actual, check_dtype=False)
+        assert_eq(expected, actual, check_dtype=False, check_index_type=True)
     else:
         assert_eq(
             expected, actual, check_index_type=False if gdf.empty else True
@@ -564,7 +601,7 @@ def test_concat_empty_and_nonempty_series(ignore_index, data, axis):
     got = gd.concat([s1, s2], axis=axis, ignore_index=ignore_index)
     expect = pd.concat([ps1, ps2], axis=axis, ignore_index=ignore_index)
 
-    assert_eq(got, expect)
+    assert_eq(got, expect, check_index_type=True)
 
 
 @pytest.mark.parametrize("ignore_index", [True, False])
@@ -577,7 +614,7 @@ def test_concat_two_empty_series(ignore_index, axis):
     got = gd.concat([s1, s2], axis=axis, ignore_index=ignore_index)
     expect = pd.concat([ps1, ps2], axis=axis, ignore_index=ignore_index)
 
-    assert_eq(got, expect)
+    assert_eq(got, expect, check_index_type=True)
 
 
 @pytest.mark.parametrize(
@@ -670,6 +707,7 @@ def test_concat_join(objs, ignore_index, sort, join, axis):
             ignore_index=ignore_index,
             axis=axis,
         ),
+        check_index_type=True,
     )
 
 
@@ -1247,6 +1285,7 @@ def test_concat_preserve_order():
     assert_eq(
         pd.concat(dfs, join="inner"),
         gd.concat([gd.DataFrame(df) for df in dfs], join="inner"),
+        check_index_type=True,
     )
 
 
@@ -1255,7 +1294,11 @@ def test_concat_preserve_order():
 def test_concat_single_object(ignore_index, typ):
     """Ensure that concat on a single object does not change it."""
     obj = typ([1, 2, 3])
-    assert_eq(gd.concat([obj], ignore_index=ignore_index, axis=0), obj)
+    assert_eq(
+        gd.concat([obj], ignore_index=ignore_index, axis=0),
+        obj,
+        check_index_type=True,
+    )
 
 
 @pytest.mark.parametrize("ltype", [Decimal64Dtype(3, 1), Decimal64Dtype(7, 2)])
@@ -1277,7 +1320,7 @@ def test_concat_decimal_dataframe(ltype, rtype):
     got = gd.concat([gdf1, gdf2])
     expected = pd.concat([pdf1, pdf2])
 
-    assert_eq(expected, got)
+    assert_eq(expected, got, check_index_type=True)
 
 
 @pytest.mark.parametrize("ltype", [Decimal64Dtype(4, 1), Decimal64Dtype(8, 2)])
@@ -1294,7 +1337,7 @@ def test_concat_decimal_series(ltype, rtype):
     got = gd.concat([gs1, gs2])
     expected = pd.concat([ps1, ps2])
 
-    assert_eq(expected, got)
+    assert_eq(expected, got, check_index_type=True)
 
 
 @pytest.mark.parametrize(
@@ -1395,7 +1438,7 @@ def test_concat_decimal_series(ltype, rtype):
 )
 def test_concat_decimal_numeric_dataframe(df1, df2, df3, expected):
     df = gd.concat([df1, df2, df3])
-    assert_eq(df, expected)
+    assert_eq(df, expected, check_index_type=True)
     assert_eq(df.val.dtype, expected.val.dtype)
 
 
@@ -1487,7 +1530,7 @@ def test_concat_decimal_numeric_dataframe(df1, df2, df3, expected):
 )
 def test_concat_decimal_numeric_series(s1, s2, s3, expected):
     s = gd.concat([s1, s2, s3])
-    assert_eq(s, expected)
+    assert_eq(s, expected, check_index_type=True)
 
 
 @pytest.mark.parametrize(
@@ -1558,7 +1601,7 @@ def test_concat_decimal_numeric_series(s1, s2, s3, expected):
 )
 def test_concat_decimal_non_numeric(s1, s2, expected):
     s = gd.concat([s1, s2])
-    assert_eq(s, expected)
+    assert_eq(s, expected, check_index_type=True)
 
 
 @pytest.mark.parametrize(
@@ -1581,4 +1624,4 @@ def test_concat_decimal_non_numeric(s1, s2, expected):
 )
 def test_concat_struct_column(s1, s2, expected):
     s = gd.concat([s1, s2])
-    assert_eq(s, expected)
+    assert_eq(s, expected, check_index_type=True)
diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py
index dca18207e54..2eb59616253 100644
--- a/python/cudf/cudf/tests/test_csv.py
+++ b/python/cudf/cudf/tests/test_csv.py
@@ -1487,7 +1487,7 @@ def test_csv_writer_file_append(tmpdir):
 
     result = cudf.read_csv(gdf_df_fname)
     expected = cudf.concat([gdf1, gdf2], ignore_index=True)
-    assert_eq(result, expected)
+    assert_eq(result, expected, check_index_type=True)
 
 
 def test_csv_writer_buffer(tmpdir):
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 3b74fe91e05..03a140c138b 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -1241,7 +1241,7 @@ def test_dataframe_concat_different_numerical_columns(dtype1, dtype2):
     else:
         pres = pd.concat([df1, df2])
         gres = cudf.concat([cudf.from_pandas(df1), cudf.from_pandas(df2)])
-        assert_eq(pres, gres, check_dtype=False)
+        assert_eq(pres, gres, check_dtype=False, check_index_type=True)
 
 
 def test_dataframe_concat_different_column_types():
@@ -1269,7 +1269,7 @@ def test_concat_empty_dataframe(df_1, df_2):
     # ignoring dtypes as pandas upcasts int to float
     # on concatenation with empty dataframes
 
-    assert_eq(got, expect, check_dtype=False)
+    assert_eq(got, expect, check_dtype=False, check_index_type=True)
 
 
 @pytest.mark.parametrize(
@@ -1307,7 +1307,7 @@ def test_concat_different_column_dataframe(df1_d, df2_d):
     for col in numeric_cols:
         got[col] = got[col].astype(np.float64).fillna(np.nan)
 
-    assert_eq(got, expect, check_dtype=False)
+    assert_eq(got, expect, check_dtype=False, check_index_type=True)
 
 
 @pytest.mark.parametrize(
@@ -1318,7 +1318,7 @@ def test_concat_empty_series(ser_1, ser_2):
     got = cudf.concat([cudf.Series(ser_1), cudf.Series(ser_2)])
     expect = pd.concat([ser_1, ser_2])
 
-    assert_eq(got, expect)
+    assert_eq(got, expect, check_index_type=True)
 
 
 def test_concat_with_axis():
@@ -1331,7 +1331,7 @@ def test_concat_with_axis():
 
     # concat only dataframes
     concat_cdf = cudf.concat([cdf1, cdf2], axis=1)
-    assert_eq(concat_cdf, concat_df)
+    assert_eq(concat_cdf, concat_df, check_index_type=True)
 
     # concat only series
     concat_s = pd.concat([df1.x, df1.y], axis=1)
@@ -1339,7 +1339,7 @@ def test_concat_with_axis():
     cs2 = cudf.Series.from_pandas(df1.y)
     concat_cdf_s = cudf.concat([cs1, cs2], axis=1)
 
-    assert_eq(concat_cdf_s, concat_s)
+    assert_eq(concat_cdf_s, concat_s, check_index_type=True)
 
     # concat series and dataframes
     s3 = pd.Series(np.random.random(5))
@@ -1347,7 +1347,7 @@ def test_concat_with_axis():
 
     concat_cdf_all = cudf.concat([cdf1, cs3, cdf2], axis=1)
     concat_df_all = pd.concat([df1, s3, df2], axis=1)
-    assert_eq(concat_cdf_all, concat_df_all)
+    assert_eq(concat_cdf_all, concat_df_all, check_index_type=True)
 
     # concat manual multi index
     midf1 = cudf.from_pandas(df1)
@@ -1361,10 +1361,20 @@ def test_concat_with_axis():
     mipdf1 = midf1.to_pandas()
     mipdf2 = midf2.to_pandas()
 
-    assert_eq(cudf.concat([midf1, midf2]), pd.concat([mipdf1, mipdf2]))
-    assert_eq(cudf.concat([midf2, midf1]), pd.concat([mipdf2, mipdf1]))
     assert_eq(
-        cudf.concat([midf1, midf2, midf1]), pd.concat([mipdf1, mipdf2, mipdf1])
+        cudf.concat([midf1, midf2]),
+        pd.concat([mipdf1, mipdf2]),
+        check_index_type=True,
+    )
+    assert_eq(
+        cudf.concat([midf2, midf1]),
+        pd.concat([mipdf2, mipdf1]),
+        check_index_type=True,
+    )
+    assert_eq(
+        cudf.concat([midf1, midf2, midf1]),
+        pd.concat([mipdf1, mipdf2, mipdf1]),
+        check_index_type=True,
     )
 
     # concat groupby multi index
@@ -1382,8 +1392,16 @@ def test_concat_with_axis():
     pdg1 = gdg1.to_pandas()
     pdg2 = gdg2.to_pandas()
 
-    assert_eq(cudf.concat([gdg1, gdg2]), pd.concat([pdg1, pdg2]))
-    assert_eq(cudf.concat([gdg2, gdg1]), pd.concat([pdg2, pdg1]))
+    assert_eq(
+        cudf.concat([gdg1, gdg2]),
+        pd.concat([pdg1, pdg2]),
+        check_index_type=True,
+    )
+    assert_eq(
+        cudf.concat([gdg2, gdg1]),
+        pd.concat([pdg2, pdg1]),
+        check_index_type=True,
+    )
 
     # series multi index concat
     gdgz1 = gdg1.z
@@ -1391,8 +1409,16 @@ def test_concat_with_axis():
     pdgz1 = gdgz1.to_pandas()
     pdgz2 = gdgz2.to_pandas()
 
-    assert_eq(cudf.concat([gdgz1, gdgz2]), pd.concat([pdgz1, pdgz2]))
-    assert_eq(cudf.concat([gdgz2, gdgz1]), pd.concat([pdgz2, pdgz1]))
+    assert_eq(
+        cudf.concat([gdgz1, gdgz2]),
+        pd.concat([pdgz1, pdgz2]),
+        check_index_type=True,
+    )
+    assert_eq(
+        cudf.concat([gdgz2, gdgz1]),
+        pd.concat([pdgz2, pdgz1]),
+        check_index_type=True,
+    )
 
 
 @pytest.mark.parametrize("nrows", [0, 3, 10, 100, 1000])
diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py
index d913722a17e..2d4dc55bd28 100644
--- a/python/cudf/cudf/tests/test_orc.py
+++ b/python/cudf/cudf/tests/test_orc.py
@@ -260,7 +260,7 @@ def test_orc_read_stripes(datadir, engine):
         for i in range(stripes)
     ]
     gdf = cudf.concat(gdf).reset_index(drop=True)
-    assert_eq(pdf, gdf, check_categorical=False)
+    assert_eq(pdf, gdf, check_categorical=False, check_index_type=True)
 
     # Read stripes all at once
     gdf = cudf.read_orc(
@@ -273,7 +273,9 @@ def test_orc_read_stripes(datadir, engine):
     assert_eq(gdf, pdf.head(25000))
     gdf = cudf.read_orc(path, engine=engine, stripes=[[0, stripes - 1]])
     assert_eq(
-        gdf, cudf.concat([pdf.head(15000), pdf.tail(10000)], ignore_index=True)
+        gdf,
+        cudf.concat([pdf.head(15000), pdf.tail(10000)], ignore_index=True),
+        check_index_type=True,
     )
 
 
diff --git a/python/cudf/cudf/tests/test_query.py b/python/cudf/cudf/tests/test_query.py
index 07c6cce5cd3..2e8de9b5d50 100644
--- a/python/cudf/cudf/tests/test_query.py
+++ b/python/cudf/cudf/tests/test_query.py
@@ -141,7 +141,7 @@ def test_query_splitted_combine():
 
     # Should equal to just querying the original GDF
     expect = gdf.query(expr).to_pandas()
-    assert_eq(got, expect)
+    assert_eq(got, expect, check_index_type=True)
 
 
 def test_query_empty_frames():

From f3f134dd6c3406a7e2d6362765e40291319c6965 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Mon, 13 Sep 2021 15:09:33 -0500
Subject: [PATCH 2/2] Remove leftover pdb debugging comment from dataframe.py

---
 python/cudf/cudf/core/dataframe.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 4ab9879e9ee..0da31cfe583 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -1165,7 +1165,6 @@ def _concat(
             + [f._data[name] if name in f._data else None for name in names]
             for f in objs
         ]
-        # import pdb;pdb.set_trace()
 
         # Get a list of the combined index and table column indices
         indices = list(range(functools.reduce(max, map(len, columns))))