Skip to content

Commit

Permalink
Merge pull request #14434 from rapidsai/branch-23.10
Browse files Browse the repository at this point in the history
[HOTFIX] v23.10.02
  • Loading branch information
raydouglass authored Nov 16, 2023
2 parents 1a0c076 + 4dc8300 commit 1304f94
Show file tree
Hide file tree
Showing 6 changed files with 75 additions and 21 deletions.
Binary file modified docs/cudf/source/_static/duckdb-benchmark-groupby-join.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
55 changes: 43 additions & 12 deletions docs/cudf/source/cudf_pandas/benchmarks.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@ We reproduced the [Database-like ops benchmark](https://duckdblabs.github.io/db-
including a solution using `cudf.pandas`. Here are the results:

<figure>
<img src="../_static/duckdb-benchmark-groupby-join.png"
class="align-center" width="750"
alt="_static/duckdb-benchmark-groupby-join.png" />

![duckdb-benchmark-groupby-join](../_static/duckdb-benchmark-groupby-join.png)

<figcaption style="text-align: center;">Results of the <a
href="https://duckdblabs.github.io/db-benchmark/">Database-like ops
benchmark</a> including <span
Expand Down Expand Up @@ -46,7 +46,7 @@ source pandas/py-pandas/bin/activate
pip install --extra-index-url=https://pypi.nvidia.com cudf-cu12 # or cudf-cu11
```

5. Modify pandas join/group code to use `cudf.pandas`:
5. Modify pandas join/group code to use `cudf.pandas` and be compatible with pandas 1.5 APIs:

```bash
diff --git a/pandas/groupby-pandas.py b/pandas/groupby-pandas.py
Expand All @@ -59,15 +59,46 @@ index 58eeb26..2ddb209 100755

print("# groupby-pandas.py", flush=True)

diff --git a/pandas/join-pandas.py b/pandas/join-pandas.py
index f39beb0..655dd82 100755
--- a/pandas/join-pandas.py
+++ b/pandas/join-pandas.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python3
+#!/usr/bin/env -S python3 -m cudf.pandas
diff --git a/pandas/join-pandas.py b/pandas/join-pandas.py
index f39beb0..a9ad651 100755
--- a/pandas/join-pandas.py
+++ b/pandas/join-pandas.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python3
+#!/usr/bin/env -S python3 -m cudf.pandas

print("# join-pandas.py", flush=True)
print("# join-pandas.py", flush=True)

@@ -26,7 +26,7 @@ if len(src_jn_y) != 3:

print("loading datasets " + data_name + ", " + y_data_name[0] + ", " + y_data_name[1] + ", " + y_data_name[2], flush=True)

-x = pd.read_csv(src_jn_x, engine='pyarrow', dtype_backend='pyarrow')
+x = pd.read_csv(src_jn_x, engine='pyarrow')

# x['id1'] = x['id1'].astype('Int32')
# x['id2'] = x['id2'].astype('Int32')
@@ -35,17 +35,17 @@ x['id4'] = x['id4'].astype('category') # remove after datatable#1691
x['id5'] = x['id5'].astype('category')
x['id6'] = x['id6'].astype('category')

-small = pd.read_csv(src_jn_y[0], engine='pyarrow', dtype_backend='pyarrow')
+small = pd.read_csv(src_jn_y[0], engine='pyarrow')
# small['id1'] = small['id1'].astype('Int32')
small['id4'] = small['id4'].astype('category')
# small['v2'] = small['v2'].astype('float64')
-medium = pd.read_csv(src_jn_y[1], engine='pyarrow', dtype_backend='pyarrow')
+medium = pd.read_csv(src_jn_y[1], engine='pyarrow')
# medium['id1'] = medium['id1'].astype('Int32')
# medium['id2'] = medium['id2'].astype('Int32')
medium['id4'] = medium['id4'].astype('category')
medium['id5'] = medium['id5'].astype('category')
# medium['v2'] = medium['v2'].astype('float64')
-big = pd.read_csv(src_jn_y[2], engine='pyarrow', dtype_backend='pyarrow')
+big = pd.read_csv(src_jn_y[2], engine='pyarrow')
# big['id1'] = big['id1'].astype('Int32')
# big['id2'] = big['id2'].astype('Int32')
# big['id3'] = big['id3'].astype('Int32')
```
6. Run the modified pandas benchmarks:
Expand Down
4 changes: 4 additions & 0 deletions python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -2606,6 +2606,10 @@ def _reindex(

df = self
if index is not None:
if not df._index.is_unique:
raise ValueError(
"cannot reindex on an axis with duplicate labels"
)
index = cudf.core.index.as_index(
index, name=getattr(index, "name", self._index.name)
)
Expand Down
12 changes: 12 additions & 0 deletions python/cudf/cudf/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -10723,3 +10723,15 @@ def test_dataframe_series_dot():
expected = gser @ [12, 13]

assert_eq(expected, actual)


def test_dataframe_duplicate_index_reindex():
    # Reindexing a frame whose index contains duplicate labels should fail
    # in cudf with the same exception pandas raises.
    gdf = cudf.DataFrame({"a": [0, 1, 2, 3]}, index=[0, 0, 1, 1])
    pdf = gdf.to_pandas()

    reindex_args = ([10, 11, 12, 13], {})
    assert_exceptions_equal(
        gdf.reindex,
        pdf.reindex,
        lfunc_args_and_kwargs=reindex_args,
        rfunc_args_and_kwargs=reindex_args,
    )
12 changes: 12 additions & 0 deletions python/cudf/cudf/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2638,3 +2638,15 @@ def test_series_setitem_mixed_bool_dtype():
s = cudf.Series([True, False, True])
with pytest.raises(TypeError):
s[0] = 10


def test_series_duplicate_index_reindex():
    # Reindexing a series whose index contains duplicate labels should fail
    # in cudf with the same exception pandas raises.
    gs = cudf.Series([0, 1, 2, 3], index=[0, 0, 1, 1])
    ps = gs.to_pandas()

    reindex_args = ([10, 11, 12, 13], {})
    assert_exceptions_equal(
        gs.reindex,
        ps.reindex,
        lfunc_args_and_kwargs=reindex_args,
        rfunc_args_and_kwargs=reindex_args,
    )
13 changes: 4 additions & 9 deletions python/dask_cudf/dask_cudf/backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -437,28 +437,23 @@ def union_categoricals_cudf(
)


@_dask_cudf_nvtx_annotate
def safe_hash(frame):
return cudf.Series(frame.hash_values(), index=frame.index)


@hash_object_dispatch.register((cudf.DataFrame, cudf.Series))
@_dask_cudf_nvtx_annotate
def hash_object_cudf(frame, index=True):
if index:
return safe_hash(frame.reset_index())
return safe_hash(frame)
frame = frame.reset_index()
return frame.hash_values()


@hash_object_dispatch.register(cudf.BaseIndex)
@_dask_cudf_nvtx_annotate
def hash_object_cudf_index(ind, index=None):

if isinstance(ind, cudf.MultiIndex):
return safe_hash(ind.to_frame(index=False))
return ind.to_frame(index=False).hash_values()

col = cudf.core.column.as_column(ind)
return safe_hash(cudf.Series(col))
return cudf.Series(col).hash_values()


@group_split_dispatch.register((cudf.Series, cudf.DataFrame))
Expand Down

0 comments on commit 1304f94

Please sign in to comment.