Skip to content

Commit

Permalink
Merge pull request #14434 from rapidsai/branch-23.10
Browse files Browse the repository at this point in the history
[HOTFIX] v23.10.02
  • Loading branch information
raydouglass authored Nov 16, 2023
2 parents 1a0c076 + 4dc8300 commit 1304f94
Show file tree
Hide file tree
Showing 6 changed files with 75 additions and 21 deletions.
Binary file modified docs/cudf/source/_static/duckdb-benchmark-groupby-join.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
55 changes: 43 additions & 12 deletions docs/cudf/source/cudf_pandas/benchmarks.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@ We reproduced the [Database-like ops benchmark](https://duckdblabs.github.io/db-
including a solution using `cudf.pandas`. Here are the results:

<figure>
<img src="../_static/duckdb-benchmark-groupby-join.png"
class="align-center" width="750"
alt="_static/duckdb-benchmark-groupby-join.png" />

![duckdb-benchmark-groupby-join](../_static/duckdb-benchmark-groupby-join.png)

<figcaption style="text-align: center;">Results of the <a
href="https://duckdblabs.github.io/db-benchmark/">Database-like ops
benchmark</a> including <span
Expand Down Expand Up @@ -46,7 +46,7 @@ source pandas/py-pandas/bin/activate
pip install --extra-index-url=https://pypi.nvidia.com cudf-cu12 # or cudf-cu11
```

5. Modify pandas join/group code to use `cudf.pandas`:
5. Modify pandas join/group code to use `cudf.pandas` and be compatible with pandas 1.5 APIs:

```bash
diff --git a/pandas/groupby-pandas.py b/pandas/groupby-pandas.py
Expand All @@ -59,15 +59,46 @@ index 58eeb26..2ddb209 100755

print("# groupby-pandas.py", flush=True)

diff --git a/pandas/join-pandas.py b/pandas/join-pandas.py
index f39beb0..655dd82 100755
--- a/pandas/join-pandas.py
+++ b/pandas/join-pandas.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python3
+#!/usr/bin/env -S python3 -m cudf.pandas
diff --git a/pandas/join-pandas.py b/pandas/join-pandas.py
index f39beb0..a9ad651 100755
--- a/pandas/join-pandas.py
+++ b/pandas/join-pandas.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python3
+#!/usr/bin/env -S python3 -m cudf.pandas

print("# join-pandas.py", flush=True)
print("# join-pandas.py", flush=True)

@@ -26,7 +26,7 @@ if len(src_jn_y) != 3:

print("loading datasets " + data_name + ", " + y_data_name[0] + ", " + y_data_name[1] + ", " + y_data_name[2], flush=True)

-x = pd.read_csv(src_jn_x, engine='pyarrow', dtype_backend='pyarrow')
+x = pd.read_csv(src_jn_x, engine='pyarrow')

# x['id1'] = x['id1'].astype('Int32')
# x['id2'] = x['id2'].astype('Int32')
@@ -35,17 +35,17 @@ x['id4'] = x['id4'].astype('category') # remove after datatable#1691
x['id5'] = x['id5'].astype('category')
x['id6'] = x['id6'].astype('category')

-small = pd.read_csv(src_jn_y[0], engine='pyarrow', dtype_backend='pyarrow')
+small = pd.read_csv(src_jn_y[0], engine='pyarrow')
# small['id1'] = small['id1'].astype('Int32')
small['id4'] = small['id4'].astype('category')
# small['v2'] = small['v2'].astype('float64')
-medium = pd.read_csv(src_jn_y[1], engine='pyarrow', dtype_backend='pyarrow')
+medium = pd.read_csv(src_jn_y[1], engine='pyarrow')
# medium['id1'] = medium['id1'].astype('Int32')
# medium['id2'] = medium['id2'].astype('Int32')
medium['id4'] = medium['id4'].astype('category')
medium['id5'] = medium['id5'].astype('category')
# medium['v2'] = medium['v2'].astype('float64')
-big = pd.read_csv(src_jn_y[2], engine='pyarrow', dtype_backend='pyarrow')
+big = pd.read_csv(src_jn_y[2], engine='pyarrow')
# big['id1'] = big['id1'].astype('Int32')
# big['id2'] = big['id2'].astype('Int32')
# big['id3'] = big['id3'].astype('Int32')
```
6. Run the modified pandas benchmarks:
Expand Down
4 changes: 4 additions & 0 deletions python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -2606,6 +2606,10 @@ def _reindex(

df = self
if index is not None:
if not df._index.is_unique:
raise ValueError(
"cannot reindex on an axis with duplicate labels"
)
index = cudf.core.index.as_index(
index, name=getattr(index, "name", self._index.name)
)
Expand Down
12 changes: 12 additions & 0 deletions python/cudf/cudf/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -10723,3 +10723,15 @@ def test_dataframe_series_dot():
expected = gser @ [12, 13]

assert_eq(expected, actual)


def test_dataframe_duplicate_index_reindex():
    # Reindexing a frame whose index contains duplicate labels should fail
    # in cudf with the same exception pandas raises.
    gdf = cudf.DataFrame({"a": [0, 1, 2, 3]}, index=[0, 0, 1, 1])
    pdf = gdf.to_pandas()

    reindex_args = ([10, 11, 12, 13], {})
    assert_exceptions_equal(
        gdf.reindex,
        pdf.reindex,
        lfunc_args_and_kwargs=reindex_args,
        rfunc_args_and_kwargs=reindex_args,
    )
12 changes: 12 additions & 0 deletions python/cudf/cudf/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2638,3 +2638,15 @@ def test_series_setitem_mixed_bool_dtype():
s = cudf.Series([True, False, True])
with pytest.raises(TypeError):
s[0] = 10


def test_series_duplicate_index_reindex():
    # Reindexing a series whose index contains duplicate labels should fail
    # in cudf with the same exception pandas raises.
    gs = cudf.Series([0, 1, 2, 3], index=[0, 0, 1, 1])
    ps = gs.to_pandas()

    reindex_args = ([10, 11, 12, 13], {})
    assert_exceptions_equal(
        gs.reindex,
        ps.reindex,
        lfunc_args_and_kwargs=reindex_args,
        rfunc_args_and_kwargs=reindex_args,
    )
13 changes: 4 additions & 9 deletions python/dask_cudf/dask_cudf/backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -437,28 +437,23 @@ def union_categoricals_cudf(
)


@_dask_cudf_nvtx_annotate
def safe_hash(frame):
return cudf.Series(frame.hash_values(), index=frame.index)


@hash_object_dispatch.register((cudf.DataFrame, cudf.Series))
@_dask_cudf_nvtx_annotate
def hash_object_cudf(frame, index=True):
if index:
return safe_hash(frame.reset_index())
return safe_hash(frame)
frame = frame.reset_index()
return frame.hash_values()


@hash_object_dispatch.register(cudf.BaseIndex)
@_dask_cudf_nvtx_annotate
def hash_object_cudf_index(ind, index=None):

if isinstance(ind, cudf.MultiIndex):
return safe_hash(ind.to_frame(index=False))
return ind.to_frame(index=False).hash_values()

col = cudf.core.column.as_column(ind)
return safe_hash(cudf.Series(col))
return cudf.Series(col).hash_values()


@group_split_dispatch.register((cudf.Series, cudf.DataFrame))
Expand Down

0 comments on commit 1304f94

Please sign in to comment.