Skip to content

Commit

Permalink
Merge pull request #4000 from rjzamora/single-partition-sort
Browse files Browse the repository at this point in the history
Minor dask_cudf sort_values optimization for single partitions
  • Loading branch information
kkraus14 authored Jan 30, 2020
2 parents 0b2fcb7 + cf9decc commit 61c310b
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 7 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
- PR #3970 Fix for Series Pickle
- PR #3964 Restore legacy NVStrings and NVCategory dependencies in Java jar
- PR #3982 Fix java unary op enum and add missing ops
- PR #4000 Fix dask_cudf sort_values performance for single partitions


# cuDF 0.12.0 (Date TBD)
Expand Down
14 changes: 9 additions & 5 deletions python/dask_cudf/dask_cudf/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,11 +209,15 @@ def sort_values(self, by, ignore_index=False):
---------
by : str
"""
parts = self.to_delayed()
sorted_parts = batcher_sortnet.sort_delayed_frame(parts, by)
return from_delayed(sorted_parts, meta=self._meta).reset_index(
force=not ignore_index
)
if self.npartitions == 1:
df = self.map_partitions(M.sort_values, by)
else:
parts = self.to_delayed()
sorted_parts = batcher_sortnet.sort_delayed_frame(parts, by)
df = from_delayed(sorted_parts, meta=self._meta)
if ignore_index:
return df.reset_index(drop=True)
return df

def sort_values_binned(self, by):
"""Sorty by the given column and ensure that the same key
Expand Down
20 changes: 18 additions & 2 deletions python/dask_cudf/dask_cudf/tests/test_sort.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import cudf


@pytest.mark.parametrize("by", ["a", "b"])
@pytest.mark.parametrize("by", ["a", "b", ["a", "b"]])
@pytest.mark.parametrize("nelem", [10, 100, 1000])
@pytest.mark.parametrize("nparts", [1, 2, 5, 10])
def test_sort_values(nelem, nparts, by):
Expand All @@ -18,7 +18,9 @@ def test_sort_values(nelem, nparts, by):
ddf = dd.from_pandas(df, npartitions=nparts)

with dask.config.set(scheduler="single-threaded"):
got = ddf.sort_values(by=by).compute().to_pandas()
got = (
ddf.sort_values(by=by).compute().to_pandas().reset_index(drop=True)
)
expect = df.sort_values(by=by).to_pandas().reset_index(drop=True)
pd.util.testing.assert_frame_equal(got, expect)

Expand Down Expand Up @@ -51,3 +53,17 @@ def test_sort_binned_meta():
ddf = dd.from_pandas(df, npartitions=2).persist()

ddf.sort_values_binned(by="b")


@pytest.mark.parametrize("by", ["a", "b", ["a", "b"]])
def test_sort_values_single_partition(by):
df = cudf.DataFrame()
nelem = 1000
df["a"] = np.ascontiguousarray(np.arange(nelem)[::-1])
df["b"] = np.arange(100, nelem + 100)
ddf = dd.from_pandas(df, npartitions=1)

with dask.config.set(scheduler="single-threaded"):
got = ddf.sort_values(by=by)
expect = df.sort_values(by=by)
dd.assert_eq(got, expect)

0 comments on commit 61c310b

Please sign in to comment.