Merge pull request #4000 from rjzamora/single-partition-sort

Minor dask_cudf sort_values optimization for single partitions
rapidsai · Jan 30, 2020 · 61c310b · 61c310b
2 parents 0b2fcb7 + cf9decc
commit 61c310b
Show file tree

Hide file tree

Showing 3 changed files with 28 additions and 7 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -23,6 +23,7 @@
 - PR #3970 Fix for Series Pickle
 - PR #3964 Restore legacy NVStrings and NVCategory dependencies in Java jar
 - PR #3982 Fix java unary op enum and add missing ops
+- PR #4000 Fix dask_cudf sort_values performance for single partitions
 
 
 # cuDF 0.12.0 (Date TBD)

diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py
@@ -209,11 +209,15 @@ def sort_values(self, by, ignore_index=False):
         ---------
         by : str
         """
-        parts = self.to_delayed()
-        sorted_parts = batcher_sortnet.sort_delayed_frame(parts, by)
-        return from_delayed(sorted_parts, meta=self._meta).reset_index(
-            force=not ignore_index
-        )
+        if self.npartitions == 1:
+            df = self.map_partitions(M.sort_values, by)
+        else:
+            parts = self.to_delayed()
+            sorted_parts = batcher_sortnet.sort_delayed_frame(parts, by)
+            df = from_delayed(sorted_parts, meta=self._meta)
+        if ignore_index:
+            return df.reset_index(drop=True)
+        return df
 
     def sort_values_binned(self, by):
         """Sorty by the given column and ensure that the same key

diff --git a/python/dask_cudf/dask_cudf/tests/test_sort.py b/python/dask_cudf/dask_cudf/tests/test_sort.py
@@ -8,7 +8,7 @@
 import cudf
 
 
-@pytest.mark.parametrize("by", ["a", "b"])
+@pytest.mark.parametrize("by", ["a", "b", ["a", "b"]])
 @pytest.mark.parametrize("nelem", [10, 100, 1000])
 @pytest.mark.parametrize("nparts", [1, 2, 5, 10])
 def test_sort_values(nelem, nparts, by):
@@ -18,7 +18,9 @@ def test_sort_values(nelem, nparts, by):
     ddf = dd.from_pandas(df, npartitions=nparts)
 
     with dask.config.set(scheduler="single-threaded"):
-        got = ddf.sort_values(by=by).compute().to_pandas()
+        got = (
+            ddf.sort_values(by=by).compute().to_pandas().reset_index(drop=True)
+        )
     expect = df.sort_values(by=by).to_pandas().reset_index(drop=True)
     pd.util.testing.assert_frame_equal(got, expect)
 
@@ -51,3 +53,17 @@ def test_sort_binned_meta():
     ddf = dd.from_pandas(df, npartitions=2).persist()
 
     ddf.sort_values_binned(by="b")
+
+
+@pytest.mark.parametrize("by", ["a", "b", ["a", "b"]])
+def test_sort_values_single_partition(by):
+    df = cudf.DataFrame()
+    nelem = 1000
+    df["a"] = np.ascontiguousarray(np.arange(nelem)[::-1])
+    df["b"] = np.arange(100, nelem + 100)
+    ddf = dd.from_pandas(df, npartitions=1)
+
+    with dask.config.set(scheduler="single-threaded"):
+        got = ddf.sort_values(by=by)
+    expect = df.sort_values(by=by)
+    dd.assert_eq(got, expect)