From 41cf2daaa0af1a531ce96e438c1c7cf713617925 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Tue, 2 May 2023 13:56:26 +0100
Subject: [PATCH 1/5] Document restrictions on DataFrame.scatter_by_map

Additionally, ensure that the partition_map has valid entries,
otherwise one can end up with out of bounds memory reads in libcudf.

Closes the part of #7513 related to invalid inputs.
---
 python/cudf/cudf/_lib/partitioning.pyx | 30 +++++++++++++++++++++++++-
 python/cudf/cudf/core/dataframe.py     |  6 ++++++
 python/cudf/cudf/tests/test_sorting.py |  6 ++++++
 3 files changed, 41 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/_lib/partitioning.pyx b/python/cudf/cudf/_lib/partitioning.pyx
index 083407954b3..627561b38b9 100644
--- a/python/cudf/cudf/_lib/partitioning.pyx
+++ b/python/cudf/cudf/_lib/partitioning.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2023, NVIDIA CORPORATION.
 
 from cudf.core.buffer import acquire_spill_lock
 
@@ -13,6 +13,8 @@ from cudf._lib.cpp.partitioning cimport partition as cpp_partition
 from cudf._lib.cpp.table.table cimport table
 from cudf._lib.cpp.table.table_view cimport table_view
 from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns
+
+from cudf._lib.reduce import minmax
 from cudf._lib.stream_compaction import distinct_count as cpp_distinct_count
 
 cimport cudf._lib.cpp.types as libcudf_types
@@ -21,6 +23,29 @@ cimport cudf._lib.cpp.types as libcudf_types
 @acquire_spill_lock()
 def partition(list source_columns, Column partition_map,
               object num_partitions):
+    """Partition source columns given a partitioning map
+
+    Parameters
+    ----------
+    source_columns: list[Column]
+        Columns to partition
+    partition_map: Column
+        Column of integer values that map each row in the input to a
+        partition
+    num_partitions: Optional[int]
+        Number of output partitions (deduced from unique values in
+        partition_map if None)
+
+    Returns
+    -------
+    Pair of reordered columns and partition offsets
+
+    Raises
+    ------
+    ValueError
+        If the partition map has invalid entries (not all in [0,
+        num_partitions)).
+    """
 
     if num_partitions is None:
         num_partitions = cpp_distinct_count(partition_map, ignore_nulls=True)
@@ -30,6 +55,9 @@ def partition(list source_columns, Column partition_map,
     cdef column_view c_partition_map_view = partition_map.view()
 
     cdef pair[unique_ptr[table], vector[libcudf_types.size_type]] c_result
+    lo, hi = minmax(partition_map)
+    if lo < 0 or hi >= num_partitions:
+        raise ValueError("Partition map has invalid values")
     with nogil:
         c_result = move(
             cpp_partition(
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index c6aba0d360a..d49f0678d3b 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -2254,6 +2254,12 @@ def scatter_by_map(
         Returns
         -------
         A list of cudf.DataFrame objects.
+
+        Raises
+        ------
+        ValueError
+            If the map_index has invalid entries (not all in [0,
+            num_partitions)).
         """
         # map_index might be a column name or array,
         # make it a Column
diff --git a/python/cudf/cudf/tests/test_sorting.py b/python/cudf/cudf/tests/test_sorting.py
index cd0c3bc8806..437cc7081a0 100644
--- a/python/cudf/cudf/tests/test_sorting.py
+++ b/python/cudf/cudf/tests/test_sorting.py
@@ -384,3 +384,9 @@ def test_dataframe_sort_values_kind(nelem, dtype, kind):
     assert_eq(sorted_df.index.values, sorted_index)
     assert_eq(sorted_df["a"].values, aa[sorted_index])
     assert_eq(sorted_df["b"].values, bb[sorted_index])
+
+
+def test_dataframe_scatter_by_map_7513():
+    df = DataFrame({"id": [1, 2, 1, 2], "val": [0, 1, 2, 3]})
+    with pytest.raises(ValueError):
+        df.scatter_by_map(df["id"])

From 15cc99ada76710f14ea37c7bbc71d65a57337038 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Tue, 2 May 2023 15:52:58 +0100
Subject: [PATCH 2/5] Fix integer overflow in partition scatter_map
 construction

Although the input partition map may be any valid integral type, the
intermediate scatter map should have the same type as a valid row
index (and the offsets histogram), namely `size_type`. Previously, the
scatter map was created with the same integer type as the partition
map, which can result in integer overflow, and incorrect results, when
the partition map is a narrow integral type and the input table has
more rows than the width of the type.

Closes #7513.
---
 cpp/src/partitioning/partitioning.cu      |  2 +-
 cpp/tests/partitioning/partition_test.cpp | 17 +++++++++++++++++
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/cpp/src/partitioning/partitioning.cu b/cpp/src/partitioning/partitioning.cu
index 13f46195392..f85e8e59310 100644
--- a/cpp/src/partitioning/partitioning.cu
+++ b/cpp/src/partitioning/partitioning.cu
@@ -693,7 +693,7 @@ struct dispatch_map_type {
 
     // Unfortunately need to materialize the scatter map because
     // `detail::scatter` requires multiple passes through the iterator
-    rmm::device_uvector<MapType> scatter_map(partition_map.size(), stream);
+    rmm::device_uvector<size_type> scatter_map(partition_map.size(), stream);
 
     // For each `partition_map[i]`, atomically increment the corresponding
     // partition offset to determine `i`s location in the output
diff --git a/cpp/tests/partitioning/partition_test.cpp b/cpp/tests/partitioning/partition_test.cpp
index d356a1c405f..8ea224eb9fc 100644
--- a/cpp/tests/partitioning/partition_test.cpp
+++ b/cpp/tests/partitioning/partition_test.cpp
@@ -20,6 +20,7 @@
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/copying.hpp>
+#include <cudf/detail/iterator.cuh>
 #include <cudf/partitioning.hpp>
 #include <cudf/sorting.hpp>
 #include <cudf/table/table.hpp>
@@ -328,3 +329,19 @@ TEST_F(PartitionTestNotTyped, ListOfListOfListOfIntEmpty)
   CUDF_TEST_EXPECT_TABLES_EQUAL(table_to_partition, result.first->view());
   EXPECT_EQ(3, result.second.size());
 }
+
+TEST_F(PartitionTestNotTyped, NoIntegerOverflow)
+{
+  auto elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; });
+  fixed_width_column_wrapper<int8_t> map(elements, elements + 129);
+  auto table_to_partition = cudf::table_view{{map}};
+
+  std::vector<cudf::size_type> expected_offsets{0, 65, 129};
+
+  auto expected_elements =
+    cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i / 65; });
+  fixed_width_column_wrapper<int8_t> expected(expected_elements, expected_elements + 129);
+  auto expected_table = cudf::table_view{{expected}};
+
+  run_partition_test(table_to_partition, map, 2, expected_table, expected_offsets);
+}

From f4d0c103e3c29a4ffaaccb78aef7edfaf1dbfdb6 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Wed, 3 May 2023 09:53:07 +0100
Subject: [PATCH 3/5] Use scatter API rather than internal detail function

---
 cpp/src/partitioning/partitioning.cu | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/cpp/src/partitioning/partitioning.cu b/cpp/src/partitioning/partitioning.cu
index f85e8e59310..b0174d3bd83 100644
--- a/cpp/src/partitioning/partitioning.cu
+++ b/cpp/src/partitioning/partitioning.cu
@@ -16,9 +16,10 @@
 
 #include <cudf/column/column_factories.hpp>
 #include <cudf/copying.hpp>
+#include <cudf/detail/gather.cuh>
 #include <cudf/detail/gather.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
-#include <cudf/detail/scatter.cuh>
+#include <cudf/detail/scatter.hpp>
 #include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/detail/utilities/hash_functions.cuh>
 #include <cudf/detail/utilities/vector_factories.hpp>
@@ -618,8 +619,7 @@ std::pair<std::unique_ptr<table>, std::vector<size_type>> hash_partition_table(
       row_output_locations, num_rows, num_partitions, scanned_block_partition_sizes_ptr);
 
     // Use the resulting scatter map to materialize the output
-    auto output = detail::scatter(
-      input, row_partition_numbers.begin(), row_partition_numbers.end(), input, stream, mr);
+    auto output = detail::scatter(input, row_partition_numbers, input, stream, mr);
 
     stream.synchronize();  // Async D2H copy must finish before returning host vec
     return std::pair(std::move(output), std::move(partition_offsets));
@@ -706,8 +706,7 @@ struct dispatch_map_type {
                       });
 
     // Scatter the rows into their partitions
-    auto scattered =
-      cudf::detail::scatter(t, scatter_map.begin(), scatter_map.end(), t, stream, mr);
+    auto scattered = detail::scatter(t, scatter_map, t, stream, mr);
 
     return std::pair(std::move(scattered), std::move(partition_offsets));
   }

From 74f2d0fd904ec7ace889b280432ad42ab95a6075 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Thu, 4 May 2023 09:56:10 +0100
Subject: [PATCH 4/5] Only validate non-empty partition maps

---
 python/cudf/cudf/_lib/partitioning.pyx | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/python/cudf/cudf/_lib/partitioning.pyx b/python/cudf/cudf/_lib/partitioning.pyx
index 627561b38b9..4bf8b32ea7e 100644
--- a/python/cudf/cudf/_lib/partitioning.pyx
+++ b/python/cudf/cudf/_lib/partitioning.pyx
@@ -55,9 +55,10 @@ def partition(list source_columns, Column partition_map,
     cdef column_view c_partition_map_view = partition_map.view()
 
     cdef pair[unique_ptr[table], vector[libcudf_types.size_type]] c_result
-    lo, hi = minmax(partition_map)
-    if lo < 0 or hi >= num_partitions:
-        raise ValueError("Partition map has invalid values")
+    if partition_map.size > 0:
+        lo, hi = minmax(partition_map)
+        if lo < 0 or hi >= num_partitions:
+            raise ValueError("Partition map has invalid values")
     with nogil:
         c_result = move(
             cpp_partition(

From d7eec9452366f6ad70738a01e7c70bc3faee299a Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Thu, 4 May 2023 09:56:42 +0100
Subject: [PATCH 5/5] Test more exceptional cases in scatter_by_map

---
 python/cudf/cudf/tests/test_sorting.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/python/cudf/cudf/tests/test_sorting.py b/python/cudf/cudf/tests/test_sorting.py
index 437cc7081a0..b3db1310adb 100644
--- a/python/cudf/cudf/tests/test_sorting.py
+++ b/python/cudf/cudf/tests/test_sorting.py
@@ -386,7 +386,14 @@ def test_dataframe_sort_values_kind(nelem, dtype, kind):
     assert_eq(sorted_df["b"].values, bb[sorted_index])
 
 
-def test_dataframe_scatter_by_map_7513():
-    df = DataFrame({"id": [1, 2, 1, 2], "val": [0, 1, 2, 3]})
+@pytest.mark.parametrize("ids", [[-1, 0, 1, 0], [0, 2, 3, 0]])
+def test_dataframe_scatter_by_map_7513(ids):
+    df = DataFrame({"id": ids, "val": [0, 1, 2, 3]})
     with pytest.raises(ValueError):
         df.scatter_by_map(df["id"])
+
+
+def test_dataframe_scatter_by_map_empty():
+    df = DataFrame({"a": [], "b": []})
+    scattered = df.scatter_by_map(df["a"])
+    assert len(scattered) == 0