From 41cf2daaa0af1a531ce96e438c1c7cf713617925 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 2 May 2023 13:56:26 +0100 Subject: [PATCH 1/5] Document restrictions on DataFrame.scatter_by_map Additionally, ensure that the partition_map has valid entries, otherwise one can end up with out of bounds memory reads in libcudf. Closes the part of #7513 related to invalid inputs. --- python/cudf/cudf/_lib/partitioning.pyx | 30 +++++++++++++++++++++++++- python/cudf/cudf/core/dataframe.py | 6 ++++++ python/cudf/cudf/tests/test_sorting.py | 6 ++++++ 3 files changed, 41 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/partitioning.pyx b/python/cudf/cudf/_lib/partitioning.pyx index 083407954b3..627561b38b9 100644 --- a/python/cudf/cudf/_lib/partitioning.pyx +++ b/python/cudf/cudf/_lib/partitioning.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. from cudf.core.buffer import acquire_spill_lock @@ -13,6 +13,8 @@ from cudf._lib.cpp.partitioning cimport partition as cpp_partition from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns + +from cudf._lib.reduce import minmax from cudf._lib.stream_compaction import distinct_count as cpp_distinct_count cimport cudf._lib.cpp.types as libcudf_types @@ -21,6 +23,29 @@ cimport cudf._lib.cpp.types as libcudf_types @acquire_spill_lock() def partition(list source_columns, Column partition_map, object num_partitions): + """Partition source columns given a partitioning map + + Parameters + ---------- + source_columns: list[Column] + Columns to partition + partition_map: Column + Column of integer values that map each row in the input to a + partition + num_partitions: Optional[int] + Number of output partitions (deduced from unique values in + partition_map if None) + + Returns + ------- + Pair of reordered columns and partition offsets + + Raises + ------ + ValueError + If the partition map has invalid entries (not all in [0, + num_partitions)). + """ if num_partitions is None: num_partitions = cpp_distinct_count(partition_map, ignore_nulls=True) @@ -30,6 +55,9 @@ def partition(list source_columns, Column partition_map, cdef column_view c_partition_map_view = partition_map.view() cdef pair[unique_ptr[table], vector[libcudf_types.size_type]] c_result + lo, hi = minmax(partition_map) + if lo < 0 or hi >= num_partitions: + raise ValueError("Partition map has invalid values") with nogil: c_result = move( cpp_partition( diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index c6aba0d360a..d49f0678d3b 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -2254,6 +2254,12 @@ def scatter_by_map( Returns ------- A list of cudf.DataFrame objects. + + Raises + ------ + ValueError + If the map_index has invalid entries (not all in [0, + num_partitions)). """ # map_index might be a column name or array, # make it a Column diff --git a/python/cudf/cudf/tests/test_sorting.py b/python/cudf/cudf/tests/test_sorting.py index cd0c3bc8806..437cc7081a0 100644 --- a/python/cudf/cudf/tests/test_sorting.py +++ b/python/cudf/cudf/tests/test_sorting.py @@ -384,3 +384,9 @@ def test_dataframe_sort_values_kind(nelem, dtype, kind): assert_eq(sorted_df.index.values, sorted_index) assert_eq(sorted_df["a"].values, aa[sorted_index]) assert_eq(sorted_df["b"].values, bb[sorted_index]) + + +def test_dataframe_scatter_by_map_7513(): + df = DataFrame({"id": [1, 2, 1, 2], "val": [0, 1, 2, 3]}) + with pytest.raises(ValueError): + df.scatter_by_map(df["id"]) From 15cc99ada76710f14ea37c7bbc71d65a57337038 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 2 May 2023 15:52:58 +0100 Subject: [PATCH 2/5] Fix integer overflow in partition scatter_map construction Although the input partition map may be any valid integral type, the intermediate scatter map should have the same type as a valid row index (and the offsets histogram), namely `size_type`. Previously, the scatter map was created with the same integer type as the partition map, which can result in integer overflow, and incorrect results, when the partition map is a narrow integral type and the input table has more rows than the width of the type. Closes #7513. --- cpp/src/partitioning/partitioning.cu | 2 +- cpp/tests/partitioning/partition_test.cpp | 17 +++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/cpp/src/partitioning/partitioning.cu b/cpp/src/partitioning/partitioning.cu index 13f46195392..f85e8e59310 100644 --- a/cpp/src/partitioning/partitioning.cu +++ b/cpp/src/partitioning/partitioning.cu @@ -693,7 +693,7 @@ struct dispatch_map_type { // Unfortunately need to materialize the scatter map because // `detail::scatter` requires multiple passes through the iterator - rmm::device_uvector scatter_map(partition_map.size(), stream); + rmm::device_uvector scatter_map(partition_map.size(), stream); // For each `partition_map[i]`, atomically increment the corresponding // partition offset to determine `i`s location in the output diff --git a/cpp/tests/partitioning/partition_test.cpp b/cpp/tests/partitioning/partition_test.cpp index d356a1c405f..8ea224eb9fc 100644 --- a/cpp/tests/partitioning/partition_test.cpp +++ b/cpp/tests/partitioning/partition_test.cpp @@ -20,6 +20,7 @@ #include #include +#include #include #include #include @@ -328,3 +329,19 @@ TEST_F(PartitionTestNotTyped, ListOfListOfListOfIntEmpty) CUDF_TEST_EXPECT_TABLES_EQUAL(table_to_partition, result.first->view()); EXPECT_EQ(3, result.second.size()); } + +TEST_F(PartitionTestNotTyped, NoIntegerOverflow) +{ + auto elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; }); + fixed_width_column_wrapper map(elements, elements + 129); + auto table_to_partition = cudf::table_view{{map}}; + + std::vector expected_offsets{0, 65, 129}; + + auto expected_elements = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i / 65; }); + fixed_width_column_wrapper expected(expected_elements, expected_elements + 129); + auto expected_table = cudf::table_view{{expected}}; + + run_partition_test(table_to_partition, map, 2, expected_table, expected_offsets); +} From f4d0c103e3c29a4ffaaccb78aef7edfaf1dbfdb6 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 3 May 2023 09:53:07 +0100 Subject: [PATCH 3/5] Use scatter API rather than internal detail function --- cpp/src/partitioning/partitioning.cu | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/cpp/src/partitioning/partitioning.cu b/cpp/src/partitioning/partitioning.cu index f85e8e59310..b0174d3bd83 100644 --- a/cpp/src/partitioning/partitioning.cu +++ b/cpp/src/partitioning/partitioning.cu @@ -16,9 +16,10 @@ #include #include +#include #include #include -#include +#include #include #include #include @@ -618,8 +619,7 @@ std::pair, std::vector> hash_partition_table( row_output_locations, num_rows, num_partitions, scanned_block_partition_sizes_ptr); // Use the resulting scatter map to materialize the output - auto output = detail::scatter( - input, row_partition_numbers.begin(), row_partition_numbers.end(), input, stream, mr); + auto output = detail::scatter(input, row_partition_numbers, input, stream, mr); stream.synchronize(); // Async D2H copy must finish before returning host vec return std::pair(std::move(output), std::move(partition_offsets)); @@ -706,8 +706,7 @@ struct dispatch_map_type { }); // Scatter the rows into their partitions - auto scattered = - cudf::detail::scatter(t, scatter_map.begin(), scatter_map.end(), t, stream, mr); + auto scattered = detail::scatter(t, scatter_map, t, stream, mr); return std::pair(std::move(scattered), std::move(partition_offsets)); } From 74f2d0fd904ec7ace889b280432ad42ab95a6075 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 4 May 2023 09:56:10 +0100 Subject: [PATCH 4/5] Only validate non-empty partition maps --- python/cudf/cudf/_lib/partitioning.pyx | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/_lib/partitioning.pyx b/python/cudf/cudf/_lib/partitioning.pyx index 627561b38b9..4bf8b32ea7e 100644 --- a/python/cudf/cudf/_lib/partitioning.pyx +++ b/python/cudf/cudf/_lib/partitioning.pyx @@ -55,9 +55,10 @@ def partition(list source_columns, Column partition_map, cdef column_view c_partition_map_view = partition_map.view() cdef pair[unique_ptr[table], vector[libcudf_types.size_type]] c_result - lo, hi = minmax(partition_map) - if lo < 0 or hi >= num_partitions: - raise ValueError("Partition map has invalid values") + if partition_map.size > 0: + lo, hi = minmax(partition_map) + if lo < 0 or hi >= num_partitions: + raise ValueError("Partition map has invalid values") with nogil: c_result = move( cpp_partition( From d7eec9452366f6ad70738a01e7c70bc3faee299a Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 4 May 2023 09:56:42 +0100 Subject: [PATCH 5/5] Test more exceptional cases in scatter_by_map --- python/cudf/cudf/tests/test_sorting.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/tests/test_sorting.py b/python/cudf/cudf/tests/test_sorting.py index 437cc7081a0..b3db1310adb 100644 --- a/python/cudf/cudf/tests/test_sorting.py +++ b/python/cudf/cudf/tests/test_sorting.py @@ -386,7 +386,14 @@ def test_dataframe_sort_values_kind(nelem, dtype, kind): assert_eq(sorted_df["b"].values, bb[sorted_index]) -def test_dataframe_scatter_by_map_7513(): - df = DataFrame({"id": [1, 2, 1, 2], "val": [0, 1, 2, 3]}) +@pytest.mark.parametrize("ids", [[-1, 0, 1, 0], [0, 2, 3, 0]]) +def test_dataframe_scatter_by_map_7513(ids): + df = DataFrame({"id": ids, "val": [0, 1, 2, 3]}) with pytest.raises(ValueError): df.scatter_by_map(df["id"]) + + +def test_dataframe_scatter_by_map_empty(): + df = DataFrame({"a": [], "b": []}) + scattered = df.scatter_by_map(df["a"]) + assert len(scattered) == 0