Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix cudf::partition* APIs that do not return offsets for empty output table #11709

Merged
merged 28 commits into from
Sep 27, 2022
Merged
Show file tree
Hide file tree
Changes from 27 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
d698a40
Fix implementation to return a vector of size 2
ttnghia Sep 14, 2022
97fc343
Fix/add tests
ttnghia Sep 14, 2022
605d52a
Merge branch 'branch-22.10' into fix_partitioning
ttnghia Sep 14, 2022
54e6d6f
Fix output offsets
ttnghia Sep 14, 2022
6190d31
Update copyright year
ttnghia Sep 14, 2022
ec3fb49
Fix tests
ttnghia Sep 14, 2022
c029822
Revere change and add comment
ttnghia Sep 14, 2022
9d022b8
Reverse changes
ttnghia Sep 14, 2022
733a5f7
Handle empty input again
ttnghia Sep 14, 2022
87f98f8
Fix copyright year again
ttnghia Sep 14, 2022
946c947
Merge branch 'branch-22.10' into fix_partitioning
ttnghia Sep 15, 2022
1f0df3e
Merge branch 'branch-22.10' into fix_partitioning
ttnghia Sep 19, 2022
d261307
Reverse all changes
ttnghia Sep 19, 2022
453faa8
Update docs
ttnghia Sep 19, 2022
5337e94
Reverse changes
ttnghia Sep 19, 2022
afba842
Revert "Reverse changes"
ttnghia Sep 23, 2022
0c8308e
Revert "Update docs"
ttnghia Sep 23, 2022
7e59807
Revert "Reverse all changes"
ttnghia Sep 23, 2022
dab6331
Merge branch 'branch-22.10' into fix_partitioning
ttnghia Sep 23, 2022
22345d5
Import `num_rows` for table
ttnghia Sep 24, 2022
c7f93a2
Output empty offsets if the output table is empty
ttnghia Sep 24, 2022
597d933
Simplify return value
ttnghia Sep 24, 2022
42a3d65
Fix copyright year
ttnghia Sep 24, 2022
0390d90
Reverse `partitioning.pyx`
ttnghia Sep 24, 2022
73d2d21
Handle empty frame
ttnghia Sep 24, 2022
1f1fc5a
Merge branch 'branch-22.10' into fix_partitioning
ttnghia Sep 25, 2022
453e69d
Merge branch 'branch-22.10' into fix_partitioning
ttnghia Sep 26, 2022
c9092ca
Update python/cudf/cudf/core/indexed_frame.py
ttnghia Sep 26, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions cpp/src/partitioning/partitioning.cu
Original file line number Diff line number Diff line change
Expand Up @@ -728,7 +728,7 @@ std::pair<std::unique_ptr<table>, std::vector<size_type>> hash_partition(

// Return empty result if there are no partitions or nothing to hash
if (num_partitions <= 0 || input.num_rows() == 0 || table_to_hash.num_columns() == 0) {
return std::pair(empty_like(input), std::vector<size_type>{});
return std::pair(empty_like(input), std::vector<size_type>(num_partitions, 0));
ttnghia marked this conversation as resolved.
Show resolved Hide resolved
}

if (has_nulls(table_to_hash)) {
Expand All @@ -753,7 +753,8 @@ std::pair<std::unique_ptr<table>, std::vector<size_type>> partition(
CUDF_EXPECTS(not partition_map.has_nulls(), "Unexpected null values in partition_map.");

if (num_partitions == 0 or t.num_rows() == 0) {
return std::pair(empty_like(t), std::vector<size_type>{});
// The output offsets vector must have size `num_partitions + 1` as per documentation.
return std::pair(empty_like(t), std::vector<size_type>(num_partitions + 1, 0));
}

return cudf::type_dispatcher(
Expand Down
4 changes: 4 additions & 0 deletions cpp/src/partitioning/round_robin.cu
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,10 @@ std::pair<std::unique_ptr<table>, std::vector<cudf::size_type>> round_robin_part
"Incorrect start_partition index. Must be positive."); // since cudf::size_type is an alias for
// int32_t, it _can_ be negative

if (nrows == 0) {
return std::pair(empty_like(input), std::vector<size_type>(num_partitions, 0));
}

// handle degenerate case:
//
if (num_partitions >= nrows) {
Expand Down
10 changes: 5 additions & 5 deletions cpp/tests/partitioning/hash_partition_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ TEST_F(HashPartition, ZeroPartitions)
// Expect empty table with same number of columns and zero partitions
EXPECT_EQ(input.num_columns(), output->num_columns());
EXPECT_EQ(0, output->num_rows());
EXPECT_EQ(std::size_t{0}, offsets.size());
EXPECT_EQ(std::size_t{num_partitions}, offsets.size());
}

TEST_F(HashPartition, ZeroRows)
Expand All @@ -87,10 +87,10 @@ TEST_F(HashPartition, ZeroRows)
cudf::size_type const num_partitions = 3;
auto [output, offsets] = cudf::hash_partition(input, columns_to_hash, num_partitions);

// Expect empty table with same number of columns and zero partitions
// Expect empty table with same number of columns and same number of partitions
EXPECT_EQ(input.num_columns(), output->num_columns());
EXPECT_EQ(0, output->num_rows());
EXPECT_EQ(std::size_t{0}, offsets.size());
EXPECT_EQ(std::size_t{num_partitions}, offsets.size());
}

TEST_F(HashPartition, ZeroColumns)
Expand All @@ -102,10 +102,10 @@ TEST_F(HashPartition, ZeroColumns)
cudf::size_type const num_partitions = 3;
auto [output, offsets] = cudf::hash_partition(input, columns_to_hash, num_partitions);

// Expect empty table with same number of columns and zero partitions
// Expect empty table with same number of columns and same number of partitions
EXPECT_EQ(input.num_columns(), output->num_columns());
EXPECT_EQ(0, output->num_rows());
EXPECT_EQ(std::size_t{0}, offsets.size());
EXPECT_EQ(std::size_t{num_partitions}, offsets.size());
}

TEST_F(HashPartition, MixedColumnTypes)
Expand Down
2 changes: 1 addition & 1 deletion cpp/tests/partitioning/partition_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ TYPED_TEST(PartitionTest, EmptyInputs)

auto result_offsets = result.second;

EXPECT_TRUE(result_offsets.empty());
EXPECT_EQ(result_offsets.size(), std::size_t{11});

CUDF_TEST_EXPECT_COLUMNS_EQUAL(empty_column, result.first->get_column(0));
}
Expand Down
14 changes: 13 additions & 1 deletion cpp/tests/partitioning/round_robin_test.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2021, NVIDIA CORPORATION.
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -47,6 +47,18 @@ class RoundRobinTest : public cudf::test::BaseFixture {

TYPED_TEST_SUITE(RoundRobinTest, cudf::test::FixedWidthTypes);

TYPED_TEST(RoundRobinTest, EmptyInput)
{
auto const empty_column = fixed_width_column_wrapper<TypeParam>{};
auto const num_partitions = 5;
auto const start_partition = 0;
auto const [out_table, out_offsets] =
cudf::round_robin_partition(cudf::table_view{{empty_column}}, num_partitions, start_partition);

EXPECT_EQ(out_table->num_rows(), 0);
EXPECT_EQ(out_offsets.size(), std::size_t{num_partitions});
}

TYPED_TEST(RoundRobinTest, RoundRobinPartitions13_3)
{
strings_column_wrapper rrColWrap1(
Expand Down
3 changes: 2 additions & 1 deletion python/cudf/cudf/_lib/cpp/table/table.pxd
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020, NVIDIA CORPORATION.
# Copyright (c) 2020-2022, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr
from libcpp.vector cimport vector
Expand All @@ -14,6 +14,7 @@ cdef extern from "cudf/table/table.hpp" namespace "cudf" nogil:
table(vector[unique_ptr[column]]&& columns) except +
table(table_view) except +
size_type num_columns() except +
size_type num_rows() except +
table_view view() except +
mutable_table_view mutable_view() except +
vector[unique_ptr[column]] release() except +
4 changes: 0 additions & 4 deletions python/cudf/cudf/_lib/hash.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -33,13 +33,9 @@ def hash_partition(list source_columns, object columns_to_hash,
)
)

# Note that the offsets (`c_result.second`) may be empty when
# the original table (`source_columns`) is empty. We need to
# return a list of zeros in this case.
return (
columns_from_unique_ptr(move(c_result.first)),
list(c_result.second)
if c_result.second.size() else [0] * num_partitions
)


Expand Down
3 changes: 3 additions & 0 deletions python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1621,6 +1621,9 @@ def _empty_like(self, keep_index=True):
)

def _split(self, splits, keep_index=True):
if len(self._index) == 0:
ttnghia marked this conversation as resolved.
Show resolved Hide resolved
return []

columns_split = libcudf.copying.columns_split(
[
*(self._index._data.columns if keep_index else []),
Expand Down