Skip to content

Commit

Permalink
Optimize set-like operations (#12769)
Browse files Browse the repository at this point in the history
Set-like operations such as `intersect_distinct` and `difference_distinct` call `purge_nonempty_nulls` when the input is nullable. This PR optimizes these set APIs by checking for the existence of non-empty nulls (using `has_nonempty_nulls`) before calling `purge_nonempty_nulls`.

Authors:
  - Nghia Truong (https://github.com/ttnghia)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Yunsong Wang (https://github.com/PointKernel)

URL: #12769
  • Loading branch information
ttnghia authored Apr 10, 2023
1 parent f357892 commit 30411b5
Show file tree
Hide file tree
Showing 5 changed files with 122 additions and 15 deletions.
4 changes: 4 additions & 0 deletions cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,10 @@ ConfigureBench(SCATTER_BENCH copying/scatter.cu)
# * lists scatter benchmark -----------------------------------------------------------------------
ConfigureBench(SCATTER_LISTS_BENCH lists/copying/scatter_lists.cu)

# ##################################################################################################
# * Other list-related operations benchmark -------------------------------------------------------
ConfigureNVBench(SET_OPS_NVBENCH lists/set_operations.cpp)

# ##################################################################################################
# * contiguous_split benchmark -------------------------------------------------------------------
ConfigureBench(CONTIGUOUS_SPLIT_BENCH copying/contiguous_split.cu)
Expand Down
84 changes: 84 additions & 0 deletions cpp/benchmarks/lists/set_operations.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/fixture/rmm_pool_raii.hpp>

#include <cudf/lists/set_operations.hpp>

#include <nvbench/nvbench.cuh>

namespace {

// Upper bound passed to the LIST distribution of the random data profile.
constexpr auto max_list_size = 20;

/**
 * @brief Create one random lists column for use as benchmark input.
 *
 * @param num_rows Number of rows requested from the random table generator
 * @param depth Nesting depth passed to the profile's `list_depth`
 * @param null_freq Null probability for the profile; a value that is not greater than zero
 *        is passed on as `std::nullopt`
 * @return The first (and only requested) column of the generated table
 */
auto generate_random_lists(cudf::size_type num_rows, cudf::size_type depth, double null_freq)
{
  // Only a strictly positive frequency is handed to the profile as a probability.
  std::optional<double> null_prob{std::nullopt};
  if (null_freq > 0) { null_prob = null_freq; }

  auto builder = data_profile_builder()
                   .cardinality(0)
                   .distribution(cudf::type_id::LIST, distribution_id::UNIFORM, 0, max_list_size)
                   .list_depth(depth)
                   .null_probability(null_prob);

  auto data_table =
    create_random_table({cudf::type_id::LIST}, row_count{num_rows}, data_profile{builder});

  // Take ownership of the table's columns, then hand out the single lists column.
  auto columns = data_table->release();
  return std::move(columns.front());
}

/**
 * @brief Shared benchmark driver for a binary list set operation.
 *
 * Reads the `num_rows`, `depth` and `null_frequency` axes from the benchmark state, generates
 * two random lists columns from them, and times `bfunc` on the default cudf stream.
 *
 * @tparam BenchFuncPtr Callable type of the operation under test
 * @param state NVBench state supplying the axis values and running the measurement
 * @param bfunc Operation invoked as
 *        `bfunc(lhs_view, rhs_view, null_equality, nan_equality, memory_resource)`
 */
template <typename BenchFuncPtr>
void nvbench_set_op(nvbench::state& state, BenchFuncPtr bfunc)
{
  auto const null_prob = state.get_float64("null_frequency");
  auto const nesting   = static_cast<cudf::size_type>(state.get_int64("depth"));
  auto const n_rows    = static_cast<cudf::size_type>(state.get_int64("num_rows"));

  // Inputs are generated once, outside the timed region; left operand first, then right.
  auto const left  = generate_random_lists(n_rows, nesting, null_prob);
  auto const right = generate_random_lists(n_rows, nesting, null_prob);

  auto const stream = cudf::get_default_stream();
  state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));

  // The launch object is not needed by the operation, so the parameter is left unnamed.
  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch&) {
    auto const left_view  = cudf::lists_column_view{*left};
    auto const right_view = cudf::lists_column_view{*right};
    bfunc(left_view,
          right_view,
          cudf::null_equality::EQUAL,
          cudf::nan_equality::ALL_EQUAL,
          rmm::mr::get_current_device_resource());
  });
}

} // namespace

/**
 * @brief Benchmark entry point that runs `cudf::lists::have_overlap` through the shared driver.
 *
 * @param state NVBench state forwarded to `nvbench_set_op`
 */
void nvbench_have_overlap(nvbench::state& state)
{
  auto const set_op = &cudf::lists::have_overlap;
  nvbench_set_op(state, set_op);
}

/**
 * @brief Benchmark entry point that runs `cudf::lists::intersect_distinct` through the shared
 * driver.
 *
 * @param state NVBench state forwarded to `nvbench_set_op`
 */
void nvbench_intersect_distinct(nvbench::state& state)
{
  auto const set_op = &cudf::lists::intersect_distinct;
  nvbench_set_op(state, set_op);
}

// Register the `have_overlap` benchmark and declare the axes read by `nvbench_set_op`:
//  - num_rows: power-of-two axis; the listed values 10, 13, 16 are exponents (2^10, 2^13, 2^16)
//  - depth: list nesting depth, 1 or 4
//  - null_frequency: 0, 0.2 or 0.8
NVBENCH_BENCH(nvbench_have_overlap)
.set_name("have_overlap")
.add_int64_power_of_two_axis("num_rows", {10, 13, 16})
.add_int64_axis("depth", {1, 4})
.add_float64_axis("null_frequency", {0, 0.2, 0.8});

// Register the `intersect_distinct` benchmark and declare the axes read by `nvbench_set_op`:
//  - num_rows: power-of-two axis; the listed values 10, 13, 16 are exponents (2^10, 2^13, 2^16)
//  - depth: list nesting depth, 1 or 4
//  - null_frequency: 0, 0.2 or 0.8
NVBENCH_BENCH(nvbench_intersect_distinct)
.set_name("intersect_distinct")
.add_int64_power_of_two_axis("num_rows", {10, 13, 16})
.add_int64_axis("depth", {1, 4})
.add_float64_axis("null_frequency", {0, 0.2, 0.8});
13 changes: 9 additions & 4 deletions cpp/src/lists/set_operations.cu
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ std::unique_ptr<column> have_overlap(lists_column_view const& lhs,
list_indices.begin(),
result_begin);

// Reset the null count, which was invalidated by the call to `mutable_view()`.
result->set_null_count(null_count);

return result;
Expand Down Expand Up @@ -181,8 +182,10 @@ std::unique_ptr<column> intersect_distinct(lists_column_view const& lhs,
stream,
mr);

return null_count == 0 ? std::move(output)
: cudf::detail::purge_nonempty_nulls(output->view(), stream, mr);
if (auto const output_cv = output->view(); cudf::detail::has_nonempty_nulls(output_cv, stream)) {
return cudf::detail::purge_nonempty_nulls(output_cv, stream, mr);
}
return output;
}

std::unique_ptr<column> union_distinct(lists_column_view const& lhs,
Expand Down Expand Up @@ -263,8 +266,10 @@ std::unique_ptr<column> difference_distinct(lists_column_view const& lhs,
stream,
mr);

return null_count == 0 ? std::move(output)
: cudf::detail::purge_nonempty_nulls(output->view(), stream, mr);
if (auto const output_cv = output->view(); cudf::detail::has_nonempty_nulls(output_cv, stream)) {
return cudf::detail::purge_nonempty_nulls(output_cv, stream, mr);
}
return output;
}

} // namespace detail
Expand Down
20 changes: 13 additions & 7 deletions cpp/tests/lists/set_operations/difference_distinct_tests.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
* Copyright (c) 2022-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -23,7 +23,7 @@
#include <cudf/column/column_factories.hpp>
#include <cudf/lists/set_operations.hpp>
#include <cudf/lists/sorting.hpp>
#include <cudf/lists/stream_compaction.hpp>
#include <cudf/null_mask.hpp>

#include <limits>
#include <string>
Expand Down Expand Up @@ -241,13 +241,19 @@ TEST_F(SetDifferenceTest, StringTestsWithNullsEqual)
strings_lists{}, /* NULL */
strings_lists{"aha", "this", "is another", "string???"}},
null_at(1)};
auto const expected = strings_lists{{strings_lists{"a", "is", "string", "this"},
strings_lists{} /*NULL*/,
strings_lists{"a", "is", "string"}},
null_at(1)};
auto const expected = [] {
auto str_lists = strings_lists{{strings_lists{"a", "is", "string", "this"},
strings_lists{} /*NULL*/,
strings_lists{"a", "is", "string"}},
null_at(1)}
.release();
auto& child = str_lists->child(cudf::lists_column_view::child_column_index);
child.set_null_mask(cudf::create_null_mask(child.size(), cudf::mask_state::ALL_VALID), 0);
return str_lists;
}();

auto const results_sorted = set_difference_sorted(lhs, rhs, NULL_EQUAL);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *results_sorted);
}
}

Expand Down
16 changes: 12 additions & 4 deletions cpp/tests/lists/set_operations/intersect_distinct_tests.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
* Copyright (c) 2022-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -24,6 +24,7 @@
#include <cudf/lists/set_operations.hpp>
#include <cudf/lists/sorting.hpp>
#include <cudf/lists/stream_compaction.hpp>
#include <cudf/null_mask.hpp>

#include <limits>
#include <string>
Expand Down Expand Up @@ -280,11 +281,18 @@ TEST_F(SetIntersectTest, StringTestsWithNullsUnequal)
strings_lists{}, /* NULL */
strings_lists{"aha", "this", "is another", "string???"}},
null_at(1)};
auto const expected =
strings_lists{{strings_lists{}, strings_lists{} /*NULL*/, strings_lists{"this"}}, null_at(1)};
auto const expected = [] {
auto str_lists =
strings_lists{{strings_lists{}, strings_lists{} /*NULL*/, strings_lists{"this"}},
null_at(1)}
.release();
auto& child = str_lists->child(cudf::lists_column_view::child_column_index);
child.set_null_mask(cudf::create_null_mask(child.size(), cudf::mask_state::ALL_VALID), 0);
return str_lists;
}();

auto const results_sorted = set_intersect_sorted(lhs, rhs, NULL_UNEQUAL);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *results_sorted);
}
}

Expand Down

0 comments on commit 30411b5

Please sign in to comment.