Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use optional-iterator for copy-if-else kernel #9324

Merged
merged 21 commits into from
Oct 13, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
e0a24ef
add benchmarks for copy-if-else and replace-nans
davidwendt Sep 28, 2021
8f5d225
split up copy_tests into cu and cpp
davidwendt Sep 28, 2021
45eff13
Use optional-iterator for copy-if-else kernel
davidwendt Sep 28, 2021
8244fe6
Merge branch 'branch-21.12' into copy-cu-compile-time
davidwendt Sep 28, 2021
9d56d0b
add missing include
davidwendt Sep 28, 2021
86e3a16
Merge branch 'branch-21.12' into copy-cu-compile-time
davidwendt Sep 29, 2021
efac52d
create multi-block test and remove detail_copy_tests.cu
davidwendt Sep 29, 2021
9ed9c13
Merge branch 'branch-21.12' into copy-cu-compile-time
davidwendt Sep 29, 2021
3b9f51f
remove valid var
davidwendt Sep 30, 2021
f0fed62
use operator* instead of value()
davidwendt Sep 30, 2021
7e01ae9
fix merge conflict
davidwendt Oct 1, 2021
a5f1704
undo indexalator commit
davidwendt Oct 1, 2021
7b41ad4
Merge branch 'branch-21.12' into copy-cu-compile-time
davidwendt Oct 4, 2021
9e25f50
fix merge conflict
davidwendt Oct 4, 2021
2fcc93c
Merge branch 'branch-21.12' into copy-cu-compile-time
davidwendt Oct 5, 2021
c028d93
Merge branch 'branch-21.12' into copy-cu-compile-time
davidwendt Oct 5, 2021
1716b70
Merge branch 'branch-21.12' into copy-cu-compile-time
davidwendt Oct 5, 2021
55963f7
merge strings copy_if_else changes
davidwendt Oct 6, 2021
37c5136
remove unneeded include
davidwendt Oct 11, 2021
881f9a2
Merge branch 'branch-21.12' into copy-cu-compile-time
davidwendt Oct 11, 2021
5fd512d
Merge branch 'branch-21.12' into copy-cu-compile-time
davidwendt Oct 12, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,10 @@ ConfigureBench(CONTIGUOUS_SPLIT_BENCH copying/contiguous_split_benchmark.cu)
# - shift benchmark -------------------------------------------------------------------------------
ConfigureBench(SHIFT_BENCH copying/shift_benchmark.cu)

###################################################################################################
# - copy-if-else benchmark -----------------------------------------------------------------------------
ConfigureBench(COPY_IF_ELSE_BENCH copying/copy_if_else_benchmark.cpp)

###################################################################################################
# - transpose benchmark ---------------------------------------------------------------------------
ConfigureBench(TRANSPOSE_BENCH transpose/transpose_benchmark.cu)
Expand Down Expand Up @@ -141,7 +145,8 @@ ConfigureBench(REDUCTION_BENCH
###################################################################################################
# - reduction benchmark ---------------------------------------------------------------------------
ConfigureBench(REPLACE_BENCH
replace/clamp_benchmark.cpp)
replace/clamp_benchmark.cpp
replace/nans_benchmark.cpp)

###################################################################################################
# - filling benchmark -----------------------------------------------------------------------------
Expand Down
67 changes: 67 additions & 0 deletions cpp/benchmarks/copying/copy_if_else_benchmark.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmark/benchmark.h>
#include <benchmarks/common/generate_benchmark_input.hpp>
#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

#include <cudf/copying.hpp>

#include <rmm/device_buffer.hpp>

class CopyIfElse : public cudf::benchmark {
};

template <class TypeParam>
static void BM_copy_if_else(benchmark::State& state, bool nulls)
{
cudf::size_type const n_rows{(cudf::size_type)state.range(0)};
auto input_type = cudf::type_to_id<TypeParam>();
auto bool_type = cudf::type_id::BOOL8;
auto const input = create_random_table({input_type, input_type, bool_type}, 3, row_count{n_rows});

if (!nulls) {
input->get_column(2).set_null_mask(rmm::device_buffer{}, 0);
input->get_column(1).set_null_mask(rmm::device_buffer{}, 0);
input->get_column(0).set_null_mask(rmm::device_buffer{}, 0);
}

cudf::column_view decision(input->view().column(2));
cudf::column_view rhs(input->view().column(1));
cudf::column_view lhs(input->view().column(0));

for (auto _ : state) {
cuda_event_timer raii(state, true, rmm::cuda_stream_default);
cudf::copy_if_else(lhs, rhs, decision);
}
}

#define COPY_BENCHMARK_DEFINE(name, type, b) \
BENCHMARK_DEFINE_F(CopyIfElse, name) \
(::benchmark::State & st) { BM_copy_if_else<type>(st, b); } \
BENCHMARK_REGISTER_F(CopyIfElse, name) \
->RangeMultiplier(8) \
->Ranges({{1 << 12, 1 << 27}}) \
->UseManualTime() \
->Unit(benchmark::kMillisecond);

COPY_BENCHMARK_DEFINE(int16, int16_t, true)
COPY_BENCHMARK_DEFINE(uint32, uint32_t, true)
COPY_BENCHMARK_DEFINE(float64, double, true)
COPY_BENCHMARK_DEFINE(int16_no_nulls, int16_t, false)
COPY_BENCHMARK_DEFINE(uint32_no_nulls, uint32_t, false)
COPY_BENCHMARK_DEFINE(float64_no_nulls, double, false)
20 changes: 10 additions & 10 deletions cpp/benchmarks/replace/clamp_benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ class ReplaceClamp : public cudf::benchmark {
};

template <typename type>
static void BM_reduction_scan(benchmark::State& state, bool include_nulls)
static void BM_clamp(benchmark::State& state, bool include_nulls)
{
cudf::size_type const n_rows{(cudf::size_type)state.range(0)};
auto const dtype = cudf::type_to_id<type>();
Expand Down Expand Up @@ -58,15 +58,15 @@ static void BM_reduction_scan(benchmark::State& state, bool include_nulls)
}
}

#define CLAMP_BENCHMARK_DEFINE(name, type, nulls) \
BENCHMARK_DEFINE_F(ReplaceClamp, name) \
(::benchmark::State & state) { BM_reduction_scan<type>(state, nulls); } \
BENCHMARK_REGISTER_F(ReplaceClamp, name) \
->UseManualTime() \
->Arg(10000) /* 10k */ \
->Arg(100000) /* 100k */ \
->Arg(1000000) /* 1M */ \
->Arg(10000000) /* 10M */ \
#define CLAMP_BENCHMARK_DEFINE(name, type, nulls) \
BENCHMARK_DEFINE_F(ReplaceClamp, name) \
(::benchmark::State & state) { BM_clamp<type>(state, nulls); } \
BENCHMARK_REGISTER_F(ReplaceClamp, name) \
->UseManualTime() \
->Arg(10000) /* 10k */ \
->Arg(100000) /* 100k */ \
->Arg(1000000) /* 1M */ \
->Arg(10000000) /* 10M */ \
->Arg(100000000); /* 100M */

CLAMP_BENCHMARK_DEFINE(int8_no_nulls, int8_t, false);
Expand Down
63 changes: 63 additions & 0 deletions cpp/benchmarks/replace/nans_benchmark.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmark/benchmark.h>
#include <benchmarks/common/generate_benchmark_input.hpp>
#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/replace.hpp>
#include <cudf/scalar/scalar_factories.hpp>
#include <cudf/table/table.hpp>
#include <cudf/types.hpp>

class ReplaceNans : public cudf::benchmark {
};

template <typename type>
static void BM_replace_nans(benchmark::State& state, bool include_nulls)
{
cudf::size_type const n_rows{(cudf::size_type)state.range(0)};
auto const dtype = cudf::type_to_id<type>();
auto const table = create_random_table({dtype}, 1, row_count{n_rows});
if (!include_nulls) { table->get_column(0).set_null_mask(rmm::device_buffer{}, 0); }
cudf::column_view input(table->view().column(0));

auto zero = cudf::make_fixed_width_scalar<type>(0);

for (auto _ : state) {
cuda_event_timer timer(state, true);
auto result = cudf::replace_nans(input, *zero);
}
}

#define NANS_BENCHMARK_DEFINE(name, type, nulls) \
BENCHMARK_DEFINE_F(ReplaceNans, name) \
(::benchmark::State & state) { BM_replace_nans<type>(state, nulls); } \
BENCHMARK_REGISTER_F(ReplaceNans, name) \
->UseManualTime() \
->Arg(10000) /* 10k */ \
->Arg(100000) /* 100k */ \
->Arg(1000000) /* 1M */ \
->Arg(10000000) /* 10M */ \
->Arg(100000000); /* 100M */

NANS_BENCHMARK_DEFINE(float32_nulls, float, true);
NANS_BENCHMARK_DEFINE(float64_nulls, double, true);
NANS_BENCHMARK_DEFINE(float32_no_nulls, float, false);
NANS_BENCHMARK_DEFINE(float64_no_nulls, double, false);
34 changes: 9 additions & 25 deletions cpp/include/cudf/detail/copy_if_else.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -19,18 +19,11 @@
#include <cudf/column/column.hpp>
#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_factories.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/detail/utilities/integer_utils.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf/scalar/scalar_device_view.cuh>
#include <cudf/utilities/traits.hpp>
#include <cudf/utilities/type_dispatcher.hpp>

#include <rmm/device_scalar.hpp>

#include <cub/cub.cuh>

namespace cudf {
namespace detail {
namespace { // anonymous
Expand All @@ -40,7 +33,7 @@ template <size_type block_size,
typename LeftIter,
typename RightIter,
typename Filter,
bool has_validity>
bool has_nulls>
__launch_bounds__(block_size) __global__
void copy_if_else_kernel(LeftIter lhs,
RightIter rhs,
Expand Down Expand Up @@ -71,23 +64,14 @@ __launch_bounds__(block_size) __global__
size_type warp_cur = warp_begin + warp_id;
size_type index = tid;
while (warp_cur <= warp_end) {
bool in_range = (index >= begin && index < end);

bool valid = true;
if (has_validity) {
valid = in_range && (filter(index) ? thrust::get<1>(lhs[index]) : thrust::get<1>(rhs[index]));
}

// do the copy if-else
if (in_range) {
out.element<T>(index) = filter(index) ? static_cast<T>(thrust::get<0>(lhs[index]))
: static_cast<T>(thrust::get<0>(rhs[index]));
}
auto const opt_value =
(index < end) ? (filter(index) ? lhs[index] : rhs[index]) : thrust::nullopt;
if (not has_nulls or opt_value) { out.element<T>(index) = static_cast<T>(*opt_value); }

// update validity
if (has_validity) {
if (has_nulls) {
// the final validity mask for this warp
int warp_mask = __ballot_sync(0xFFFF'FFFF, valid && in_range);
int warp_mask = __ballot_sync(0xFFFF'FFFF, opt_value.has_value());
// only one guy in the warp needs to update the mask and count
if (lane_id == 0) {
out.set_mask_word(warp_cur, warp_mask);
Expand All @@ -100,7 +84,7 @@ __launch_bounds__(block_size) __global__
index += block_size * gridDim.x;
}

if (has_validity) {
if (has_nulls) {
// sum all null counts across all warps
size_type block_valid_count =
single_lane_block_sum_reduce<block_size, leader_lane>(warp_valid_count);
Expand Down Expand Up @@ -168,8 +152,8 @@ std::unique_ptr<column> copy_if_else(
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
{
using Element =
typename thrust::tuple_element<0, typename thrust::iterator_traits<LeftIter>::value_type>::type;
// This is the type of the thrust::optional element in the passed iterators
using Element = typename thrust::iterator_traits<LeftIter>::value_type::value_type;

size_type size = std::distance(lhs_begin, lhs_end);
size_type num_els = cudf::util::round_up_safe(size, warp_size);
Expand Down
Loading