Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use optional-iterator for copy-if-else kernel #9324

Merged
merged 21 commits into from
Oct 13, 2021
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
e0a24ef
add benchmarks for copy-if-else and replace-nans
davidwendt Sep 28, 2021
8f5d225
split up copy_tests into cu and cpp
davidwendt Sep 28, 2021
45eff13
Use optional-iterator for copy-if-else kernel
davidwendt Sep 28, 2021
8244fe6
Merge branch 'branch-21.12' into copy-cu-compile-time
davidwendt Sep 28, 2021
9d56d0b
add missing include
davidwendt Sep 28, 2021
86e3a16
Merge branch 'branch-21.12' into copy-cu-compile-time
davidwendt Sep 29, 2021
efac52d
create multi-block test and remove detail_copy_tests.cu
davidwendt Sep 29, 2021
9ed9c13
Merge branch 'branch-21.12' into copy-cu-compile-time
davidwendt Sep 29, 2021
3b9f51f
remove valid var
davidwendt Sep 30, 2021
f0fed62
use operator* instead of value()
davidwendt Sep 30, 2021
7e01ae9
fix merge conflict
davidwendt Oct 1, 2021
a5f1704
undo indexalator commit
davidwendt Oct 1, 2021
7b41ad4
Merge branch 'branch-21.12' into copy-cu-compile-time
davidwendt Oct 4, 2021
9e25f50
fix merge conflict
davidwendt Oct 4, 2021
2fcc93c
Merge branch 'branch-21.12' into copy-cu-compile-time
davidwendt Oct 5, 2021
c028d93
Merge branch 'branch-21.12' into copy-cu-compile-time
davidwendt Oct 5, 2021
1716b70
Merge branch 'branch-21.12' into copy-cu-compile-time
davidwendt Oct 5, 2021
55963f7
merge strings copy_if_else changes
davidwendt Oct 6, 2021
37c5136
remove unneeded include
davidwendt Oct 11, 2021
881f9a2
Merge branch 'branch-21.12' into copy-cu-compile-time
davidwendt Oct 11, 2021
5fd512d
Merge branch 'branch-21.12' into copy-cu-compile-time
davidwendt Oct 12, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,10 @@ ConfigureBench(CONTIGUOUS_SPLIT_BENCH copying/contiguous_split_benchmark.cu)
# - shift benchmark -------------------------------------------------------------------------------
ConfigureBench(SHIFT_BENCH copying/shift_benchmark.cu)

###################################################################################################
# - copy-if-else benchmark -----------------------------------------------------------------------------
ConfigureBench(COPY_IF_ELSE_BENCH copying/copy_if_else_benchmark.cpp)

###################################################################################################
# - transpose benchmark ---------------------------------------------------------------------------
ConfigureBench(TRANSPOSE_BENCH transpose/transpose_benchmark.cu)
Expand Down Expand Up @@ -141,7 +145,8 @@ ConfigureBench(REDUCTION_BENCH
###################################################################################################
# - reduction benchmark ---------------------------------------------------------------------------
ConfigureBench(REPLACE_BENCH
replace/clamp_benchmark.cpp)
replace/clamp_benchmark.cpp
replace/nans_benchmark.cpp)

###################################################################################################
# - filling benchmark -----------------------------------------------------------------------------
Expand Down
69 changes: 69 additions & 0 deletions cpp/benchmarks/copying/copy_if_else_benchmark.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmark/benchmark.h>
#include <benchmarks/common/generate_benchmark_input.hpp>
#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

#include <cudf/copying.hpp>

#include <rmm/device_buffer.hpp>

// #include <string>
davidwendt marked this conversation as resolved.
Show resolved Hide resolved

class CopyIfElse : public cudf::benchmark {
};

template <class TypeParam>
static void BM_copy_if_else(benchmark::State& state, bool nulls)
{
cudf::size_type const n_rows{(cudf::size_type)state.range(0)};
auto input_type = cudf::type_to_id<TypeParam>();
auto bool_type = cudf::type_id::BOOL8;
auto const input = create_random_table({input_type, input_type, bool_type}, 3, row_count{n_rows});

if (!nulls) {
input->get_column(2).set_null_mask(rmm::device_buffer{}, 0);
input->get_column(1).set_null_mask(rmm::device_buffer{}, 0);
input->get_column(0).set_null_mask(rmm::device_buffer{}, 0);
}

cudf::column_view decision(input->view().column(2));
cudf::column_view rhs(input->view().column(1));
cudf::column_view lhs(input->view().column(0));

for (auto _ : state) {
cuda_event_timer raii(state, true, rmm::cuda_stream_default);
cudf::copy_if_else(lhs, rhs, decision);
}
}

#define COPY_BENCHMARK_DEFINE(name, type, b) \
BENCHMARK_DEFINE_F(CopyIfElse, name) \
(::benchmark::State & st) { BM_copy_if_else<type>(st, b); } \
BENCHMARK_REGISTER_F(CopyIfElse, name) \
->RangeMultiplier(8) \
->Ranges({{1 << 12, 1 << 27}}) \
->UseManualTime() \
->Unit(benchmark::kMillisecond);

COPY_BENCHMARK_DEFINE(int16, int16_t, true)
COPY_BENCHMARK_DEFINE(uint32, uint32_t, true)
COPY_BENCHMARK_DEFINE(float64, double, true)
COPY_BENCHMARK_DEFINE(int16_no_nulls, int16_t, false)
COPY_BENCHMARK_DEFINE(uint32_no_nulls, uint32_t, false)
COPY_BENCHMARK_DEFINE(float64_no_nulls, double, false)
20 changes: 10 additions & 10 deletions cpp/benchmarks/replace/clamp_benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ class ReplaceClamp : public cudf::benchmark {
};

template <typename type>
static void BM_reduction_scan(benchmark::State& state, bool include_nulls)
static void BM_clamp(benchmark::State& state, bool include_nulls)
{
cudf::size_type const n_rows{(cudf::size_type)state.range(0)};
auto const dtype = cudf::type_to_id<type>();
Expand Down Expand Up @@ -58,15 +58,15 @@ static void BM_reduction_scan(benchmark::State& state, bool include_nulls)
}
}

#define CLAMP_BENCHMARK_DEFINE(name, type, nulls) \
BENCHMARK_DEFINE_F(ReplaceClamp, name) \
(::benchmark::State & state) { BM_reduction_scan<type>(state, nulls); } \
BENCHMARK_REGISTER_F(ReplaceClamp, name) \
->UseManualTime() \
->Arg(10000) /* 10k */ \
->Arg(100000) /* 100k */ \
->Arg(1000000) /* 1M */ \
->Arg(10000000) /* 10M */ \
#define CLAMP_BENCHMARK_DEFINE(name, type, nulls) \
BENCHMARK_DEFINE_F(ReplaceClamp, name) \
(::benchmark::State & state) { BM_clamp<type>(state, nulls); } \
BENCHMARK_REGISTER_F(ReplaceClamp, name) \
->UseManualTime() \
->Arg(10000) /* 10k */ \
->Arg(100000) /* 100k */ \
->Arg(1000000) /* 1M */ \
->Arg(10000000) /* 10M */ \
->Arg(100000000); /* 100M */

CLAMP_BENCHMARK_DEFINE(int8_no_nulls, int8_t, false);
Expand Down
63 changes: 63 additions & 0 deletions cpp/benchmarks/replace/nans_benchmark.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmark/benchmark.h>
#include <benchmarks/common/generate_benchmark_input.hpp>
#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/replace.hpp>
#include <cudf/scalar/scalar_factories.hpp>
#include <cudf/table/table.hpp>
#include <cudf/types.hpp>

class ReplaceNans : public cudf::benchmark {
};

template <typename type>
static void BM_replace_nans(benchmark::State& state, bool include_nulls)
{
cudf::size_type const n_rows{(cudf::size_type)state.range(0)};
auto const dtype = cudf::type_to_id<type>();
auto const table = create_random_table({dtype}, 1, row_count{n_rows});
if (!include_nulls) { table->get_column(0).set_null_mask(rmm::device_buffer{}, 0); }
cudf::column_view input(table->view().column(0));

auto zero = cudf::make_fixed_width_scalar<type>(0);

for (auto _ : state) {
cuda_event_timer timer(state, true);
auto result = cudf::replace_nans(input, *zero);
}
}

#define NANS_BENCHMARK_DEFINE(name, type, nulls) \
BENCHMARK_DEFINE_F(ReplaceNans, name) \
(::benchmark::State & state) { BM_replace_nans<type>(state, nulls); } \
BENCHMARK_REGISTER_F(ReplaceNans, name) \
->UseManualTime() \
->Arg(10000) /* 10k */ \
->Arg(100000) /* 100k */ \
->Arg(1000000) /* 1M */ \
->Arg(10000000) /* 10M */ \
->Arg(100000000); /* 100M */

NANS_BENCHMARK_DEFINE(float32_nulls, float, true);
NANS_BENCHMARK_DEFINE(float64_nulls, double, true);
NANS_BENCHMARK_DEFINE(float32_no_nulls, float, false);
NANS_BENCHMARK_DEFINE(float64_no_nulls, double, false);
31 changes: 9 additions & 22 deletions cpp/include/cudf/detail/copy_if_else.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -18,19 +18,12 @@

#include <cudf/column/column.hpp>
#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_view.hpp>
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/detail/utilities/integer_utils.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf/scalar/scalar_device_view.cuh>
#include <cudf/strings/detail/copy_if_else.cuh>
#include <cudf/utilities/traits.hpp>
#include <cudf/utilities/type_dispatcher.hpp>

#include <rmm/device_scalar.hpp>

#include <cub/cub.cuh>

namespace cudf {
namespace detail {
namespace { // anonymous
Expand Down Expand Up @@ -71,23 +64,17 @@ __launch_bounds__(block_size) __global__
size_type warp_cur = warp_begin + warp_id;
size_type index = tid;
while (warp_cur <= warp_end) {
bool in_range = (index >= begin && index < end);

bool valid = true;
if (has_validity) {
valid = in_range && (filter(index) ? thrust::get<1>(lhs[index]) : thrust::get<1>(rhs[index]));
}

// do the copy if-else
if (in_range) {
out.element<T>(index) = filter(index) ? static_cast<T>(thrust::get<0>(lhs[index]))
: static_cast<T>(thrust::get<0>(rhs[index]));
bool valid = false;
if (index >= begin && index < end) {
auto value = filter(index) ? lhs[index] : rhs[index];
valid = !has_validity || value.has_value();
if (valid) { out.element<T>(index) = static_cast<T>(value.value()); }
davidwendt marked this conversation as resolved.
Show resolved Hide resolved
}

// update validity
if (has_validity) {
// the final validity mask for this warp
int warp_mask = __ballot_sync(0xFFFF'FFFF, valid && in_range);
int warp_mask = __ballot_sync(0xFFFF'FFFF, valid);
davidwendt marked this conversation as resolved.
Show resolved Hide resolved
// only one guy in the warp needs to update the mask and count
if (lane_id == 0) {
out.set_mask_word(warp_cur, warp_mask);
Expand Down Expand Up @@ -168,8 +155,8 @@ std::unique_ptr<column> copy_if_else(
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
{
using Element =
typename thrust::tuple_element<0, typename thrust::iterator_traits<LeftIter>::value_type>::type;
// This is the type of the thrust::optional element in the passed iterators
using Element = typename thrust::iterator_traits<LeftIter>::value_type::value_type;

size_type size = std::distance(lhs_begin, lhs_end);
size_type num_els = cudf::util::round_up_safe(size, warp_size);
Expand Down
91 changes: 87 additions & 4 deletions cpp/include/cudf/detail/indexalator.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -484,7 +484,7 @@ struct indexalator_factory {
/**
* @brief An index accessor that returns a validity flag along with the index value.
*
* This is suitable as a `pair_iterator` for calling functions like `copy_if_else`.
* This is suitable as a `pair_iterator`.
*/
struct nullable_index_accessor {
input_indexalator iter;
Expand All @@ -502,17 +502,32 @@ struct indexalator_factory {
iter = make_input_iterator(col);
}

__device__ thrust::pair<size_type, bool> operator()(size_type i) const
{
return {iter[i], (has_nulls ? bit_is_set(null_mask, i + offset) : true)};
}
};

/**
* @brief An index accessor that returns a validity flag along with the index value.
*
* This is suitable as a `pair_iterator`.
*/
struct scalar_nullable_index_accessor {
input_indexalator iter;
bool const is_null;

/**
* @brief Create an accessor from a scalar.
*/
nullable_index_accessor(scalar const& input) : has_nulls{!input.is_valid()}
scalar_nullable_index_accessor(scalar const& input) : is_null{!input.is_valid()}
{
iter = indexalator_factory::make_input_iterator(input);
}

__device__ thrust::pair<size_type, bool> operator()(size_type i) const
{
return {iter[i], (has_nulls ? bit_is_set(null_mask, i + offset) : true)};
return {*iter, is_null};
}
};

Expand All @@ -530,7 +545,75 @@ struct indexalator_factory {
static auto make_input_pair_iterator(scalar const& input)
{
return thrust::make_transform_iterator(thrust::make_constant_iterator<size_type>(0),
nullable_index_accessor{input});
scalar_nullable_index_accessor{input});
}

/**
* @brief An index accessor that returns an index value if corresponding validity flag is true.
*
* This is suitable as an `optional_iterator`.
*/
struct optional_index_accessor {
input_indexalator iter;
bitmask_type const* null_mask{};
size_type const offset{};
bool const has_nulls{};

/**
* @brief Create an accessor from a column_view.
*/
optional_index_accessor(column_view const& col, bool has_nulls = false)
: null_mask{col.null_mask()}, offset{col.offset()}, has_nulls{has_nulls}
{
if (has_nulls) { CUDF_EXPECTS(col.nullable(), "Unexpected non-nullable column."); }
iter = make_input_iterator(col);
}

__device__ thrust::optional<size_type> operator()(size_type i) const
{
return has_nulls && !bit_is_set(null_mask, i + offset) ? thrust::nullopt
: thrust::make_optional(iter[i]);
}
};

/**
* @brief An index accessor that returns an index value if corresponding validity flag is true.
*
* This is suitable as an `optional_iterator`.
*/
struct scalar_optional_index_accessor {
input_indexalator iter;
bool const is_null;

/**
* @brief Create an accessor from a scalar.
*/
scalar_optional_index_accessor(scalar const& input) : is_null{!input.is_valid()}
{
iter = indexalator_factory::make_input_iterator(input);
}

__device__ thrust::optional<size_type> operator()(size_type i) const
{
return is_null ? thrust::nullopt : thrust::make_optional(*iter);
}
};

/**
* @brief Create an index iterator with a nullable index accessor.
*/
static auto make_input_optional_iterator(column_view const& col)
{
return make_counting_transform_iterator(0, optional_index_accessor{col, col.has_nulls()});
}

/**
* @brief Create an index iterator with a nullable index accessor for a scalar.
*/
static auto make_input_optional_iterator(scalar const& input)
{
return thrust::make_transform_iterator(thrust::make_constant_iterator<size_type>(0),
scalar_optional_index_accessor{input});
}
};

Expand Down
Loading