Skip to content

Commit

Permalink
Make the shared memory intermediate storage a parameter for evaluate …
Browse files Browse the repository at this point in the history
…rather than the constructor and forward it along.
  • Loading branch information
vyasr committed Sep 9, 2021
1 parent e2ede0b commit 8bffd58
Show file tree
Hide file tree
Showing 3 changed files with 65 additions and 54 deletions.
103 changes: 56 additions & 47 deletions cpp/include/cudf/ast/detail/expression_evaluator.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -242,9 +242,8 @@ struct expression_evaluator {
*/
__device__ expression_evaluator(table_device_view const& left,
table_device_view const& right,
expression_device_view const& plan,
IntermediateDataType<has_nulls>* thread_intermediate_storage)
: left(left), right(right), plan(plan), thread_intermediate_storage(thread_intermediate_storage)
expression_device_view const& plan)
: left(left), right(right), plan(plan)
{
}

Expand All @@ -257,12 +256,8 @@ struct expression_evaluator {
* storing intermediates.
*/
__device__ expression_evaluator(table_device_view const& table,
expression_device_view const& plan,
IntermediateDataType<has_nulls>* thread_intermediate_storage)
: left(table),
right(table),
plan(plan),
thread_intermediate_storage(thread_intermediate_storage)
expression_device_view const& plan)
: left(table), right(table), plan(plan)
{
}

Expand All @@ -282,6 +277,7 @@ struct expression_evaluator {
template <typename Element, CUDF_ENABLE_IF(column_device_view::has_element_accessor<Element>())>
__device__ possibly_null_value_t<Element, has_nulls> resolve_input(
detail::device_data_reference const& input_reference,
IntermediateDataType<has_nulls>* thread_intermediate_storage,
cudf::size_type left_row_index,
thrust::optional<cudf::size_type> right_row_index = {}) const
{
Expand Down Expand Up @@ -331,6 +327,7 @@ struct expression_evaluator {
CUDF_ENABLE_IF(not column_device_view::has_element_accessor<Element>())>
__device__ possibly_null_value_t<Element, has_nulls> resolve_input(
detail::device_data_reference const& device_data_reference,
IntermediateDataType<has_nulls>* thread_intermediate_storage,
cudf::size_type left_row_index,
thrust::optional<cudf::size_type> right_row_index = {}) const
{
Expand Down Expand Up @@ -358,15 +355,18 @@ struct expression_evaluator {
detail::device_data_reference const& input,
detail::device_data_reference const& output,
cudf::size_type const output_row_index,
ast_operator const op) const
ast_operator const op,
IntermediateDataType<has_nulls>* thread_intermediate_storage) const
{
auto const typed_input = resolve_input<Input>(input, input_row_index);
auto const typed_input =
resolve_input<Input>(input, thread_intermediate_storage, input_row_index);
ast_operator_dispatcher(op,
unary_expression_output_handler<Input>(*this),
unary_expression_output_handler<Input>{},
output_object,
output_row_index,
typed_input,
output);
output,
thread_intermediate_storage);
}

/**
Expand All @@ -393,17 +393,21 @@ struct expression_evaluator {
detail::device_data_reference const& rhs,
detail::device_data_reference const& output,
cudf::size_type const output_row_index,
ast_operator const op) const
ast_operator const op,
IntermediateDataType<has_nulls>* thread_intermediate_storage) const
{
auto const typed_lhs = resolve_input<LHS>(lhs, left_row_index, right_row_index);
auto const typed_rhs = resolve_input<RHS>(rhs, left_row_index, right_row_index);
auto const typed_lhs =
resolve_input<LHS>(lhs, thread_intermediate_storage, left_row_index, right_row_index);
auto const typed_rhs =
resolve_input<RHS>(rhs, thread_intermediate_storage, left_row_index, right_row_index);
ast_operator_dispatcher(op,
binary_expression_output_handler<LHS, RHS>(*this),
binary_expression_output_handler<LHS, RHS>{},
output_object,
output_row_index,
typed_lhs,
typed_rhs,
output);
output,
thread_intermediate_storage);
}

/**
Expand All @@ -417,9 +421,11 @@ struct expression_evaluator {
* @param row_index Row index of all input and output data column(s).
*/
template <typename OutputType>
__device__ void evaluate(OutputType& output_object, cudf::size_type const row_index)
__device__ void evaluate(OutputType& output_object,
cudf::size_type const row_index,
IntermediateDataType<has_nulls>* thread_intermediate_storage)
{
evaluate(output_object, row_index, row_index, row_index);
evaluate(output_object, row_index, row_index, row_index, thread_intermediate_storage);
}

/**
Expand All @@ -438,17 +444,20 @@ struct expression_evaluator {
__device__ void evaluate(OutputType& output_object,
cudf::size_type const left_row_index,
cudf::size_type const right_row_index,
cudf::size_type const output_row_index)
cudf::size_type const output_row_index,
IntermediateDataType<has_nulls>* thread_intermediate_storage)
{
// TODO: Attempting to pass a reference to the device data reference is
// significantly faster for the non-null code path, but actually
// _degrades_ performance for the nullable code path. The reasons are
// likely due to subtle tradeoffs in what is getting put into registers
// when passing around raw values vs optionals, so for now I'm
// referencing shared mem directly for the non-null code path but copying
// locally for the nullable code path.
// locally for the nullable code path. I'll keep tweaking this since it's
// likely to be extremely sensitive to small changes in the kernel as I
// slowly manage to reduce local state and relieve register pressure.
using device_data_reference_t = std::conditional_t<has_nulls,
detail::device_data_reference const,
detail::device_data_reference const&,
detail::device_data_reference const&>;

cudf::size_type operator_source_index{0};
Expand All @@ -472,7 +481,8 @@ struct expression_evaluator {
input,
output,
output_row_index,
op);
op,
thread_intermediate_storage);
} else if (arity == 2) {
// Binary operator
device_data_reference_t lhs =
Expand All @@ -491,7 +501,8 @@ struct expression_evaluator {
rhs,
output,
output_row_index,
op);
op,
thread_intermediate_storage);
} else {
cudf_assert(false && "Invalid operator arity.");
}
Expand All @@ -509,10 +520,7 @@ struct expression_evaluator {
*/
struct expression_output_handler {
public:
__device__ expression_output_handler(expression_evaluator<has_nulls> const& evaluator)
: evaluator(evaluator)
{
}
__device__ expression_output_handler() {}

/**
* @brief Resolves an output data reference and assigns result value.
Expand All @@ -535,6 +543,7 @@ struct expression_evaluator {
__device__ void resolve_output(OutputType& output_object,
detail::device_data_reference const& device_data_reference,
cudf::size_type const row_index,
IntermediateDataType<has_nulls>* thread_intermediate_storage,
possibly_null_value_t<Element, has_nulls> const& result) const
{
if (device_data_reference.reference_type == detail::device_data_reference_type::COLUMN) {
Expand All @@ -544,7 +553,7 @@ struct expression_evaluator {
// Using a temporary variable ensures that the compiler knows the result is aligned.
IntermediateDataType<has_nulls> tmp;
memcpy(&tmp, &result, sizeof(possibly_null_value_t<Element, has_nulls>));
evaluator.thread_intermediate_storage[device_data_reference.data_index] = tmp;
thread_intermediate_storage[device_data_reference.data_index] = tmp;
}
}

Expand All @@ -554,13 +563,11 @@ struct expression_evaluator {
__device__ void resolve_output(OutputType& output_object,
detail::device_data_reference const& device_data_reference,
cudf::size_type const row_index,
IntermediateDataType<has_nulls>* thread_intermediate_storage,
possibly_null_value_t<Element, has_nulls> const& result) const
{
cudf_assert(false && "Invalid type in resolve_output.");
}

protected:
expression_evaluator<has_nulls> const& evaluator;
};

/**
Expand All @@ -571,10 +578,7 @@ struct expression_evaluator {
*/
template <typename Input>
struct unary_expression_output_handler : public expression_output_handler {
__device__ unary_expression_output_handler(expression_evaluator<has_nulls> const& evaluator)
: expression_output_handler(evaluator)
{
}
__device__ unary_expression_output_handler() {}

/**
* @brief Callable to perform a unary operation.
Expand All @@ -595,13 +599,17 @@ struct expression_evaluator {
__device__ void operator()(OutputType& output_object,
cudf::size_type const output_row_index,
possibly_null_value_t<Input, has_nulls> const& input,
detail::device_data_reference const& output) const
detail::device_data_reference const& output,
IntermediateDataType<has_nulls>* thread_intermediate_storage) const
{
// The output data type is the same whether or not nulls are present, so
// pull from the non-nullable operator.
using Out = cuda::std::invoke_result_t<detail::operator_functor<op, false>, Input>;
this->template resolve_output<Out>(
output_object, output, output_row_index, detail::operator_functor<op, has_nulls>{}(input));
this->template resolve_output<Out>(output_object,
output,
output_row_index,
thread_intermediate_storage,
detail::operator_functor<op, has_nulls>{}(input));
}

template <ast_operator op,
Expand All @@ -612,7 +620,8 @@ struct expression_evaluator {
__device__ void operator()(OutputType& output_object,
cudf::size_type const output_row_index,
possibly_null_value_t<Input, has_nulls> const& input,
detail::device_data_reference const& output) const
detail::device_data_reference const& output,
IntermediateDataType<has_nulls>* thread_intermediate_storage) const
{
cudf_assert(false && "Invalid unary dispatch operator for the provided input.");
}
Expand All @@ -626,10 +635,7 @@ struct expression_evaluator {
*/
template <typename LHS, typename RHS>
struct binary_expression_output_handler : public expression_output_handler {
__device__ binary_expression_output_handler(expression_evaluator<has_nulls> const& evaluator)
: expression_output_handler(evaluator)
{
}
__device__ binary_expression_output_handler() {}

/**
* @brief Callable to perform a binary operation.
Expand All @@ -653,14 +659,16 @@ struct expression_evaluator {
cudf::size_type const output_row_index,
possibly_null_value_t<LHS, has_nulls> const& lhs,
possibly_null_value_t<RHS, has_nulls> const& rhs,
detail::device_data_reference const& output) const
detail::device_data_reference const& output,
IntermediateDataType<has_nulls>* thread_intermediate_storage) const
{
// The output data type is the same whether or not nulls are present, so
// pull from the non-nullable operator.
using Out = cuda::std::invoke_result_t<detail::operator_functor<op, false>, LHS, RHS>;
this->template resolve_output<Out>(output_object,
output,
output_row_index,
thread_intermediate_storage,
detail::operator_functor<op, has_nulls>{}(lhs, rhs));
}

Expand All @@ -674,7 +682,8 @@ struct expression_evaluator {
cudf::size_type const output_row_index,
possibly_null_value_t<LHS, has_nulls> const& lhs,
possibly_null_value_t<RHS, has_nulls> const& rhs,
detail::device_data_reference const& output) const
detail::device_data_reference const& output,
IntermediateDataType<has_nulls>* thread_intermediate_storage) const
{
cudf_assert(false && "Invalid binary dispatch operator for the provided input.");
}
Expand Down
10 changes: 6 additions & 4 deletions cpp/src/join/conditional_join_kernels.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -70,14 +70,15 @@ __global__ void compute_conditional_join_output_size(
cudf::size_type const right_num_rows = right_table.num_rows();

auto evaluator = cudf::ast::detail::expression_evaluator<has_nulls>(
left_table, right_table, device_expression_data, thread_intermediate_storage);
left_table, right_table, device_expression_data);

for (cudf::size_type left_row_index = left_start_idx; left_row_index < left_num_rows;
left_row_index += left_stride) {
bool found_match = false;
for (cudf::size_type right_row_index = 0; right_row_index < right_num_rows; right_row_index++) {
auto output_dest = cudf::ast::detail::value_expression_result<bool, has_nulls>();
evaluator.evaluate(output_dest, left_row_index, right_row_index, 0);
evaluator.evaluate(
output_dest, left_row_index, right_row_index, 0, thread_intermediate_storage);
if (output_dest.is_valid() && output_dest.value()) {
if ((join_type != join_kind::LEFT_ANTI_JOIN) &&
!(join_type == join_kind::LEFT_SEMI_JOIN && found_match)) {
Expand Down Expand Up @@ -161,13 +162,14 @@ __global__ void conditional_join(table_device_view left_table,
unsigned int const activemask = __ballot_sync(0xffffffff, left_row_index < left_num_rows);

auto evaluator = cudf::ast::detail::expression_evaluator<has_nulls>(
left_table, right_table, device_expression_data, thread_intermediate_storage);
left_table, right_table, device_expression_data);

if (left_row_index < left_num_rows) {
bool found_match = false;
for (size_type right_row_index(0); right_row_index < right_num_rows; ++right_row_index) {
auto output_dest = cudf::ast::detail::value_expression_result<bool, has_nulls>();
evaluator.evaluate(output_dest, left_row_index, right_row_index, 0);
evaluator.evaluate(
output_dest, left_row_index, right_row_index, 0, thread_intermediate_storage);

if (output_dest.is_valid() && output_dest.value()) {
// If the rows are equal, then we have found a true match
Expand Down
6 changes: 3 additions & 3 deletions cpp/src/transform/compute_column.cu
Original file line number Diff line number Diff line change
Expand Up @@ -69,12 +69,12 @@ __launch_bounds__(max_block_size) __global__
&intermediate_storage[threadIdx.x * device_expression_data.num_intermediates];
auto const start_idx = static_cast<cudf::size_type>(threadIdx.x + blockIdx.x * blockDim.x);
auto const stride = static_cast<cudf::size_type>(blockDim.x * gridDim.x);
auto evaluator = cudf::ast::detail::expression_evaluator<has_nulls>(
table, device_expression_data, thread_intermediate_storage);
auto evaluator =
cudf::ast::detail::expression_evaluator<has_nulls>(table, device_expression_data);

for (cudf::size_type row_index = start_idx; row_index < table.num_rows(); row_index += stride) {
auto output_dest = ast::detail::mutable_column_expression_result<has_nulls>(output_column);
evaluator.evaluate(output_dest, row_index);
evaluator.evaluate(output_dest, row_index, thread_intermediate_storage);
}
}

Expand Down

0 comments on commit 8bffd58

Please sign in to comment.