Skip to content

Commit

Permalink
Enable AST-based joining (#8214)
Browse files Browse the repository at this point in the history
This PR implements conditional joins using expressions that are decomposed into abstract syntax trees for evaluation. This PR builds on the AST evaluation framework established in #5494 and #7418, but significantly refactors the internals and generalizes them to enable 1) expressions on two tables and 2) operations on nullable columns. This PR uses the nested loop join code created in #5397 for inner joins, but also substantially generalizes that code to enable 1) all types of joins, 2) joins with arbitrary AST expressions rather than just equality, and 3) handling of null values (with user-specified `null_equality`). A significant chunk of the code is currently out of place, but since this changeset is rather large I've opted not to move things in ways that will make reviewing this PR significantly more challenging. I will make a follow-up to address those issues once this PR is merged.

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Jake Hemstad (https://github.com/jrhemstad)
  - Conor Hoekstra (https://github.com/codereport)

URL: #8214
  • Loading branch information
vyasr authored Jul 14, 2021
1 parent d05de97 commit 397bf0a
Show file tree
Hide file tree
Showing 16 changed files with 2,488 additions and 496 deletions.
2 changes: 1 addition & 1 deletion cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ ConfigureBench(STREAM_COMPACTION_BENCH stream_compaction/drop_duplicates_benchma

###################################################################################################
# - join benchmark --------------------------------------------------------------------------------
ConfigureBench(JOIN_BENCH join/join_benchmark.cu)
ConfigureBench(JOIN_BENCH join/join_benchmark.cu join/conditional_join_benchmark.cu)

###################################################################################################
# - iterator benchmark ----------------------------------------------------------------------------
Expand Down
76 changes: 53 additions & 23 deletions cpp/benchmarks/ast/transform_benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,21 +30,21 @@
#include <thrust/iterator/counting_iterator.h>

#include <algorithm>
#include <iostream>
#include <list>
#include <numeric>
#include <random>
#include <vector>

enum class TreeType {
IMBALANCED_LEFT // All operator expressions have a left child operator expression and a right
// child column reference
};

template <typename key_type, TreeType tree_type, bool reuse_columns>
template <typename key_type, TreeType tree_type, bool reuse_columns, bool Nullable>
class AST : public cudf::benchmark {
};

template <typename key_type, TreeType tree_type, bool reuse_columns>
template <typename key_type, TreeType tree_type, bool reuse_columns, bool Nullable>
static void BM_ast_transform(benchmark::State& state)
{
const cudf::size_type table_size{(cudf::size_type)state.range(0)};
Expand All @@ -56,10 +56,24 @@ static void BM_ast_transform(benchmark::State& state)
auto columns = std::vector<cudf::column_view>(n_cols);

auto data_iterator = thrust::make_counting_iterator(0);
std::generate_n(column_wrappers.begin(), n_cols, [=]() {
return cudf::test::fixed_width_column_wrapper<key_type>(data_iterator,
data_iterator + table_size);
});

if constexpr (Nullable) {
auto validities = std::vector<bool>(table_size);
std::random_device rd;
std::mt19937 gen(rd());

std::generate(
validities.begin(), validities.end(), [&]() { return gen() > (0.5 * gen.max()); });
std::generate_n(column_wrappers.begin(), n_cols, [=]() {
return cudf::test::fixed_width_column_wrapper<key_type>(
data_iterator, data_iterator + table_size, validities.begin());
});
} else {
std::generate_n(column_wrappers.begin(), n_cols, [=]() {
return cudf::test::fixed_width_column_wrapper<key_type>(data_iterator,
data_iterator + table_size);
});
}
std::transform(
column_wrappers.begin(), column_wrappers.end(), columns.begin(), [](auto const& col) {
return static_cast<cudf::column_view>(col);
Expand Down Expand Up @@ -113,22 +127,23 @@ static void BM_ast_transform(benchmark::State& state)
(tree_levels + 1) * sizeof(key_type));
}

#define AST_TRANSFORM_BENCHMARK_DEFINE(name, key_type, tree_type, reuse_columns) \
BENCHMARK_TEMPLATE_DEFINE_F(AST, name, key_type, tree_type, reuse_columns) \
(::benchmark::State & st) { BM_ast_transform<key_type, tree_type, reuse_columns>(st); }

AST_TRANSFORM_BENCHMARK_DEFINE(ast_int32_imbalanced_unique,
int32_t,
TreeType::IMBALANCED_LEFT,
false);
AST_TRANSFORM_BENCHMARK_DEFINE(ast_int32_imbalanced_reuse,
int32_t,
TreeType::IMBALANCED_LEFT,
true);
AST_TRANSFORM_BENCHMARK_DEFINE(ast_double_imbalanced_unique,
double,
TreeType::IMBALANCED_LEFT,
false);
#define AST_TRANSFORM_BENCHMARK_DEFINE(name, key_type, tree_type, reuse_columns, nullable) \
BENCHMARK_TEMPLATE_DEFINE_F(AST, name, key_type, tree_type, reuse_columns, nullable) \
(::benchmark::State & st) { BM_ast_transform<key_type, tree_type, reuse_columns, nullable>(st); }

AST_TRANSFORM_BENCHMARK_DEFINE(
ast_int32_imbalanced_unique, int32_t, TreeType::IMBALANCED_LEFT, false, false);
AST_TRANSFORM_BENCHMARK_DEFINE(
ast_int32_imbalanced_reuse, int32_t, TreeType::IMBALANCED_LEFT, true, false);
AST_TRANSFORM_BENCHMARK_DEFINE(
ast_double_imbalanced_unique, double, TreeType::IMBALANCED_LEFT, false, false);

AST_TRANSFORM_BENCHMARK_DEFINE(
ast_int32_imbalanced_unique_nulls, int32_t, TreeType::IMBALANCED_LEFT, false, true);
AST_TRANSFORM_BENCHMARK_DEFINE(
ast_int32_imbalanced_reuse_nulls, int32_t, TreeType::IMBALANCED_LEFT, true, true);
AST_TRANSFORM_BENCHMARK_DEFINE(
ast_double_imbalanced_unique_nulls, double, TreeType::IMBALANCED_LEFT, false, true);

static void CustomRanges(benchmark::internal::Benchmark* b)
{
Expand All @@ -155,3 +170,18 @@ BENCHMARK_REGISTER_F(AST, ast_double_imbalanced_unique)
->Apply(CustomRanges)
->Unit(benchmark::kMillisecond)
->UseManualTime();

BENCHMARK_REGISTER_F(AST, ast_int32_imbalanced_unique_nulls)
->Apply(CustomRanges)
->Unit(benchmark::kMillisecond)
->UseManualTime();

BENCHMARK_REGISTER_F(AST, ast_int32_imbalanced_reuse_nulls)
->Apply(CustomRanges)
->Unit(benchmark::kMillisecond)
->UseManualTime();

BENCHMARK_REGISTER_F(AST, ast_double_imbalanced_unique_nulls)
->Apply(CustomRanges)
->Unit(benchmark::kMillisecond)
->UseManualTime();
Loading

0 comments on commit 397bf0a

Please sign in to comment.