From c2cec0592be665ed1abd863035a3cc29d20bafb9 Mon Sep 17 00:00:00 2001 From: Conor Hoekstra Date: Mon, 22 Feb 2021 08:15:10 -0500 Subject: [PATCH 01/18] Large unit tests --- cpp/tests/ast/transform_tests.cpp | 66 +++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/cpp/tests/ast/transform_tests.cpp b/cpp/tests/ast/transform_tests.cpp index 8f4a46e2a54..74937d4deea 100644 --- a/cpp/tests/ast/transform_tests.cpp +++ b/cpp/tests/ast/transform_tests.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -30,6 +31,8 @@ #include #include +#include + #include #include @@ -55,6 +58,22 @@ TEST_F(TransformTest, BasicAddition) cudf::test::expect_columns_equal(expected, result->view(), true); } +TEST_F(TransformTest, BasicAdditionLarge) +{ + auto a = thrust::make_counting_iterator(0); + auto col = column_wrapper(a, a + 2000); + auto table = cudf::table_view{{col, col}}; + + auto col_ref = cudf::ast::column_reference(0); + auto expression = cudf::ast::expression(cudf::ast::ast_operator::ADD, col_ref, col_ref); + + auto b = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i * 2; }); + auto expected = column_wrapper(b, b + 2000); + auto result = cudf::ast::compute_column(table, expression); + + cudf::test::expect_columns_equal(expected, result->view(), true); +} + TEST_F(TransformTest, LessComparator) { auto c_0 = column_wrapper{3, 20, 1, 50}; @@ -71,6 +90,25 @@ TEST_F(TransformTest, LessComparator) cudf::test::expect_columns_equal(expected, result->view(), true); } +TEST_F(TransformTest, LessComparatorLarge) +{ + auto a = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i * 2; }); + auto b = thrust::make_counting_iterator(500); + auto c_0 = column_wrapper(a, a + 2000); + auto c_1 = column_wrapper(b, b + 2000); + auto table = cudf::table_view{{c_0, c_1}}; + + auto col_ref_0 = cudf::ast::column_reference(0); + auto col_ref_1 = cudf::ast::column_reference(1); + auto expression = cudf::ast::expression(cudf::ast::ast_operator::LESS, col_ref_0, col_ref_1); + + auto c = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i < 500; }); + auto expected = column_wrapper(c, c + 2000); + auto result = cudf::ast::compute_column(table, expression); + + cudf::test::expect_columns_equal(expected, result->view(), true); +} + TEST_F(TransformTest, MultiLevelTreeArithmetic) { auto c_0 = column_wrapper{3, 20, 1, 50}; @@ -97,6 +135,34 @@ TEST_F(TransformTest, MultiLevelTreeArithmetic) cudf::test::expect_columns_equal(expected, result->view(), true); } +TEST_F(TransformTest, MultiLevelTreeArithmeticLarge) +{ + using namespace cudf::ast; + + auto a = thrust::make_counting_iterator(0); + auto b = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i + 1; }); + auto c = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i * 2; }); + auto c_0 = column_wrapper(a, a + 2000); + auto c_1 = column_wrapper(b, b + 2000); + auto c_2 = column_wrapper(c, c + 2000); + auto table = cudf::table_view{{c_0, c_1, c_2}}; + + auto col_ref_0 = column_reference(0); + auto col_ref_1 = column_reference(1); + auto col_ref_2 = column_reference(2); + + auto expr_left_subtree = expression(cudf::ast::ast_operator::MUL, col_ref_0, col_ref_1); + auto expr_right_subtree = expression(cudf::ast::ast_operator::ADD, col_ref_2, col_ref_0); + auto expr_tree = expression(ast_operator::SUB, expr_left_subtree, expr_right_subtree); + + auto result = cudf::ast::compute_column(table, expr_tree); + auto calc = [](auto i) { return (i * (i + 1)) - (i + (i * 2)); }; + auto d = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { return calc(i); }); + auto expected = column_wrapper(d, d + 2000); + + cudf::test::expect_columns_equal(expected, result->view(), true); +} + TEST_F(TransformTest, ImbalancedTreeArithmetic) { auto c_0 = column_wrapper{0.15, 0.37, 4.2, 21.3}; From 1eae304bf06855ef2a4619be67626041b4093d0f Mon Sep 17 00:00:00 2001 From: Conor Hoekstra Date: Mon, 22 Feb 2021 08:58:19 -0500 Subject: [PATCH 02/18] Move AST plan initialization to ctor --- cpp/include/cudf/ast/detail/transform.cuh | 9 ++++++++- cpp/src/ast/transform.cu | 22 ++++++---------------- 2 files changed, 14 insertions(+), 17 deletions(-) diff --git a/cpp/include/cudf/ast/detail/transform.cuh b/cpp/include/cudf/ast/detail/transform.cuh index ee08742d871..e7d2cb6291a 100644 --- a/cpp/include/cudf/ast/detail/transform.cuh +++ b/cpp/include/cudf/ast/detail/transform.cuh @@ -15,6 +15,7 @@ */ #pragma once +#include #include #include #include @@ -318,7 +319,13 @@ __device__ void evaluate_row_expression(detail::row_evaluator const& evaluator, struct ast_plan { public: - ast_plan() : sizes(), data_pointers() {} + ast_plan(linearizer const& expr_linearizer) : sizes(), data_pointers() + { + add_to_plan(expr_linearizer.data_references()); + add_to_plan(expr_linearizer.literals()); + add_to_plan(expr_linearizer.operators()); + add_to_plan(expr_linearizer.operator_source_indices()); + } using buffer_type = std::pair, int>; diff --git a/cpp/src/ast/transform.cu b/cpp/src/ast/transform.cu index bc055d46869..dbba25c3428 100644 --- a/cpp/src/ast/transform.cu +++ b/cpp/src/ast/transform.cu @@ -90,21 +90,10 @@ std::unique_ptr compute_column(table_view const table, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - // Linearize the AST - auto const expr_linearizer = linearizer(expr, table); - auto const data_references = expr_linearizer.data_references(); - auto const literals = expr_linearizer.literals(); - auto const operators = expr_linearizer.operators(); - auto const num_operators = cudf::size_type(operators.size()); - auto const operator_source_indices = expr_linearizer.operator_source_indices(); - auto const expr_data_type = expr_linearizer.root_data_type(); - - // Create ast_plan and device buffer - auto plan = ast_plan(); - plan.add_to_plan(data_references); - plan.add_to_plan(literals); - plan.add_to_plan(operators); - plan.add_to_plan(operator_source_indices); + auto const expr_linearizer = linearizer(expr, table); // Linearize the AST + auto const plan = ast_plan(expr_linearizer); // Create ast_plan + + // Create device buffer auto const host_data_buffer = plan.get_host_data_buffer(); auto const buffer_offsets = plan.get_offsets(); auto const buffer_size = host_data_buffer.second; @@ -131,7 +120,7 @@ std::unique_ptr compute_column(table_view const table, // Prepare output column auto output_column = cudf::make_fixed_width_column( - expr_data_type, table_num_rows, mask_state::UNALLOCATED, stream, mr); + expr_linearizer.root_data_type(), table_num_rows, mask_state::UNALLOCATED, stream, mr); auto mutable_output_device = cudf::mutable_column_device_view::create(output_column->mutable_view(), stream); @@ -150,6 +139,7 @@ std::unique_ptr compute_column(table_view const table, : MAX_BLOCK_SIZE; auto const config = cudf::detail::grid_1d{table_num_rows, block_size}; auto const shmem_size_per_block = shmem_size_per_thread * config.num_threads_per_block; + auto const num_operators = static_cast(expr_linearizer.operators().size()); // Execute the kernel cudf::ast::detail::compute_column_kernel From 5af4c599ed212ba17227e6468c23da412578e8a3 Mon Sep 17 00:00:00 2001 From: Conor Hoekstra Date: Mon, 22 Feb 2021 09:40:21 -0500 Subject: [PATCH 03/18] AST Plan docs + more cleanup --- cpp/include/cudf/ast/detail/transform.cuh | 43 +++++++++++++++-------- cpp/src/ast/transform.cu | 4 +-- 2 files changed, 30 insertions(+), 17 deletions(-) diff --git a/cpp/include/cudf/ast/detail/transform.cuh b/cpp/include/cudf/ast/detail/transform.cuh index e7d2cb6291a..fe20bfcb56b 100644 --- a/cpp/include/cudf/ast/detail/transform.cuh +++ b/cpp/include/cudf/ast/detail/transform.cuh @@ -319,7 +319,7 @@ __device__ void evaluate_row_expression(detail::row_evaluator const& evaluator, struct ast_plan { public: - ast_plan(linearizer const& expr_linearizer) : sizes(), data_pointers() + ast_plan(linearizer const& expr_linearizer) : _sizes{}, _data_pointers{} { add_to_plan(expr_linearizer.data_references()); add_to_plan(expr_linearizer.literals()); @@ -329,37 +329,50 @@ struct ast_plan { using buffer_type = std::pair, int>; + /** + * @brief Helper function for adding components (operators, literals, etc) to AST plan + * + * @tparam T The underlying type of the input `std::vector` + * @param v The `std::vector` containing components (operators, literals, etc) + */ template void add_to_plan(std::vector const& v) { auto const data_size = sizeof(T) * v.size(); - sizes.push_back(data_size); - data_pointers.push_back(v.data()); + _sizes.push_back(data_size); + _data_pointers.push_back(v.data()); } - buffer_type get_host_data_buffer() const + /** + * @brief Create and return host buffer + * + * @return `std::pair` containing host buffer and buffer size + */ + buffer_type host_data_buffer() const { - auto const total_size = std::accumulate(sizes.cbegin(), sizes.cend(), 0); + auto const total_size = std::accumulate(_sizes.cbegin(), _sizes.cend(), 0); auto host_data_buffer = std::make_unique(total_size); - auto const offsets = get_offsets(); - for (unsigned int i = 0; i < data_pointers.size(); ++i) { - std::memcpy(host_data_buffer.get() + offsets[i], data_pointers[i], sizes[i]); - } + for (unsigned int i = 0; i < _data_pointers.size(); ++i) + std::memcpy(host_data_buffer.get() + offsets()[i], _data_pointers[i], _sizes[i]); return std::make_pair(std::move(host_data_buffer), total_size); } - std::vector get_offsets() const + /** + * @brief Returns a `std::vector` of offsets into `data_pointers` + * + * @return `std::vector` of offsets into `data_pointers` + */ + std::vector offsets() const { - auto offsets = std::vector(sizes.size()); + auto offsets = std::vector(_sizes.size()); // When C++17, use std::exclusive_scan - offsets[0] = 0; - std::partial_sum(sizes.cbegin(), sizes.cend() - 1, offsets.begin() + 1); + std::partial_sum(_sizes.cbegin(), _sizes.cend() - 1, offsets.begin() + 1); return offsets; } private: - std::vector sizes; - std::vector data_pointers; + std::vector _sizes; + std::vector _data_pointers; }; /** diff --git a/cpp/src/ast/transform.cu b/cpp/src/ast/transform.cu index dbba25c3428..e2db17b48ae 100644 --- a/cpp/src/ast/transform.cu +++ b/cpp/src/ast/transform.cu @@ -94,8 +94,8 @@ std::unique_ptr compute_column(table_view const table, auto const plan = ast_plan(expr_linearizer); // Create ast_plan // Create device buffer - auto const host_data_buffer = plan.get_host_data_buffer(); - auto const buffer_offsets = plan.get_offsets(); + auto const host_data_buffer = plan.host_data_buffer(); + auto const buffer_offsets = plan.offsets(); auto const buffer_size = host_data_buffer.second; auto device_data_buffer = rmm::device_buffer(host_data_buffer.first.get(), buffer_size, stream, mr); From 6ce1addae9538d1ea54b1d838fbe4dbc45d0bada Mon Sep 17 00:00:00 2001 From: Conor Hoekstra Date: Tue, 23 Feb 2021 10:38:37 -0500 Subject: [PATCH 04/18] Calculate offsets once --- cpp/include/cudf/ast/detail/transform.cuh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/include/cudf/ast/detail/transform.cuh b/cpp/include/cudf/ast/detail/transform.cuh index fe20bfcb56b..acb6a253a59 100644 --- a/cpp/include/cudf/ast/detail/transform.cuh +++ b/cpp/include/cudf/ast/detail/transform.cuh @@ -352,8 +352,9 @@ struct ast_plan { { auto const total_size = std::accumulate(_sizes.cbegin(), _sizes.cend(), 0); auto host_data_buffer = std::make_unique(total_size); + auto const offset = offsets(); // calculate once outside for loop for (unsigned int i = 0; i < _data_pointers.size(); ++i) - std::memcpy(host_data_buffer.get() + offsets()[i], _data_pointers[i], _sizes[i]); + std::memcpy(host_data_buffer.get() + offset[i], _data_pointers[i], _sizes[i]); return std::make_pair(std::move(host_data_buffer), total_size); } From ae896e54a5385ac38da51594deb21ade259b4879 Mon Sep 17 00:00:00 2001 From: Conor Hoekstra Date: Tue, 23 Feb 2021 21:35:42 -0500 Subject: [PATCH 05/18] Move more work to AST ctor, currently failing --- cpp/include/cudf/ast/detail/transform.cuh | 51 ++++++++++++++++++++++- cpp/src/ast/transform.cu | 40 ++++-------------- 2 files changed, 58 insertions(+), 33 deletions(-) diff --git a/cpp/include/cudf/ast/detail/transform.cuh b/cpp/include/cudf/ast/detail/transform.cuh index acb6a253a59..963ed4f30b2 100644 --- a/cpp/include/cudf/ast/detail/transform.cuh +++ b/cpp/include/cudf/ast/detail/transform.cuh @@ -317,16 +317,60 @@ __device__ void evaluate_row_expression(detail::row_evaluator const& evaluator, } } +/** + * @brief The AST plan creates a device buffer of data needed to execute an AST. + * + * On construction, an AST plan creates a single "packed" host buffer of all necessary data arrays, + * and copies that to the device with a single host-device memory copy. Because the plan tends to be + * small, this is the most efficient approach for low latency. + * + * TODO: Remove comment below depending on final design + * The stream is not synchronized automatically, so a stream sync must be performed manually (or by + * another function) before the device data can be used safely. + * + */ struct ast_plan { public: - ast_plan(linearizer const& expr_linearizer) : _sizes{}, _data_pointers{} + ast_plan(linearizer const& expr_linearizer, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + : _sizes{}, _data_pointers{} { add_to_plan(expr_linearizer.data_references()); add_to_plan(expr_linearizer.literals()); add_to_plan(expr_linearizer.operators()); add_to_plan(expr_linearizer.operator_source_indices()); + + // Create device buffer + auto const h_data_buffer = host_data_buffer(); + auto const buffer_offsets = offsets(); + auto const buffer_size = h_data_buffer.second; + auto device_data_buffer = + rmm::device_buffer(h_data_buffer.first.get(), buffer_size, stream, mr); + + // To reduce overhead, we don't call a stream sync here. + // The stream is synced later when the table_device_view is created. + // ^^^^ this comment will be removed, we are synchronizing vvvv + stream.synchronize(); // this doesn't seem to work + + // Create device pointers to components of plan + auto const device_data_buffer_ptr = static_cast(device_data_buffer.data()); + _device_data_references = reinterpret_cast( + device_data_buffer_ptr + buffer_offsets[0]); + _device_literals = reinterpret_cast( + device_data_buffer_ptr + buffer_offsets[1]); + _device_operators = + reinterpret_cast(device_data_buffer_ptr + buffer_offsets[2]); + _device_operator_source_indices = + reinterpret_cast(device_data_buffer_ptr + buffer_offsets[3]); } + auto device_data_references() const { return _device_data_references; } + auto device_literals() const { return _device_literals; } + auto device_operators() const { return _device_operators; } + auto device_operator_source_indices() const { return _device_operator_source_indices; } + + private: using buffer_type = std::pair, int>; /** @@ -371,9 +415,12 @@ struct ast_plan { return offsets; } - private: std::vector _sizes; std::vector _data_pointers; + const detail::device_data_reference* _device_data_references; + const cudf::detail::fixed_width_scalar_device_view_base* _device_literals; + const ast_operator* _device_operators; + const cudf::size_type* _device_operator_source_indices; }; /** diff --git a/cpp/src/ast/transform.cu b/cpp/src/ast/transform.cu index e2db17b48ae..de57555ff74 100644 --- a/cpp/src/ast/transform.cu +++ b/cpp/src/ast/transform.cu @@ -73,13 +73,12 @@ __launch_bounds__(max_block_size) __global__ { extern __shared__ std::int64_t intermediate_storage[]; auto thread_intermediate_storage = &intermediate_storage[threadIdx.x * num_intermediates]; - auto const start_idx = cudf::size_type(threadIdx.x + blockIdx.x * blockDim.x); - auto const stride = cudf::size_type(blockDim.x * gridDim.x); - auto const num_rows = table.num_rows(); + auto const start_idx = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + auto const stride = static_cast(blockDim.x * gridDim.x); auto const evaluator = cudf::ast::detail::row_evaluator(table, literals, thread_intermediate_storage, &output_column); - for (cudf::size_type row_index = start_idx; row_index < num_rows; row_index += stride) { + for (cudf::size_type row_index = start_idx; row_index < table.num_rows(); row_index += stride) { evaluate_row_expression( evaluator, data_references, operators, operator_source_indices, num_operators, row_index); } @@ -90,29 +89,8 @@ std::unique_ptr compute_column(table_view const table, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto const expr_linearizer = linearizer(expr, table); // Linearize the AST - auto const plan = ast_plan(expr_linearizer); // Create ast_plan - - // Create device buffer - auto const host_data_buffer = plan.host_data_buffer(); - auto const buffer_offsets = plan.offsets(); - auto const buffer_size = host_data_buffer.second; - auto device_data_buffer = - rmm::device_buffer(host_data_buffer.first.get(), buffer_size, stream, mr); - // To reduce overhead, we don't call a stream sync here. - // The stream is synced later when the table_device_view is created. - - // Create device pointers to components of plan - auto const device_data_buffer_ptr = static_cast(device_data_buffer.data()); - auto const device_data_references = reinterpret_cast( - device_data_buffer_ptr + buffer_offsets[0]); - auto const device_literals = - reinterpret_cast( - device_data_buffer_ptr + buffer_offsets[1]); - auto const device_operators = - reinterpret_cast(device_data_buffer_ptr + buffer_offsets[2]); - auto const device_operator_source_indices = - reinterpret_cast(device_data_buffer_ptr + buffer_offsets[3]); + auto const expr_linearizer = linearizer(expr, table); // Linearize the AST + auto const plan = ast_plan{expr_linearizer, stream, mr}; // Create ast_plan // Create table device view auto table_device = table_device_view::create(table, stream); @@ -145,11 +123,11 @@ std::unique_ptr compute_column(table_view const table, cudf::ast::detail::compute_column_kernel <<>>( *table_device, - device_literals, + plan.device_literals(), *mutable_output_device, - device_data_references, - device_operators, - device_operator_source_indices, + plan.device_data_references(), + plan.device_operators(), + plan.device_operator_source_indices(), num_operators, num_intermediates); CHECK_CUDA(stream.value()); From 393cdb2d6fc7af2dfc54972311f016f2e34cd491 Mon Sep 17 00:00:00 2001 From: Conor Hoekstra Date: Tue, 2 Mar 2021 10:49:19 -0500 Subject: [PATCH 06/18] temp --- cpp/include/cudf/ast/detail/transform.cuh | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/cpp/include/cudf/ast/detail/transform.cuh b/cpp/include/cudf/ast/detail/transform.cuh index 963ed4f30b2..b16cfa963ef 100644 --- a/cpp/include/cudf/ast/detail/transform.cuh +++ b/cpp/include/cudf/ast/detail/transform.cuh @@ -286,8 +286,8 @@ __device__ void evaluate_row_expression(detail::row_evaluator const& evaluator, cudf::size_type num_operators, cudf::size_type row_index) { - auto operator_source_index = cudf::size_type(0); - for (cudf::size_type operator_index(0); operator_index < num_operators; operator_index++) { + auto operator_source_index = static_cast(0); + for (cudf::size_type operator_index = 0; operator_index < num_operators; operator_index++) { // Execute operator auto const op = operators[operator_index]; auto const arity = ast_operator_arity(op); @@ -299,6 +299,7 @@ __device__ void evaluate_row_expression(detail::row_evaluator const& evaluator, type_dispatcher(input.data_type, evaluator, row_index, input, output, op); } else if (arity == 2) { // Binary operator + printf("%i", operator_source_index); auto const lhs = data_references[operator_source_indices[operator_source_index]]; auto const rhs = data_references[operator_source_indices[operator_source_index + 1]]; auto const output = data_references[operator_source_indices[operator_source_index + 2]]; @@ -354,15 +355,15 @@ struct ast_plan { stream.synchronize(); // this doesn't seem to work // Create device pointers to components of plan - auto const device_data_buffer_ptr = static_cast(device_data_buffer.data()); - _device_data_references = reinterpret_cast( - device_data_buffer_ptr + buffer_offsets[0]); + _device_data_buffer_ptr = static_cast(device_data_buffer.data()); + _device_data_references = reinterpret_cast( + _device_data_buffer_ptr + buffer_offsets[0]); _device_literals = reinterpret_cast( - device_data_buffer_ptr + buffer_offsets[1]); + _device_data_buffer_ptr + buffer_offsets[1]); _device_operators = - reinterpret_cast(device_data_buffer_ptr + buffer_offsets[2]); + reinterpret_cast(_device_data_buffer_ptr + buffer_offsets[2]); _device_operator_source_indices = - reinterpret_cast(device_data_buffer_ptr + buffer_offsets[3]); + reinterpret_cast(_device_data_buffer_ptr + buffer_offsets[3]); } auto device_data_references() const { return _device_data_references; } @@ -417,6 +418,7 @@ struct ast_plan { std::vector _sizes; std::vector _data_pointers; + const char* _device_data_buffer_ptr; const detail::device_data_reference* _device_data_references; const cudf::detail::fixed_width_scalar_device_view_base* _device_literals; const ast_operator* _device_operators; From c9315c5f6f0da34d94f92190c09be2e1eb4d4b72 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 16 Apr 2021 18:08:15 -0700 Subject: [PATCH 07/18] Keep device buffer from going out of scope before compute. --- cpp/include/cudf/ast/detail/transform.cuh | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/cpp/include/cudf/ast/detail/transform.cuh b/cpp/include/cudf/ast/detail/transform.cuh index df064ed47ce..cfb6279cbd5 100644 --- a/cpp/include/cudf/ast/detail/transform.cuh +++ b/cpp/include/cudf/ast/detail/transform.cuh @@ -319,7 +319,6 @@ __device__ void evaluate_row_expression(detail::row_evaluator const& evaluator, type_dispatcher(input.data_type, evaluator, row_index, input, output, op); } else if (arity == 2) { // Binary operator - printf("%i", operator_source_index); auto const lhs = data_references[operator_source_indices[operator_source_index]]; auto const rhs = data_references[operator_source_indices[operator_source_index + 1]]; auto const output = data_references[operator_source_indices[operator_source_index + 2]]; @@ -366,8 +365,7 @@ struct ast_plan { auto const h_data_buffer = host_data_buffer(); auto const buffer_offsets = offsets(); auto const buffer_size = h_data_buffer.second; - auto device_data_buffer = - rmm::device_buffer(h_data_buffer.first.get(), buffer_size, stream, mr); + _device_data_buffer = rmm::device_buffer(h_data_buffer.first.get(), buffer_size, stream, mr); // To reduce overhead, we don't call a stream sync here. // The stream is synced later when the table_device_view is created. @@ -375,15 +373,15 @@ struct ast_plan { stream.synchronize(); // this doesn't seem to work // Create device pointers to components of plan - _device_data_buffer_ptr = static_cast(device_data_buffer.data()); - _device_data_references = reinterpret_cast( - _device_data_buffer_ptr + buffer_offsets[0]); + auto device_data_buffer_ptr = static_cast(_device_data_buffer.data()); + _device_data_references = reinterpret_cast( + device_data_buffer_ptr + buffer_offsets[0]); _device_literals = reinterpret_cast( - _device_data_buffer_ptr + buffer_offsets[1]); + device_data_buffer_ptr + buffer_offsets[1]); _device_operators = - reinterpret_cast(_device_data_buffer_ptr + buffer_offsets[2]); + reinterpret_cast(device_data_buffer_ptr + buffer_offsets[2]); _device_operator_source_indices = - reinterpret_cast(_device_data_buffer_ptr + buffer_offsets[3]); + reinterpret_cast(device_data_buffer_ptr + buffer_offsets[3]); } auto device_data_references() const { return _device_data_references; } @@ -437,7 +435,8 @@ struct ast_plan { std::vector _sizes; std::vector _data_pointers; - const char* _device_data_buffer_ptr; + + rmm::device_buffer _device_data_buffer; const detail::device_data_reference* _device_data_references; const cudf::detail::fixed_width_scalar_device_view_base* _device_literals; const ast_operator* _device_operators; From 314358b4e20da7bf67fcc33f15fe39cef8e36a25 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 19 Apr 2021 11:04:40 -0700 Subject: [PATCH 08/18] Clean up comments. --- cpp/include/cudf/ast/detail/transform.cuh | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/cpp/include/cudf/ast/detail/transform.cuh b/cpp/include/cudf/ast/detail/transform.cuh index cfb6279cbd5..84a3d99f03d 100644 --- a/cpp/include/cudf/ast/detail/transform.cuh +++ b/cpp/include/cudf/ast/detail/transform.cuh @@ -344,10 +344,6 @@ __device__ void evaluate_row_expression(detail::row_evaluator const& evaluator, * and copies that to the device with a single host-device memory copy. Because the plan tends to be * small, this is the most efficient approach for low latency. * - * TODO: Remove comment below depending on final design - * The stream is not synchronized automatically, so a stream sync must be performed manually (or by - * another function) before the device data can be used safely. - * */ struct ast_plan { public: @@ -367,10 +363,7 @@ struct ast_plan { auto const buffer_size = h_data_buffer.second; _device_data_buffer = rmm::device_buffer(h_data_buffer.first.get(), buffer_size, stream, mr); - // To reduce overhead, we don't call a stream sync here. - // The stream is synced later when the table_device_view is created. - // ^^^^ this comment will be removed, we are synchronizing vvvv - stream.synchronize(); // this doesn't seem to work + stream.synchronize(); // Create device pointers to components of plan auto device_data_buffer_ptr = static_cast(_device_data_buffer.data()); From 9c004d02ee36eb6bfe14133231df511814ddb44b Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 19 Apr 2021 13:48:24 -0700 Subject: [PATCH 09/18] Inline offset and host buffer creation. --- cpp/include/cudf/ast/detail/transform.cuh | 40 +++++------------------ 1 file changed, 9 insertions(+), 31 deletions(-) diff --git a/cpp/include/cudf/ast/detail/transform.cuh b/cpp/include/cudf/ast/detail/transform.cuh index 84a3d99f03d..2bccbcd50ce 100644 --- a/cpp/include/cudf/ast/detail/transform.cuh +++ b/cpp/include/cudf/ast/detail/transform.cuh @@ -358,10 +358,15 @@ struct ast_plan { add_to_plan(expr_linearizer.operator_source_indices()); // Create device buffer - auto const h_data_buffer = host_data_buffer(); - auto const buffer_offsets = offsets(); - auto const buffer_size = h_data_buffer.second; - _device_data_buffer = rmm::device_buffer(h_data_buffer.first.get(), buffer_size, stream, mr); + auto const buffer_size = std::accumulate(_sizes.cbegin(), _sizes.cend(), 0); + auto buffer_offsets = std::vector(_sizes.size()); + thrust::exclusive_scan(_sizes.cbegin(), _sizes.cend(), buffer_offsets.begin(), 0); + + auto h_data_buffer = std::make_unique(buffer_size); + for (unsigned int i = 0; i < _data_pointers.size(); ++i) + std::memcpy(h_data_buffer.get() + buffer_offsets[i], _data_pointers[i], _sizes[i]); + + _device_data_buffer = rmm::device_buffer(h_data_buffer.get(), buffer_size, stream, mr); stream.synchronize(); @@ -399,33 +404,6 @@ struct ast_plan { _data_pointers.push_back(v.data()); } - /** - * @brief Create and return host buffer - * - * @return `std::pair` containing host buffer and buffer size - */ - buffer_type host_data_buffer() const - { - auto const total_size = std::accumulate(_sizes.cbegin(), _sizes.cend(), 0); - auto host_data_buffer = std::make_unique(total_size); - auto const offset = offsets(); // calculate once outside for loop - for (unsigned int i = 0; i < _data_pointers.size(); ++i) - std::memcpy(host_data_buffer.get() + offset[i], _data_pointers[i], _sizes[i]); - return std::make_pair(std::move(host_data_buffer), total_size); - } - - /** - * @brief Returns a `std::vector` of offsets into `data_pointers` - * - * @return `std::vector` of offsets into `data_pointers` - */ - std::vector offsets() const - { - auto offsets = std::vector(_sizes.size()); - thrust::exclusive_scan(_sizes.cbegin(), _sizes.cend(), offsets.begin(), 0); - return offsets; - } - std::vector _sizes; std::vector _data_pointers; From 253477357336ff91da0b8e3e690c2bec59a6dea2 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 19 Apr 2021 15:16:03 -0700 Subject: [PATCH 10/18] Remove preprocessor elided code. --- cpp/include/cudf/ast/detail/operators.hpp | 47 ----------------------- 1 file changed, 47 deletions(-) diff --git a/cpp/include/cudf/ast/detail/operators.hpp b/cpp/include/cudf/ast/detail/operators.hpp index 27bcb0d320b..8ae60f96997 100644 --- a/cpp/include/cudf/ast/detail/operators.hpp +++ b/cpp/include/cudf/ast/detail/operators.hpp @@ -753,43 +753,6 @@ struct operator_functor { } }; -#if 0 -/** - * @brief Functor used to double-type-dispatch binary operators. - * - * This functor's `operator()` is templated to validate calls to its operators based on the input - * type, as determined by the `is_valid_binary_op` trait. - * - * @tparam OperatorFunctor Binary operator functor. - */ -template -struct double_dispatch_binary_operator_types { - template >* = nullptr> - CUDA_HOST_DEVICE_CALLABLE void operator()(F&& f, Ts&&... args) - { - f.template operator()(std::forward(args)...); - } - - template >* = nullptr> - CUDA_HOST_DEVICE_CALLABLE void operator()(F&& f, Ts&&... args) - { -#ifndef __CUDA_ARCH__ - CUDF_FAIL("Invalid binary operation."); -#else - cudf_assert(false && "Invalid binary operation."); -#endif - } -}; -#endif - /** * @brief Functor used to single-type-dispatch binary operators. * @@ -856,16 +819,6 @@ struct type_dispatch_binary_op { F&& f, Ts&&... args) { -#if 0 - // Double dispatch - /* - double_type_dispatcher(lhs_type, - rhs_type, - detail::double_dispatch_binary_operator_types>{}, - std::forward(f), - std::forward(args)...); - */ -#endif // Single dispatch (assume lhs_type == rhs_type) type_dispatcher(lhs_type, detail::single_dispatch_binary_operator_types>{}, From f924671b4b6053e83b38e1b750a3b0da2b754ded Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 21 Apr 2021 11:00:02 -0700 Subject: [PATCH 11/18] Remove unnecessary forward declarations and friend relationships between nodes and linearizer. --- cpp/include/cudf/ast/detail/linearizer.hpp | 9 +-------- cpp/include/cudf/ast/linearizer.hpp | 17 +++-------------- 2 files changed, 4 insertions(+), 22 deletions(-) diff --git a/cpp/include/cudf/ast/detail/linearizer.hpp b/cpp/include/cudf/ast/detail/linearizer.hpp index 68319a24e5d..166a0408703 100644 --- a/cpp/include/cudf/ast/detail/linearizer.hpp +++ b/cpp/include/cudf/ast/detail/linearizer.hpp @@ -83,10 +83,7 @@ class linearizer; * This class is a part of a "visitor" pattern with the `linearizer` class. * Nodes inheriting from this class can accept visitors. */ -class node { - friend class detail::linearizer; - - private: +struct node { virtual cudf::size_type accept(detail::linearizer& visitor) const = 0; }; @@ -102,10 +99,6 @@ class node { * resolved into intermediate data storage in shared memory. */ class linearizer { - friend class literal; - friend class column_reference; - friend class expression; - public: /** * @brief Construct a new linearizer object diff --git a/cpp/include/cudf/ast/linearizer.hpp b/cpp/include/cudf/ast/linearizer.hpp index e5ccb2e8069..9f686f9116d 100644 --- a/cpp/include/cudf/ast/linearizer.hpp +++ b/cpp/include/cudf/ast/linearizer.hpp @@ -38,17 +38,10 @@ enum class table_reference { OUTPUT // Column index in the output table }; -// Forward declaration -class literal; -class column_reference; -class expression; - /** * @brief A literal value used in an abstract syntax tree. */ class literal : public detail::node { - friend class detail::linearizer; - public: /** * @brief Construct a new literal object. @@ -90,7 +83,6 @@ class literal : public detail::node { */ cudf::data_type get_data_type() const { return get_value().type(); } - private: /** * @brief Get the value object. * @@ -106,6 +98,7 @@ class literal : public detail::node { */ cudf::size_type accept(detail::linearizer& visitor) const override; + private: const cudf::detail::fixed_width_scalar_device_view_base value; }; @@ -113,8 +106,6 @@ class literal : public detail::node { * @brief A node referring to data from a column in a table. */ class column_reference : public detail::node { - friend class detail::linearizer; - public: /** * @brief Construct a new column reference object @@ -175,7 +166,6 @@ class column_reference : public detail::node { return table.column(get_column_index()).type(); } - private: /** * @brief Accepts a visitor class. * @@ -184,6 +174,7 @@ class column_reference : public detail::node { */ cudf::size_type accept(detail::linearizer& visitor) const override; + private: cudf::size_type column_index; table_reference table_source; }; @@ -192,8 +183,6 @@ class column_reference : public detail::node { * @brief An expression node holds an operator and zero or more operands. */ class expression : public detail::node { - friend class detail::linearizer; - public: /** * @brief Construct a new unary expression object. @@ -256,7 +245,6 @@ class expression : public detail::node { */ std::vector> get_operands() const { return operands; } - private: /** * @brief Accepts a visitor class. * @@ -265,6 +253,7 @@ class expression : public detail::node { */ cudf::size_type accept(detail::linearizer& visitor) const override; + private: const ast_operator op; const std::vector> operands; }; From 459eae1d4d03ee1dc8792c20fbef02ebee2146d7 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 21 Apr 2021 11:04:52 -0700 Subject: [PATCH 12/18] Add more informative comment for why r-value expression constructors are deleted. --- cpp/include/cudf/ast/linearizer.hpp | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) diff --git a/cpp/include/cudf/ast/linearizer.hpp b/cpp/include/cudf/ast/linearizer.hpp index 9f686f9116d..70dda58816e 100644 --- a/cpp/include/cudf/ast/linearizer.hpp +++ b/cpp/include/cudf/ast/linearizer.hpp @@ -197,11 +197,6 @@ class expression : public detail::node { } } - /** - * @brief `expression` doesn't accept r-value references for expression nodes - */ - expression(ast_operator op, node&& input) = delete; - /** * @brief Construct a new binary expression object. * @@ -216,19 +211,11 @@ class expression : public detail::node { } } - /** - * @brief `expression` doesn't accept r-value references for expression nodes - */ - expression(ast_operator op, node&& left, node&& right) = delete; - - /** - * @brief `expression` doesn't accept r-value references for expression nodes - */ + // expression only stores references to nodes, so it does not accept r-value + // references: the calling code must own the nodes. + expression(ast_operator op, node&& input) = delete; + expression(ast_operator op, node&& left, node&& right) = delete; expression(ast_operator op, node&& left, node const& right) = delete; - - /** - * @brief `expression` doesn't accept r-value references for expression nodes - */ expression(ast_operator op, node const& left, node&& right) = delete; /** From 33bf98d854727698e3675dec7b3c4a4212680542 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 21 Apr 2021 11:08:57 -0700 Subject: [PATCH 13/18] Move linearizer.hpp to nodes.hpp. --- cpp/include/cudf/ast/detail/transform.cuh | 2 +- cpp/include/cudf/ast/{linearizer.hpp => nodes.hpp} | 0 cpp/include/cudf/ast/transform.hpp | 2 +- cpp/src/ast/linearizer.cpp | 2 +- cpp/src/ast/transform.cu | 2 +- 5 files changed, 4 insertions(+), 4 deletions(-) rename cpp/include/cudf/ast/{linearizer.hpp => nodes.hpp} (100%) diff --git a/cpp/include/cudf/ast/detail/transform.cuh b/cpp/include/cudf/ast/detail/transform.cuh index 2bccbcd50ce..d03cef2190b 100644 --- a/cpp/include/cudf/ast/detail/transform.cuh +++ b/cpp/include/cudf/ast/detail/transform.cuh @@ -17,7 +17,7 @@ #include #include -#include +#include #include #include #include diff --git a/cpp/include/cudf/ast/linearizer.hpp b/cpp/include/cudf/ast/nodes.hpp similarity index 100% rename from cpp/include/cudf/ast/linearizer.hpp rename to cpp/include/cudf/ast/nodes.hpp diff --git a/cpp/include/cudf/ast/transform.hpp b/cpp/include/cudf/ast/transform.hpp index 513f92ea251..59697e5f75c 100644 --- a/cpp/include/cudf/ast/transform.hpp +++ b/cpp/include/cudf/ast/transform.hpp @@ -15,7 +15,7 @@ */ #pragma once -#include +#include #include namespace cudf { diff --git a/cpp/src/ast/linearizer.cpp b/cpp/src/ast/linearizer.cpp index cc70845e1ff..66a32ead35e 100644 --- a/cpp/src/ast/linearizer.cpp +++ b/cpp/src/ast/linearizer.cpp @@ -14,7 +14,7 @@ * limitations under the License. */ #include -#include +#include #include #include #include diff --git a/cpp/src/ast/transform.cu b/cpp/src/ast/transform.cu index de57555ff74..ac056db53d9 100644 --- a/cpp/src/ast/transform.cu +++ b/cpp/src/ast/transform.cu @@ -15,7 +15,7 @@ */ #include -#include +#include #include #include #include From 8781a3b5c8ae6da7ca2e6c29b3e351f2da97e01b Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 21 Apr 2021 11:15:07 -0700 Subject: [PATCH 14/18] Make ast plan internals public and use them directly. --- cpp/include/cudf/ast/detail/transform.cuh | 9 --------- cpp/src/ast/transform.cu | 8 ++++---- 2 files changed, 4 insertions(+), 13 deletions(-) diff --git a/cpp/include/cudf/ast/detail/transform.cuh b/cpp/include/cudf/ast/detail/transform.cuh index d03cef2190b..5354ba52658 100644 --- a/cpp/include/cudf/ast/detail/transform.cuh +++ b/cpp/include/cudf/ast/detail/transform.cuh @@ -346,7 +346,6 @@ __device__ void evaluate_row_expression(detail::row_evaluator const& evaluator, * */ struct ast_plan { - public: ast_plan(linearizer const& expr_linearizer, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) @@ -382,14 +381,6 @@ struct ast_plan { reinterpret_cast(device_data_buffer_ptr + buffer_offsets[3]); } - auto device_data_references() const { return _device_data_references; } - auto device_literals() const { return _device_literals; } - auto device_operators() const { return _device_operators; } - auto device_operator_source_indices() const { return _device_operator_source_indices; } - - private: - using buffer_type = std::pair, int>; - /** * @brief Helper function for adding components (operators, literals, etc) to AST plan * diff --git a/cpp/src/ast/transform.cu b/cpp/src/ast/transform.cu index ac056db53d9..5c94c89f462 100644 --- a/cpp/src/ast/transform.cu +++ b/cpp/src/ast/transform.cu @@ -123,11 +123,11 @@ std::unique_ptr compute_column(table_view const table, cudf::ast::detail::compute_column_kernel <<>>( *table_device, - plan.device_literals(), + plan._device_literals, *mutable_output_device, - plan.device_data_references(), - plan.device_operators(), - plan.device_operator_source_indices(), + plan._device_data_references, + plan._device_operators, + plan._device_operator_source_indices, num_operators, num_intermediates); CHECK_CUDA(stream.value()); From 4ed23dd03e5c7752e623b30004b97a82bf0aca6b Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 21 Apr 2021 12:26:33 -0700 Subject: [PATCH 15/18] Apply rename in meta.yaml. --- conda/recipes/libcudf/meta.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 75955428eab..00dc36ca823 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -55,7 +55,7 @@ test: - test -f $PREFIX/include/cudf/ast/transform.hpp - test -f $PREFIX/include/cudf/ast/detail/linearizer.hpp - test -f $PREFIX/include/cudf/ast/detail/operators.hpp - - test -f $PREFIX/include/cudf/ast/linearizer.hpp + - test -f $PREFIX/include/cudf/ast/nodes.hpp - test -f $PREFIX/include/cudf/ast/operators.hpp - test -f $PREFIX/include/cudf/binaryop.hpp - test -f $PREFIX/include/cudf/labeling/label_bins.hpp From f0a5c6687d0d26f624fbded8757d66ca4ef92a88 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 3 May 2021 10:34:38 -0700 Subject: [PATCH 16/18] Change raw pointers in ast_plan to span. --- cpp/include/cudf/ast/detail/transform.cuh | 30 ++++++++++++++--------- cpp/src/ast/transform.cu | 8 +++--- 2 files changed, 22 insertions(+), 16 deletions(-) diff --git a/cpp/include/cudf/ast/detail/transform.cuh b/cpp/include/cudf/ast/detail/transform.cuh index 5354ba52658..20c9e2953ed 100644 --- a/cpp/include/cudf/ast/detail/transform.cuh +++ b/cpp/include/cudf/ast/detail/transform.cuh @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -371,14 +372,19 @@ struct ast_plan { // Create device pointers to components of plan auto device_data_buffer_ptr = static_cast(_device_data_buffer.data()); - _device_data_references = reinterpret_cast( - device_data_buffer_ptr + buffer_offsets[0]); - _device_literals = reinterpret_cast( - device_data_buffer_ptr + buffer_offsets[1]); - _device_operators = - reinterpret_cast(device_data_buffer_ptr + buffer_offsets[2]); - _device_operator_source_indices = - reinterpret_cast(device_data_buffer_ptr + buffer_offsets[3]); + _device_data_references = device_span( + reinterpret_cast(device_data_buffer_ptr + + buffer_offsets[0]), + _sizes[0]); + _device_literals = device_span( + reinterpret_cast( + device_data_buffer_ptr + buffer_offsets[1]), + _sizes[1]); + _device_operators = device_span( + reinterpret_cast(device_data_buffer_ptr + buffer_offsets[2]), _sizes[2]); + _device_operator_source_indices = device_span( + reinterpret_cast(device_data_buffer_ptr + buffer_offsets[3]), + _sizes[3]); } /** @@ -399,10 +405,10 @@ struct ast_plan { std::vector _data_pointers; rmm::device_buffer _device_data_buffer; - const detail::device_data_reference* _device_data_references; - const cudf::detail::fixed_width_scalar_device_view_base* _device_literals; - const ast_operator* _device_operators; - const cudf::size_type* _device_operator_source_indices; + device_span _device_data_references; + device_span _device_literals; + device_span _device_operators; + device_span _device_operator_source_indices; }; /** diff --git a/cpp/src/ast/transform.cu b/cpp/src/ast/transform.cu index 5c94c89f462..1577ae925bc 100644 --- a/cpp/src/ast/transform.cu +++ b/cpp/src/ast/transform.cu @@ -123,11 +123,11 @@ std::unique_ptr compute_column(table_view const table, cudf::ast::detail::compute_column_kernel <<>>( *table_device, - plan._device_literals, + plan._device_literals.data(), *mutable_output_device, - plan._device_data_references, - plan._device_operators, - plan._device_operator_source_indices, + plan._device_data_references.data(), + plan._device_operators.data(), + plan._device_operator_source_indices.data(), num_operators, num_intermediates); CHECK_CUDA(stream.value()); From d2a5e8b651501bde1bec624c1d49b38b698eddde Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 3 May 2021 10:42:35 -0700 Subject: [PATCH 17/18] Pass the spans through the code. --- cpp/include/cudf/ast/detail/transform.cuh | 24 +++++++++++---------- cpp/src/ast/transform.cu | 26 +++++++++++------------ 2 files changed, 26 insertions(+), 24 deletions(-) diff --git a/cpp/include/cudf/ast/detail/transform.cuh b/cpp/include/cudf/ast/detail/transform.cuh index 20c9e2953ed..f427d9b52c9 100644 --- a/cpp/include/cudf/ast/detail/transform.cuh +++ b/cpp/include/cudf/ast/detail/transform.cuh @@ -157,10 +157,11 @@ struct row_evaluator { * storing intermediates. * @param output_column The output column where results are stored. */ - __device__ row_evaluator(table_device_view const& table, - const cudf::detail::fixed_width_scalar_device_view_base* literals, - std::int64_t* thread_intermediate_storage, - mutable_column_device_view* output_column) + __device__ row_evaluator( + table_device_view const& table, + device_span literals, + std::int64_t* thread_intermediate_storage, + mutable_column_device_view* output_column) : table(table), literals(literals), thread_intermediate_storage(thread_intermediate_storage), @@ -266,7 +267,7 @@ struct row_evaluator { private: table_device_view const& table; - const cudf::detail::fixed_width_scalar_device_view_base* literals; + device_span literals; std::int64_t* thread_intermediate_storage; mutable_column_device_view* output_column; }; @@ -300,12 +301,13 @@ __device__ void row_output::resolve_output(detail::device_data_reference device_ * @param num_operators Number of operators. * @param row_index Row index of data column(s). */ -__device__ void evaluate_row_expression(detail::row_evaluator const& evaluator, - const detail::device_data_reference* data_references, - const ast_operator* operators, - const cudf::size_type* operator_source_indices, - cudf::size_type num_operators, - cudf::size_type row_index) +__device__ void evaluate_row_expression( + detail::row_evaluator const& evaluator, + device_span data_references, + device_span operators, + device_span operator_source_indices, + cudf::size_type num_operators, + cudf::size_type row_index) { auto operator_source_index = static_cast(0); for (cudf::size_type operator_index = 0; operator_index < num_operators; operator_index++) { diff --git a/cpp/src/ast/transform.cu b/cpp/src/ast/transform.cu index 1577ae925bc..c812e773c7d 100644 --- a/cpp/src/ast/transform.cu +++ b/cpp/src/ast/transform.cu @@ -61,15 +61,15 @@ namespace detail { * each thread. */ template -__launch_bounds__(max_block_size) __global__ - void compute_column_kernel(table_device_view const table, - const cudf::detail::fixed_width_scalar_device_view_base* literals, - mutable_column_device_view output_column, - const detail::device_data_reference* data_references, - const ast_operator* operators, - const cudf::size_type* operator_source_indices, - cudf::size_type num_operators, - cudf::size_type num_intermediates) +__launch_bounds__(max_block_size) __global__ void compute_column_kernel( + table_device_view const table, + device_span literals, + mutable_column_device_view output_column, + device_span data_references, + device_span operators, + device_span operator_source_indices, + cudf::size_type num_operators, + cudf::size_type num_intermediates) { extern __shared__ std::int64_t intermediate_storage[]; auto thread_intermediate_storage = &intermediate_storage[threadIdx.x * num_intermediates]; @@ -123,11 +123,11 @@ std::unique_ptr compute_column(table_view const table, cudf::ast::detail::compute_column_kernel <<>>( *table_device, - plan._device_literals.data(), + plan._device_literals, *mutable_output_device, - plan._device_data_references.data(), - plan._device_operators.data(), - plan._device_operator_source_indices.data(), + plan._device_data_references, + plan._device_operators, + plan._device_operator_source_indices, num_operators, num_intermediates); CHECK_CUDA(stream.value()); From 98b219a4f9105c914d2f69e11ff2c7c83156b12f Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 5 May 2021 17:00:35 -0700 Subject: [PATCH 18/18] Address PR comments. --- cpp/include/cudf/ast/detail/transform.cuh | 15 ++++++++------- cpp/src/ast/transform.cu | 5 +---- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/cpp/include/cudf/ast/detail/transform.cuh b/cpp/include/cudf/ast/detail/transform.cuh index f427d9b52c9..f69927a3601 100644 --- a/cpp/include/cudf/ast/detail/transform.cuh +++ b/cpp/include/cudf/ast/detail/transform.cuh @@ -306,11 +306,10 @@ __device__ void evaluate_row_expression( device_span data_references, device_span operators, device_span operator_source_indices, - cudf::size_type num_operators, cudf::size_type row_index) { auto operator_source_index = static_cast(0); - for (cudf::size_type operator_index = 0; operator_index < num_operators; operator_index++) { + for (cudf::size_type operator_index = 0; operator_index < operators.size(); operator_index++) { // Execute operator auto const op = operators[operator_index]; auto const arity = ast_operator_arity(op); @@ -365,8 +364,9 @@ struct ast_plan { thrust::exclusive_scan(_sizes.cbegin(), _sizes.cend(), buffer_offsets.begin(), 0); auto h_data_buffer = std::make_unique(buffer_size); - for (unsigned int i = 0; i < _data_pointers.size(); ++i) + for (unsigned int i = 0; i < _data_pointers.size(); ++i) { std::memcpy(h_data_buffer.get() + buffer_offsets[i], _data_pointers[i], _sizes[i]); + } _device_data_buffer = rmm::device_buffer(h_data_buffer.get(), buffer_size, stream, mr); @@ -377,16 +377,17 @@ struct ast_plan { _device_data_references = device_span( reinterpret_cast(device_data_buffer_ptr + buffer_offsets[0]), - _sizes[0]); + expr_linearizer.data_references().size()); _device_literals = device_span( reinterpret_cast( device_data_buffer_ptr + buffer_offsets[1]), - _sizes[1]); + expr_linearizer.literals().size()); _device_operators = device_span( - reinterpret_cast(device_data_buffer_ptr + buffer_offsets[2]), _sizes[2]); + reinterpret_cast(device_data_buffer_ptr + buffer_offsets[2]), + expr_linearizer.operators().size()); _device_operator_source_indices = device_span( reinterpret_cast(device_data_buffer_ptr + buffer_offsets[3]), - _sizes[3]); + expr_linearizer.operator_source_indices().size()); } /** diff --git a/cpp/src/ast/transform.cu b/cpp/src/ast/transform.cu index c812e773c7d..43d3bde97c2 100644 --- a/cpp/src/ast/transform.cu +++ b/cpp/src/ast/transform.cu @@ -68,7 +68,6 @@ __launch_bounds__(max_block_size) __global__ void compute_column_kernel( device_span data_references, device_span operators, device_span operator_source_indices, - cudf::size_type num_operators, cudf::size_type num_intermediates) { extern __shared__ std::int64_t intermediate_storage[]; @@ -80,7 +79,7 @@ __launch_bounds__(max_block_size) __global__ void compute_column_kernel( for (cudf::size_type row_index = start_idx; row_index < table.num_rows(); row_index += stride) { evaluate_row_expression( - evaluator, data_references, operators, operator_source_indices, num_operators, row_index); + evaluator, data_references, operators, operator_source_indices, row_index); } } @@ -117,7 +116,6 @@ std::unique_ptr compute_column(table_view const table, : MAX_BLOCK_SIZE; auto const config = cudf::detail::grid_1d{table_num_rows, block_size}; auto const shmem_size_per_block = shmem_size_per_thread * config.num_threads_per_block; - auto const num_operators = static_cast(expr_linearizer.operators().size()); // Execute the kernel cudf::ast::detail::compute_column_kernel @@ -128,7 +126,6 @@ std::unique_ptr compute_column(table_view const table, plan._device_data_references, plan._device_operators, plan._device_operator_source_indices, - num_operators, num_intermediates); CHECK_CUDA(stream.value()); return output_column;