diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 6c4175a2539..75bfe6c34bc 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -52,7 +52,7 @@ test: - test -f $PREFIX/lib/libcudftestutil.a - test -f $PREFIX/include/cudf/aggregation.hpp - test -f $PREFIX/include/cudf/ast/transform.hpp - - test -f $PREFIX/include/cudf/ast/detail/linearizer.hpp + - test -f $PREFIX/include/cudf/ast/detail/expression_parser.hpp - test -f $PREFIX/include/cudf/ast/detail/operators.hpp - test -f $PREFIX/include/cudf/ast/nodes.hpp - test -f $PREFIX/include/cudf/ast/operators.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 5c05a58b448..90c17067b55 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -151,7 +151,7 @@ add_library(cudf src/aggregation/aggregation.cpp src/aggregation/aggregation.cu src/aggregation/result_cache.cpp - src/ast/linearizer.cpp + src/ast/expression_parser.cpp src/ast/transform.cu src/binaryop/binaryop.cpp src/binaryop/compiled/binary_ops.cu diff --git a/cpp/include/cudf/ast/detail/expression_parser.hpp b/cpp/include/cudf/ast/detail/expression_parser.hpp new file mode 100644 index 00000000000..db8845825c5 --- /dev/null +++ b/cpp/include/cudf/ast/detail/expression_parser.hpp @@ -0,0 +1,339 @@ +/* + * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include + +#include + +#include + +namespace cudf { +namespace ast { +namespace detail { + +/** + * @brief Enum defining data reference types used by a node. + * + * This enum is device-specific. For instance, intermediate data references are generated by the + * linearization process but cannot be explicitly created by the user. + */ +enum class device_data_reference_type { + COLUMN, // A value in a table column + LITERAL, // A literal value + INTERMEDIATE // An internal temporary value +}; + +/** + * @brief A device data reference describes a source of data used by a node. + * + * This is a POD class used to create references describing data type and locations for consumption + * by the `row_evaluator`. 
+ */ +struct alignas(8) device_data_reference { + device_data_reference(device_data_reference_type reference_type, + cudf::data_type data_type, + cudf::size_type data_index, + table_reference table_source); + + device_data_reference(device_data_reference_type reference_type, + cudf::data_type data_type, + cudf::size_type data_index); + + device_data_reference_type const reference_type; // Source of data + cudf::data_type const data_type; // Type of data + cudf::size_type const data_index; // The column index of a table, index of a + // literal, or index of an intermediate + table_reference const table_source; + + bool operator==(device_data_reference const& rhs) const + { + return std::tie(data_index, reference_type, table_source) == + std::tie(rhs.data_index, rhs.reference_type, rhs.table_source); + } +}; + +// Type trait for wrapping nullable types in a thrust::optional. Non-nullable +// types are returned as is. +template +struct possibly_null_value; + +template +struct possibly_null_value { + using type = thrust::optional; +}; + +template +struct possibly_null_value { + using type = T; +}; + +template +using possibly_null_value_t = typename possibly_null_value::type; + +// Type used for intermediate storage in expression evaluation. +template +using IntermediateDataType = possibly_null_value_t; + +/** + * @brief A container of all device data required to evaluate an expression on tables. + * + * This struct should never be instantiated directly. It is created by the + * `expression_parser` on construction, and the resulting member is publicly accessible + * for passing to kernels for constructing an `expression_evaluator`. + * + */ +struct expression_device_view { + device_span data_references; + device_span literals; + device_span operators; + device_span operator_source_indices; + cudf::size_type num_intermediates; + int shmem_per_thread; +}; + +/** + * @brief The expression_parser traverses an expression and converts it into a form suitable for + * execution on the device. + * + * This class is part of a "visitor" pattern with the `node` class. + * + * This class does pre-processing work on the host, validating operators and operand data types. It + * traverses downward from a root node in a depth-first fashion, capturing information about + * the nodes and constructing vectors of information that are later used by the device for + * evaluating the abstract syntax tree as a "linear" list of operators whose input dependencies are + * resolved into intermediate data storage in shared memory. + */ +class expression_parser { + public: + /** + * @brief Construct a new expression_parser object + * + * @param expr The expression to create an evaluable expression_parser for. + * @param left The left table used for evaluating the abstract syntax tree. + * @param right The right table used for evaluating the abstract syntax tree. + */ + expression_parser(node const& expr, + cudf::table_view left, + cudf::table_view right, + bool has_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + : _left{left}, _right{right}, _node_count{0}, _intermediate_counter{}, _has_nulls(has_nulls) + { + expr.accept(*this); + move_to_device(stream, mr); + } + + /** + * @brief Construct a new expression_parser object + * + * @param expr The expression to create an evaluable expression_parser for. + * @param table The table used for evaluating the abstract syntax tree. 
+ */ + expression_parser(node const& expr, + cudf::table_view table, + bool has_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + : expression_parser(expr, table, table, has_nulls, stream, mr) + { + } + + /** + * @brief Get the root data type of the abstract syntax tree. + * + * @return cudf::data_type + */ + cudf::data_type output_type() const; + + /** + * @brief Visit a literal node. + * + * @param expr Literal node. + * @return cudf::size_type Index of device data reference for the node. + */ + cudf::size_type visit(literal const& expr); + + /** + * @brief Visit a column reference node. + * + * @param expr Column reference node. + * @return cudf::size_type Index of device data reference for the node. + */ + cudf::size_type visit(column_reference const& expr); + + /** + * @brief Visit an expression node. + * + * @param expr Expression node. + * @return cudf::size_type Index of device data reference for the node. + */ + cudf::size_type visit(expression const& expr); + + /** + * @brief Internal class used to track the utilization of intermediate storage locations. + * + * As nodes are being evaluated, they may generate "intermediate" data that is immediately + * consumed. Rather than manifesting this data in global memory, we can store intermediates of any + * fixed width type (up to 8 bytes) by placing them in shared memory. This class helps to track + * the number and indices of intermediate data in shared memory using a give-take model. Locations + * in shared memory can be "taken" and used for storage, "given back," and then later re-used. + * This aims to minimize the maximum amount of shared memory needed at any point during the + * evaluation. + * + */ + class intermediate_counter { + public: + intermediate_counter() : used_values(), max_used(0) {} + cudf::size_type take(); + void give(cudf::size_type value); + cudf::size_type get_max_used() const { return max_used; } + + private: + /** + * @brief Find the first missing value in a contiguous sequence of integers. + * + * From a sorted container of integers, find the first "missing" value. + * For example, {0, 1, 2, 4, 5} is missing 3, and {1, 2, 3} is missing 0. + * If there are no missing values, return the size of the container. + * + * @return cudf::size_type Smallest value not already in the container. + */ + cudf::size_type find_first_missing() const; + + std::vector used_values; + cudf::size_type max_used; + }; + + expression_device_view device_expression_data; ///< The collection of data required to evaluate + ///< the expression on the device. + + private: + /** + * @brief Helper function for adding components (operators, literals, etc) to AST plan + * + * @tparam T The underlying type of the input `std::vector` + * @param[in] v The `std::vector` containing components (operators, literals, etc). + * @param[in,out] sizes The `std::vector` containing the size of each data buffer. + * @param[in,out] data_pointers The `std::vector` containing pointers to each data buffer. 
+ */ + template + void extract_size_and_pointer(std::vector const& v, + std::vector& sizes, + std::vector& data_pointers) + { + auto const data_size = sizeof(T) * v.size(); + sizes.push_back(data_size); + data_pointers.push_back(v.data()); + } + + void move_to_device(rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) + { + std::vector sizes; + std::vector data_pointers; + + extract_size_and_pointer(_data_references, sizes, data_pointers); + extract_size_and_pointer(_literals, sizes, data_pointers); + extract_size_and_pointer(_operators, sizes, data_pointers); + extract_size_and_pointer(_operator_source_indices, sizes, data_pointers); + + // Create device buffer + auto const buffer_size = std::accumulate(sizes.cbegin(), sizes.cend(), 0); + auto buffer_offsets = std::vector(sizes.size()); + thrust::exclusive_scan(sizes.cbegin(), sizes.cend(), buffer_offsets.begin(), 0); + + auto h_data_buffer = std::vector(buffer_size); + for (unsigned int i = 0; i < data_pointers.size(); ++i) { + std::memcpy(h_data_buffer.data() + buffer_offsets[i], data_pointers[i], sizes[i]); + } + + _device_data_buffer = rmm::device_buffer(h_data_buffer.data(), buffer_size, stream, mr); + + stream.synchronize(); + + // Create device pointers to components of plan + auto device_data_buffer_ptr = static_cast(_device_data_buffer.data()); + device_expression_data.data_references = device_span( + reinterpret_cast(device_data_buffer_ptr + + buffer_offsets[0]), + _data_references.size()); + device_expression_data.literals = + device_span( + reinterpret_cast( + device_data_buffer_ptr + buffer_offsets[1]), + _literals.size()); + device_expression_data.operators = device_span( + reinterpret_cast(device_data_buffer_ptr + buffer_offsets[2]), + _operators.size()); + device_expression_data.operator_source_indices = device_span( + reinterpret_cast(device_data_buffer_ptr + buffer_offsets[3]), + _operator_source_indices.size()); + device_expression_data.num_intermediates = _intermediate_counter.get_max_used(); + device_expression_data.shmem_per_thread = static_cast( + (_has_nulls ? sizeof(IntermediateDataType) : sizeof(IntermediateDataType)) * + device_expression_data.num_intermediates); + } + + /** + * @brief Helper function for recursive traversal of expressions. + * + * When parsing an expression composed of subexpressions, all subexpressions + * must be evaluated before an operator can be applied to them. This method + * performs that recursive traversal (in conjunction with the + * `expression_parser.visit` and `expression.accept` methods if necessary to + * descend deeper into an expression tree). + * + * @param operands The operands to visit. + * + * @return The indices of the operands stored in the data references. + */ + std::vector visit_operands( + std::vector> operands); + + /** + * @brief Add a data reference to the internal list. + * + * @param data_ref The data reference to add. + * + * @return The index of the added data reference in the internal data references list. + */ + cudf::size_type add_data_reference(detail::device_data_reference data_ref); + + rmm::device_buffer + _device_data_buffer; ///< The device-side data buffer containing the plan information, which is + ///< owned by this class and persists until it is destroyed. 
+ + cudf::table_view const& _left; + cudf::table_view const& _right; + cudf::size_type _node_count; + intermediate_counter _intermediate_counter; + bool _has_nulls; + std::vector _data_references; + std::vector _operators; + std::vector _operator_source_indices; + std::vector _literals; +}; + +} // namespace detail + +} // namespace ast + +} // namespace cudf diff --git a/cpp/include/cudf/ast/detail/linearizer.hpp b/cpp/include/cudf/ast/detail/linearizer.hpp deleted file mode 100644 index 59eda1df7b7..00000000000 --- a/cpp/include/cudf/ast/detail/linearizer.hpp +++ /dev/null @@ -1,246 +0,0 @@ -/* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include -#include -#include - -namespace cudf { -namespace ast { - -// Forward declaration -enum class table_reference; -class literal; -class column_reference; -class expression; - -namespace detail { - -/** - * @brief Enum defining data reference types used by a node. - * - * This enum is device-specific. For instance, intermediate data references are generated by the - * linearization process but cannot be explicitly created by the user. - */ -enum class device_data_reference_type { - COLUMN, // A value in a table column - LITERAL, // A literal value - INTERMEDIATE // An internal temporary value -}; - -/** - * @brief A device data reference describes a source of data used by a node. - * - * This is a POD class used to create references describing data type and locations for consumption - * by the `row_evaluator`. - */ -struct alignas(8) device_data_reference { - device_data_reference(device_data_reference_type reference_type, - cudf::data_type data_type, - cudf::size_type data_index, - table_reference table_source); - - device_data_reference(device_data_reference_type reference_type, - cudf::data_type data_type, - cudf::size_type data_index); - - const device_data_reference_type reference_type; // Source of data - const cudf::data_type data_type; // Type of data - const cudf::size_type data_index; // The column index of a table, index of a - // literal, or index of an intermediate - const table_reference table_source; - - inline bool operator==(const device_data_reference& rhs) const - { - return std::tie(data_index, reference_type, table_source) == - std::tie(rhs.data_index, rhs.reference_type, rhs.table_source); - } -}; - -// Forward declaration -class linearizer; - -/** - * @brief A generic node that can be evaluated to return a value. - * - * This class is a part of a "visitor" pattern with the `linearizer` class. - * Nodes inheriting from this class can accept visitors. - */ -struct node { - virtual cudf::size_type accept(detail::linearizer& visitor) const = 0; -}; - -/** - * @brief The linearizer traverses an abstract syntax tree to prepare for execution on the device. - * - * This class is part of a "visitor" pattern with the `node` class. - * - * This class does pre-processing work on the host, validating operators and operand data types. 
It - * traverses downward from a root node in a depth-first fashion, capturing information about - * the nodes and constructing vectors of information that are later used by the device for - * evaluating the abstract syntax tree as a "linear" list of operators whose input dependencies are - * resolved into intermediate data storage in shared memory. - */ -class linearizer { - public: - /** - * @brief Construct a new linearizer object - * - * @param expr The expression to create an evaluable linearizer for. - * @param left The left table used for evaluating the abstract syntax tree. - * @param right The right table used for evaluating the abstract syntax tree. - */ - linearizer(detail::node const& expr, cudf::table_view left, cudf::table_view right) - : _left{left}, _right{right}, _node_count{0}, _intermediate_counter{} - { - expr.accept(*this); - } - - /** - * @brief Construct a new linearizer object - * - * @param expr The expression to create an evaluable linearizer for. - * @param table The table used for evaluating the abstract syntax tree. - */ - linearizer(detail::node const& expr, cudf::table_view table) - : _left{table}, _right{table}, _node_count{0}, _intermediate_counter{} - { - expr.accept(*this); - } - - /** - * @brief Get the root data type of the abstract syntax tree. - * - * @return cudf::data_type - */ - cudf::data_type root_data_type() const; - - /** - * @brief Get the maximum number of intermediates stored by the abstract syntax tree. - * - * @return cudf::size_type - */ - cudf::size_type intermediate_count() const { return _intermediate_counter.get_max_used(); } - - /** - * @brief Get the device data references. - * - * @return std::vector - */ - std::vector const& data_references() const - { - return _data_references; - } - - /** - * @brief Get the operators. - * - * @return std::vector - */ - std::vector const& operators() const { return _operators; } - - /** - * @brief Get the operator source indices. - * - * @return std::vector - */ - std::vector const& operator_source_indices() const - { - return _operator_source_indices; - } - - /** - * @brief Get the literal device views. - * - * @return std::vector - */ - std::vector const& literals() const - { - return _literals; - } - - /** - * @brief Visit a literal node. - * - * @param expr Literal node. - * @return cudf::size_type Index of device data reference for the node. - */ - cudf::size_type visit(literal const& expr); - - /** - * @brief Visit a column reference node. - * - * @param expr Column reference node. - * @return cudf::size_type Index of device data reference for the node. - */ - cudf::size_type visit(column_reference const& expr); - - /** - * @brief Visit an expression node. - * - * @param expr Expression node. - * @return cudf::size_type Index of device data reference for the node. - */ - cudf::size_type visit(expression const& expr); - - /** - * @brief Internal class used to track the utilization of intermediate storage locations. - * - * As nodes are being evaluated, they may generate "intermediate" data that is immediately - * consumed. Rather than manifesting this data in global memory, we can store intermediates of any - * fixed width type (up to 8 bytes) by placing them in shared memory. This class helps to track - * the number and indices of intermediate data in shared memory using a give-take model. Locations - * in shared memory can be "taken" and used for storage, "given back," and then later re-used. 
- * This aims to minimize the maximum amount of shared memory needed at any point during the - * evaluation. - * - */ - class intermediate_counter { - public: - intermediate_counter() : used_values(), max_used(0) {} - cudf::size_type take(); - void give(cudf::size_type value); - cudf::size_type get_max_used() const { return max_used; } - - private: - cudf::size_type find_first_missing() const; - std::vector used_values; - cudf::size_type max_used; - }; - - private: - std::vector visit_operands( - std::vector> operands); - cudf::size_type add_data_reference(detail::device_data_reference data_ref); - - // State information about the "linearized" GPU execution plan - cudf::table_view const& _left; - cudf::table_view const& _right; - cudf::size_type _node_count; - intermediate_counter _intermediate_counter; - std::vector _data_references; - std::vector _operators; - std::vector _operator_source_indices; - std::vector _literals; -}; - -} // namespace detail - -} // namespace ast - -} // namespace cudf diff --git a/cpp/include/cudf/ast/detail/operators.hpp b/cpp/include/cudf/ast/detail/operators.hpp index 01ec5b74b77..fd3a0775401 100644 --- a/cpp/include/cudf/ast/detail/operators.hpp +++ b/cpp/include/cudf/ast/detail/operators.hpp @@ -787,14 +787,6 @@ struct single_dispatch_binary_operator_types { } }; -struct single_dispatch_binary_operator { - template - CUDA_DEVICE_CALLABLE auto operator()(F&& f, Ts&&... args) - { - f.template operator()(std::forward(args)...); - } -}; - /** * @brief Functor performing a type dispatch for a binary operator. * diff --git a/cpp/include/cudf/ast/detail/transform.cuh b/cpp/include/cudf/ast/detail/transform.cuh index 89fa7d31980..8a99f1c93a8 100644 --- a/cpp/include/cudf/ast/detail/transform.cuh +++ b/cpp/include/cudf/ast/detail/transform.cuh @@ -15,7 +15,7 @@ */ #pragma once -#include +#include #include #include #include @@ -31,36 +31,12 @@ #include -#include - namespace cudf { namespace ast { namespace detail { -// Type trait for wrapping nullable types in a thrust::optional. Non-nullable -// types are returned as is. -template -struct possibly_null_value; - -template -struct possibly_null_value { - using type = thrust::optional; -}; - -template -struct possibly_null_value { - using type = T; -}; - -template -using possibly_null_value_t = typename possibly_null_value::type; - -// Type used for intermediate storage in expression evaluation. -template -using IntermediateDataType = possibly_null_value_t; - /** * @brief A container for capturing the output of an evaluated expression. * @@ -214,147 +190,30 @@ struct mutable_column_expression_result }; /** - * @brief A container of all device data required to evaluate an expression on tables. - * - * This struct should never be instantiated directly. It is created by the - * `ast_plan` on construction, and the resulting member is publicly accessible - * for passing to kernels for constructing an `expression_evaluator`. - * - */ -struct device_ast_plan { - device_span data_references; - device_span literals; - device_span operators; - device_span operator_source_indices; - cudf::size_type num_intermediates; - int shmem_per_thread; -}; - -/** - * @brief Preprocessor for an expression acting on tables to generate data suitable for AST - * expression evaluation on the GPU. - * - * On construction, an AST plan creates a single "packed" host buffer of all - * data arrays that will be necessary to evaluate an expression on a pair of - * tables. 
This data is copied to a single contiguous device buffer, and - * pointers are generated to the individual components. Because the plan tends - * to be small, this is the most efficient approach for low latency. All the - * data required on the GPU can be accessed via the convenient `dev_plan` - * member struct, which can be used to construct an `expression_evaluator` on - * the device. + * @brief Dispatch to a binary operator based on a single data type. * - * Note that the resulting device data cannot be used once this class goes out of scope. + * This functor is a dispatcher for binary operations that assumes that both + * operands to a binary operation are of the same type. This assumption is + * encoded in the one non-deducible template parameter LHS, the type of the + * left-hand operand, which is then used as the template parameter for both the + * left and right operands to the binary operator f. */ -struct ast_plan { +struct single_dispatch_binary_operator { /** - * @brief Construct an AST plan for an expression operating on two tables. + * @brief Single-type dispatch to a binary operation. * - * @param expr The expression for which to construct a plan. - * @param left The left table on which the expression acts. - * @param right The right table on which the expression acts. - * @param has_nulls Boolean indicator of whether or not the data contains nulls. - * @param stream Stream view on which to allocate resources and queue execution. - * @param mr Device memory resource used to allocate the returned column's device. - */ - ast_plan(detail::node const& expr, - cudf::table_view left, - cudf::table_view right, - bool has_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - : _linearizer(expr, left, right) - { - std::vector sizes; - std::vector data_pointers; - - extract_size_and_pointer(_linearizer.data_references(), sizes, data_pointers); - extract_size_and_pointer(_linearizer.literals(), sizes, data_pointers); - extract_size_and_pointer(_linearizer.operators(), sizes, data_pointers); - extract_size_and_pointer(_linearizer.operator_source_indices(), sizes, data_pointers); - - // Create device buffer - auto const buffer_size = std::accumulate(sizes.cbegin(), sizes.cend(), 0); - auto buffer_offsets = std::vector(sizes.size()); - thrust::exclusive_scan(sizes.cbegin(), sizes.cend(), buffer_offsets.begin(), 0); - - auto h_data_buffer = std::make_unique(buffer_size); - for (unsigned int i = 0; i < data_pointers.size(); ++i) { - std::memcpy(h_data_buffer.get() + buffer_offsets[i], data_pointers[i], sizes[i]); - } - - _device_data_buffer = rmm::device_buffer(h_data_buffer.get(), buffer_size, stream, mr); - - stream.synchronize(); - - // Create device pointers to components of plan - auto device_data_buffer_ptr = static_cast(_device_data_buffer.data()); - dev_plan.data_references = device_span( - reinterpret_cast(device_data_buffer_ptr + - buffer_offsets[0]), - _linearizer.data_references().size()); - dev_plan.literals = device_span( - reinterpret_cast( - device_data_buffer_ptr + buffer_offsets[1]), - _linearizer.literals().size()); - dev_plan.operators = device_span( - reinterpret_cast(device_data_buffer_ptr + buffer_offsets[2]), - _linearizer.operators().size()); - dev_plan.operator_source_indices = device_span( - reinterpret_cast(device_data_buffer_ptr + buffer_offsets[3]), - _linearizer.operator_source_indices().size()); - dev_plan.num_intermediates = _linearizer.intermediate_count(); - dev_plan.shmem_per_thread = static_cast( - (has_nulls ?
sizeof(IntermediateDataType) : sizeof(IntermediateDataType)) * - dev_plan.num_intermediates); - } - - /** - * @brief Construct an AST plan for an expression operating on one table. + * @tparam LHS Left input type. + * @tparam F Type of forwarded binary operator functor. + * @tparam Ts Parameter pack of forwarded arguments. * - * @param expr The expression for which to construct a plan. - * @param table The table on which the expression acts. - * @param has_nulls Boolean indicator of whether or not the data contains nulls. - * @param stream Stream view on which to allocate resources and queue execution. - * @param mr Device memory resource used to allocate the returned column's device. + * @param f Binary operator functor. + * @param args Forwarded arguments to `operator()` of `f`. */ - ast_plan(detail::node const& expr, - cudf::table_view table, - bool has_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - : ast_plan(expr, table, table, has_nulls, stream, mr) + template + CUDA_DEVICE_CALLABLE auto operator()(F&& f, Ts&&... args) { + f.template operator()(std::forward(args)...); } - - cudf::data_type output_type() const { return _linearizer.root_data_type(); } - - device_ast_plan - dev_plan; ///< The collection of data required to evaluate the expression on the device. - - private: - /** - * @brief Helper function for adding components (operators, literals, etc) to AST plan - * - * @tparam T The underlying type of the input `std::vector` - * @param[in] v The `std::vector` containing components (operators, literals, etc). - * @param[in,out] sizes The `std::vector` containing the size of each data buffer. - * @param[in,out] data_pointers The `std::vector` containing pointers to each data buffer. - */ - template - void extract_size_and_pointer(std::vector const& v, - std::vector& sizes, - std::vector& data_pointers) - { - auto const data_size = sizeof(T) * v.size(); - sizes.push_back(data_size); - data_pointers.push_back(v.data()); - } - - rmm::device_buffer - _device_data_buffer; ///< The device-side data buffer containing the plan information, which is - ///< owned by this class and persists until it is destroyed. - linearizer const _linearizer; ///< The linearizer created from the provided expression that is - ///< used to construct device-side operators and references. }; /** @@ -379,7 +238,7 @@ struct expression_evaluator { */ __device__ expression_evaluator(table_device_view const& left, table_device_view const& right, - device_ast_plan const& plan, + expression_device_view const& plan, IntermediateDataType* thread_intermediate_storage, null_equality compare_nulls = null_equality::EQUAL) : left(left), @@ -400,7 +259,7 @@ struct expression_evaluator { * @param compare_nulls Whether the equality operator returns true or false for two nulls. 
*/ __device__ expression_evaluator(table_device_view const& table, - device_ast_plan const& plan, + expression_device_view const& plan, IntermediateDataType* thread_intermediate_storage, null_equality compare_nulls = null_equality::EQUAL) : left(table), @@ -484,11 +343,11 @@ struct expression_evaluator { */ template __device__ void operator()(OutputType& output_object, - const cudf::size_type input_row_index, - const detail::device_data_reference input, - const detail::device_data_reference output, - const cudf::size_type output_row_index, - const ast_operator op) const + cudf::size_type const input_row_index, + detail::device_data_reference const input, + detail::device_data_reference const output, + cudf::size_type const output_row_index, + ast_operator const op) const { auto const typed_input = resolve_input(input, input_row_index); ast_operator_dispatcher(op, @@ -517,13 +376,13 @@ struct expression_evaluator { */ template __device__ void operator()(OutputType& output_object, - const cudf::size_type left_row_index, - const cudf::size_type right_row_index, - const detail::device_data_reference lhs, - const detail::device_data_reference rhs, - const detail::device_data_reference output, - const cudf::size_type output_row_index, - const ast_operator op) const + cudf::size_type const left_row_index, + cudf::size_type const right_row_index, + detail::device_data_reference const lhs, + detail::device_data_reference const rhs, + detail::device_data_reference const output, + cudf::size_type const output_row_index, + ast_operator const op) const { auto const typed_lhs = resolve_input(lhs, left_row_index); auto const typed_rhs = resolve_input(rhs, right_row_index); @@ -544,11 +403,11 @@ struct expression_evaluator { __device__ void operator()(OutputType& output_object, cudf::size_type left_row_index, cudf::size_type right_row_index, - const detail::device_data_reference lhs, - const detail::device_data_reference rhs, - const detail::device_data_reference output, + detail::device_data_reference const lhs, + detail::device_data_reference const rhs, + detail::device_data_reference const output, cudf::size_type output_row_index, - const ast_operator op) const + ast_operator const op) const { cudf_assert(false && "Invalid binary dispatch operator for the provided input."); } @@ -670,9 +529,9 @@ struct expression_evaluator { typename OutputType, CUDF_ENABLE_IF(is_rep_layout_compatible())> __device__ void resolve_output(OutputType& output_object, - const detail::device_data_reference device_data_reference, - const cudf::size_type row_index, - const possibly_null_value_t result) const + detail::device_data_reference const device_data_reference, + cudf::size_type const row_index, + possibly_null_value_t const result) const { auto const ref_type = device_data_reference.reference_type; if (ref_type == detail::device_data_reference_type::COLUMN) { @@ -690,9 +549,9 @@ struct expression_evaluator { typename OutputType, CUDF_ENABLE_IF(not is_rep_layout_compatible())> __device__ void resolve_output(OutputType& output_object, - const detail::device_data_reference device_data_reference, - const cudf::size_type row_index, - const possibly_null_value_t result) const + detail::device_data_reference const device_data_reference, + cudf::size_type const row_index, + possibly_null_value_t const result) const { cudf_assert(false && "Invalid type in resolve_output."); } @@ -730,9 +589,9 @@ struct expression_evaluator { typename OutputType, std::enable_if_t, Input>>* = nullptr> __device__ void operator()(OutputType& 
output_object, - const cudf::size_type output_row_index, - const possibly_null_value_t input, - const detail::device_data_reference output) const + cudf::size_type const output_row_index, + possibly_null_value_t const input, + detail::device_data_reference const output) const { using OperatorFunctor = detail::operator_functor; using Out = cuda::std::invoke_result_t; @@ -752,9 +611,9 @@ struct expression_evaluator { typename OutputType, std::enable_if_t, Input>>* = nullptr> __device__ void operator()(OutputType& output_object, - const cudf::size_type output_row_index, - const possibly_null_value_t input, - const detail::device_data_reference output) const + cudf::size_type const output_row_index, + possibly_null_value_t const input, + detail::device_data_reference const output) const { cudf_assert(false && "Invalid unary dispatch operator for the provided input."); } @@ -790,10 +649,10 @@ struct expression_evaluator { std::enable_if_t< detail::is_valid_binary_op, LHS, RHS>>* = nullptr> __device__ void operator()(OutputType& output_object, - const cudf::size_type output_row_index, - const possibly_null_value_t lhs, - const possibly_null_value_t rhs, - const detail::device_data_reference output) const + cudf::size_type const output_row_index, + possibly_null_value_t const lhs, + possibly_null_value_t const rhs, + detail::device_data_reference const output) const { using OperatorFunctor = detail::operator_functor; using Out = cuda::std::invoke_result_t; @@ -832,10 +691,10 @@ struct expression_evaluator { std::enable_if_t< !detail::is_valid_binary_op, LHS, RHS>>* = nullptr> __device__ void operator()(OutputType& output_object, - const cudf::size_type output_row_index, - const possibly_null_value_t lhs, - const possibly_null_value_t rhs, - const detail::device_data_reference output) const + cudf::size_type const output_row_index, + possibly_null_value_t const lhs, + possibly_null_value_t const rhs, + detail::device_data_reference const output) const { cudf_assert(false && "Invalid binary dispatch operator for the provided input."); } @@ -843,7 +702,7 @@ struct expression_evaluator { table_device_view const& left; ///< The left table to operate on. table_device_view const& right; ///< The right table to operate on. - device_ast_plan const& + expression_device_view const& plan; ///< The container of device data representing the expression to evaluate. IntermediateDataType* thread_intermediate_storage; ///< The shared memory store of intermediates produced during diff --git a/cpp/include/cudf/ast/nodes.hpp b/cpp/include/cudf/ast/nodes.hpp index 70dda58816e..f36d7bcd3c7 100644 --- a/cpp/include/cudf/ast/nodes.hpp +++ b/cpp/include/cudf/ast/nodes.hpp @@ -15,7 +15,6 @@ */ #pragma once -#include #include #include #include @@ -26,6 +25,21 @@ namespace cudf { namespace ast { +namespace detail { + +// Forward declaration +class expression_parser; +/** + * @brief A generic node that can be evaluated to return a value. + * + * This class is a part of a "visitor" pattern with the `expression_parser` class. + * Nodes inheriting from this class can accept visitors. + */ +struct node { + virtual cudf::size_type accept(expression_parser& visitor) const = 0; +}; + +} // namespace detail /** * @brief Enum of table references. @@ -96,10 +110,10 @@ class literal : public detail::node { * @param visitor Visitor. * @return cudf::size_type Index of device data reference for this instance.
*/ - cudf::size_type accept(detail::linearizer& visitor) const override; + cudf::size_type accept(detail::expression_parser& visitor) const override; private: - const cudf::detail::fixed_width_scalar_device_view_base value; + cudf::detail::fixed_width_scalar_device_view_base const value; }; /** @@ -140,7 +154,7 @@ class column_reference : public detail::node { * @param table Table used to determine types. * @return cudf::data_type */ - cudf::data_type get_data_type(const table_view& table) const + cudf::data_type get_data_type(table_view const& table) const { return table.column(get_column_index()).type(); } @@ -152,9 +166,9 @@ class column_reference : public detail::node { * @param right_table Right table used to determine types. * @return cudf::data_type */ - cudf::data_type get_data_type(const table_view& left_table, const table_view& right_table) const + cudf::data_type get_data_type(table_view const& left_table, table_view const& right_table) const { - const auto table = [&] { + auto const table = [&] { if (get_table_source() == table_reference::LEFT) { return left_table; } else if (get_table_source() == table_reference::RIGHT) { @@ -172,7 +186,7 @@ class column_reference : public detail::node { * @param visitor Visitor. * @return cudf::size_type Index of device data reference for this instance. */ - cudf::size_type accept(detail::linearizer& visitor) const override; + cudf::size_type accept(detail::expression_parser& visitor) const override; private: cudf::size_type column_index; @@ -230,7 +244,7 @@ class expression : public detail::node { * * @return std::vector> */ - std::vector> get_operands() const { return operands; } + std::vector> get_operands() const { return operands; } /** * @brief Accepts a visitor class. @@ -238,11 +252,11 @@ class expression : public detail::node { * @param visitor Visitor. * @return cudf::size_type Index of device data reference for this instance. */ - cudf::size_type accept(detail::linearizer& visitor) const override; + cudf::size_type accept(detail::expression_parser& visitor) const override; private: - const ast_operator op; - const std::vector> operands; + ast_operator const op; + std::vector> const operands; }; } // namespace ast diff --git a/cpp/src/ast/linearizer.cpp b/cpp/src/ast/expression_parser.cpp similarity index 81% rename from cpp/src/ast/linearizer.cpp rename to cpp/src/ast/expression_parser.cpp index 3e442305552..66d72fbb454 100644 --- a/cpp/src/ast/linearizer.cpp +++ b/cpp/src/ast/expression_parser.cpp @@ -13,7 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include +#include #include #include #include @@ -56,7 +56,7 @@ device_data_reference::device_data_reference(device_data_reference_type referenc { } -cudf::size_type linearizer::intermediate_counter::take() +cudf::size_type expression_parser::intermediate_counter::take() { auto const first_missing = find_first_missing(); used_values.insert(used_values.cbegin() + first_missing, first_missing); @@ -64,7 +64,7 @@ cudf::size_type linearizer::intermediate_counter::take() return first_missing; } -void linearizer::intermediate_counter::give(cudf::size_type value) +void expression_parser::intermediate_counter::give(cudf::size_type value) { // TODO: add comment auto const lower_bound = std::lower_bound(used_values.cbegin(), used_values.cend(), value); @@ -72,18 +72,7 @@ void linearizer::intermediate_counter::give(cudf::size_type value) used_values.erase(lower_bound); } -/** - * @brief Find the first missing value in a contiguous sequence of integers. - * - * From a sorted container of integers, find the first "missing" value. - * For example, {0, 1, 2, 4, 5} is missing 3, and {1, 2, 3} is missing 0. - * If there are no missing values, return the size of the container. - * - * @param start Starting index. - * @param end Ending index. - * @return cudf::size_type Smallest value not already in the container. - */ -cudf::size_type linearizer::intermediate_counter::find_first_missing() const +cudf::size_type expression_parser::intermediate_counter::find_first_missing() const { if (used_values.empty() || (used_values.front() != 0)) { return 0; } // Search for the first non-contiguous pair of elements. @@ -94,7 +83,7 @@ cudf::size_type linearizer::intermediate_counter::find_first_missing() const : used_values.size(); // No missing elements. Return the next element in the sequence. 
} -cudf::size_type linearizer::visit(literal const& expr) +cudf::size_type expression_parser::visit(literal const& expr) { _node_count++; // Increment the node index auto const data_type = expr.get_data_type(); // Resolve node type @@ -106,7 +95,7 @@ cudf::size_type linearizer::visit(literal const& expr) return add_data_reference(source); } -cudf::size_type linearizer::visit(column_reference const& expr) +cudf::size_type expression_parser::visit(column_reference const& expr) { // Increment the node index _node_count++; @@ -122,14 +111,14 @@ cudf::size_type linearizer::visit(column_reference const& expr) return add_data_reference(source); } -cudf::size_type linearizer::visit(expression const& expr) +cudf::size_type expression_parser::visit(expression const& expr) { // Increment the node index auto const node_index = _node_count++; // Visit children (operands) of this node auto const operand_data_ref_indices = visit_operands(expr.get_operands()); // Resolve operand types - auto data_ref = [this](auto const& index) { return data_references()[index].data_type; }; + auto data_ref = [this](auto const& index) { return _data_references[index].data_type; }; auto begin = thrust::make_transform_iterator(operand_data_ref_indices.cbegin(), data_ref); auto end = begin + operand_data_ref_indices.size(); auto const operand_types = std::vector(begin, end); @@ -145,7 +134,7 @@ cudf::size_type linearizer::visit(expression const& expr) operand_data_ref_indices.cbegin(), operand_data_ref_indices.cend(), [this](auto const& data_reference_index) { - auto const operand_source = data_references()[data_reference_index]; + auto const operand_source = _data_references[data_reference_index]; if (operand_source.reference_type == detail::device_data_reference_type::INTERMEDIATE) { auto const intermediate_index = operand_source.data_index; _intermediate_counter.give(intermediate_index); @@ -167,7 +156,8 @@ cudf::size_type linearizer::visit(expression const& expr) if (!cudf::is_fixed_width(data_type)) { CUDF_FAIL( "The output data type is not a fixed-width type but must be stored in an intermediate."); - } else if (cudf::size_of(data_type) > sizeof(std::int64_t)) { + } else if (cudf::size_of(data_type) > (_has_nulls ? sizeof(IntermediateDataType) + : sizeof(IntermediateDataType))) { CUDF_FAIL("The output data type is too large to be stored in an intermediate."); } return detail::device_data_reference( @@ -183,14 +173,14 @@ cudf::size_type linearizer::visit(expression const& expr) return index; } -cudf::data_type linearizer::root_data_type() const +cudf::data_type expression_parser::output_type() const { - return data_references().empty() ? cudf::data_type(cudf::type_id::EMPTY) - : data_references().back().data_type; + return _data_references.empty() ? cudf::data_type(cudf::type_id::EMPTY) + : _data_references.back().data_type; } -std::vector linearizer::visit_operands( - std::vector> operands) +std::vector expression_parser::visit_operands( + std::vector> operands) { auto operand_data_reference_indices = std::vector(); for (auto const& operand : operands) { @@ -200,7 +190,7 @@ std::vector linearizer::visit_operands( return operand_data_reference_indices; } -cudf::size_type linearizer::add_data_reference(detail::device_data_reference data_ref) +cudf::size_type expression_parser::add_data_reference(detail::device_data_reference data_ref) { // If an equivalent data reference already exists, return its index. Otherwise add this data // reference and return the new index. 
@@ -215,12 +205,15 @@ cudf::size_type linearizer::add_data_reference(detail::device_data_reference dat } // namespace detail -cudf::size_type literal::accept(detail::linearizer& visitor) const { return visitor.visit(*this); } -cudf::size_type column_reference::accept(detail::linearizer& visitor) const +cudf::size_type literal::accept(detail::expression_parser& visitor) const +{ + return visitor.visit(*this); +} +cudf::size_type column_reference::accept(detail::expression_parser& visitor) const { return visitor.visit(*this); } -cudf::size_type expression::accept(detail::linearizer& visitor) const +cudf::size_type expression::accept(detail::expression_parser& visitor) const { return visitor.visit(*this); } diff --git a/cpp/src/ast/transform.cu b/cpp/src/ast/transform.cu index d6426f92002..4656d4b48c0 100644 --- a/cpp/src/ast/transform.cu +++ b/cpp/src/ast/transform.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include #include #include #include @@ -47,13 +48,14 @@ namespace detail { * @tparam has_nulls whether or not the output column may contain nulls. * * @param table The table device view used for evaluation. - * @param plan Container of device data required to evaluate the desired expression. + * @param device_expression_data Container of device data required to evaluate the desired + * expression. * @param output_column The destination for the results of evaluating the expression. */ template __launch_bounds__(max_block_size) __global__ void compute_column_kernel(table_device_view const table, - device_ast_plan plan, + ast::detail::expression_device_view device_expression_data, mutable_column_device_view output_column) { // The (required) extern storage of the shared memory array leads to @@ -64,11 +66,12 @@ __launch_bounds__(max_block_size) __global__ IntermediateDataType* intermediate_storage = reinterpret_cast*>(raw_intermediate_storage); - auto thread_intermediate_storage = &intermediate_storage[threadIdx.x * plan.num_intermediates]; + auto thread_intermediate_storage = + &intermediate_storage[threadIdx.x * device_expression_data.num_intermediates]; auto const start_idx = static_cast(threadIdx.x + blockIdx.x * blockDim.x); auto const stride = static_cast(blockDim.x * gridDim.x); - auto evaluator = - cudf::ast::detail::expression_evaluator(table, plan, thread_intermediate_storage); + auto evaluator = cudf::ast::detail::expression_evaluator( + table, device_expression_data, thread_intermediate_storage); for (cudf::size_type row_index = start_idx; row_index < table.num_rows(); row_index += stride) { auto output_dest = mutable_column_expression_result(output_column); @@ -89,19 +92,19 @@ std::unique_ptr compute_column(table_view const table, auto const nullable = cudf::nullable(table); auto const has_nulls = nullable && cudf::has_nulls(table); - auto const plan = ast_plan{expr, table, has_nulls, stream, mr}; + auto const parser = ast::detail::expression_parser{expr, table, has_nulls, stream, mr}; auto const output_column_mask_state = nullable ? (has_nulls ? 
mask_state::UNINITIALIZED : mask_state::ALL_VALID) : mask_state::UNALLOCATED; auto output_column = cudf::make_fixed_width_column( - plan.output_type(), table.num_rows(), output_column_mask_state, stream, mr); + parser.output_type(), table.num_rows(), output_column_mask_state, stream, mr); auto mutable_output_device = cudf::mutable_column_device_view::create(output_column->mutable_view(), stream); // Configure kernel parameters - auto const& dev_plan = plan.dev_plan; + auto const& device_expression_data = parser.device_expression_data; int device_id; CUDA_TRY(cudaGetDevice(&device_id)); int shmem_limit_per_block; @@ -109,22 +112,23 @@ std::unique_ptr compute_column(table_view const table, cudaDeviceGetAttribute(&shmem_limit_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); auto constexpr MAX_BLOCK_SIZE = 128; auto const block_size = - dev_plan.shmem_per_thread != 0 - ? std::min(MAX_BLOCK_SIZE, shmem_limit_per_block / dev_plan.shmem_per_thread) + device_expression_data.shmem_per_thread != 0 + ? std::min(MAX_BLOCK_SIZE, shmem_limit_per_block / device_expression_data.shmem_per_thread) : MAX_BLOCK_SIZE; - auto const config = cudf::detail::grid_1d{table.num_rows(), block_size}; - auto const shmem_per_block = dev_plan.shmem_per_thread * config.num_threads_per_block; + auto const config = cudf::detail::grid_1d{table.num_rows(), block_size}; + auto const shmem_per_block = + device_expression_data.shmem_per_thread * config.num_threads_per_block; // Execute the kernel auto table_device = table_device_view::create(table, stream); if (has_nulls) { cudf::ast::detail::compute_column_kernel <<>>( - *table_device, dev_plan, *mutable_output_device); + *table_device, device_expression_data, *mutable_output_device); } else { cudf::ast::detail::compute_column_kernel <<>>( - *table_device, dev_plan, *mutable_output_device); + *table_device, device_expression_data, *mutable_output_device); } CHECK_CUDA(stream.value()); return output_column; diff --git a/cpp/src/join/conditional_join.cu b/cpp/src/join/conditional_join.cu index c7a1630311b..1538780db5e 100644 --- a/cpp/src/join/conditional_join.cu +++ b/cpp/src/join/conditional_join.cu @@ -37,7 +37,7 @@ conditional_join(table_view left, { CUDF_FUNC_RANGE(); return get_conditional_join_indices( - left, right, JoinKind, binary_predicate, compare_nulls, stream, mr); + left, right, binary_predicate, compare_nulls, JoinKind, stream, mr); } } // namespace detail diff --git a/cpp/src/join/conditional_join.cuh b/cpp/src/join/conditional_join.cuh index 4602b7fefaa..6794fc89e9e 100644 --- a/cpp/src/join/conditional_join.cuh +++ b/cpp/src/join/conditional_join.cuh @@ -50,9 +50,9 @@ std::pair>, std::unique_ptr>> get_conditional_join_indices(table_view const& left, table_view const& right, - join_kind JoinKind, - ast::expression binary_pred, + ast::expression binary_predicate, null_equality compare_nulls, + join_kind JoinKind, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -83,8 +83,9 @@ get_conditional_join_indices(table_view const& left, auto const nullable = cudf::nullable(left) || cudf::nullable(right); auto const has_nulls = nullable && (cudf::has_nulls(left) || cudf::has_nulls(right)); - auto const plan = ast::detail::ast_plan{binary_pred, left, right, has_nulls, stream, mr}; - CUDF_EXPECTS(plan.output_type().id() == type_id::BOOL8, + auto const parser = + ast::detail::expression_parser{binary_predicate, left, right, has_nulls, stream, mr}; + CUDF_EXPECTS(parser.output_type().id() == type_id::BOOL8, "The expression must produce a 
boolean output."); auto left_table = table_device_view::create(left, stream); @@ -95,7 +96,8 @@ get_conditional_join_indices(table_view const& left, CHECK_CUDA(stream.value()); constexpr int block_size{DEFAULT_JOIN_BLOCK_SIZE}; detail::grid_1d config(left_table->num_rows(), block_size); - auto const shmem_size_per_block = plan.dev_plan.shmem_per_thread * config.num_threads_per_block; + auto const shmem_size_per_block = + parser.device_expression_data.shmem_per_thread * config.num_threads_per_block; // Determine number of output rows without actually building the output to simply // find what the size of the output will be. @@ -103,11 +105,21 @@ get_conditional_join_indices(table_view const& left, if (has_nulls) { compute_conditional_join_output_size <<>>( - *left_table, *right_table, KernelJoinKind, compare_nulls, plan.dev_plan, size.data()); + *left_table, + *right_table, + KernelJoinKind, + compare_nulls, + parser.device_expression_data, + size.data()); } else { compute_conditional_join_output_size <<>>( - *left_table, *right_table, KernelJoinKind, compare_nulls, plan.dev_plan, size.data()); + *left_table, + *right_table, + KernelJoinKind, + compare_nulls, + parser.device_expression_data, + size.data()); } CHECK_CUDA(stream.value()); @@ -124,8 +136,8 @@ get_conditional_join_indices(table_view const& left, auto left_indices = std::make_unique>(join_size, stream, mr); auto right_indices = std::make_unique>(join_size, stream, mr); - const auto& join_output_l = left_indices->data(); - const auto& join_output_r = right_indices->data(); + auto const& join_output_l = left_indices->data(); + auto const& join_output_r = right_indices->data(); if (has_nulls) { conditional_join <<>>( @@ -136,7 +148,7 @@ get_conditional_join_indices(table_view const& left, join_output_l, join_output_r, write_index.data(), - plan.dev_plan, + parser.device_expression_data, join_size); } else { conditional_join @@ -148,7 +160,7 @@ get_conditional_join_indices(table_view const& left, join_output_l, join_output_r, write_index.data(), - plan.dev_plan, + parser.device_expression_data, join_size); } diff --git a/cpp/src/join/conditional_join_kernels.cuh b/cpp/src/join/conditional_join_kernels.cuh index 3d34a49c5af..c6edf049d5d 100644 --- a/cpp/src/join/conditional_join_kernels.cuh +++ b/cpp/src/join/conditional_join_kernels.cuh @@ -20,7 +20,7 @@ #include #include -#include +#include #include #include #include @@ -42,16 +42,18 @@ namespace detail { * @param[in] right_table The right table * @param[in] JoinKind The type of join to be performed * @param[in] compare_nulls Controls whether null join-key values should match or not. - * @param[in] plan Container of device data required to evaluate the desired expression. + * @param[in] device_expression_data Container of device data required to evaluate the desired + * expression. * @param[out] output_size The resulting output size */ template -__global__ void compute_conditional_join_output_size(table_device_view left_table, - table_device_view right_table, - join_kind JoinKind, - null_equality compare_nulls, - ast::detail::device_ast_plan plan, - cudf::size_type* output_size) +__global__ void compute_conditional_join_output_size( + table_device_view left_table, + table_device_view right_table, + join_kind JoinKind, + null_equality compare_nulls, + ast::detail::expression_device_view device_expression_data, + cudf::size_type* output_size) { // The (required) extern storage of the shared memory array leads to // conflicting declarations between different templates. 
The easiest @@ -60,16 +62,17 @@ __global__ void compute_conditional_join_output_size(table_device_view left_tabl extern __shared__ char raw_intermediate_storage[]; cudf::ast::detail::IntermediateDataType* intermediate_storage = reinterpret_cast*>(raw_intermediate_storage); - auto thread_intermediate_storage = &intermediate_storage[threadIdx.x * plan.num_intermediates]; + auto thread_intermediate_storage = + &intermediate_storage[threadIdx.x * device_expression_data.num_intermediates]; cudf::size_type thread_counter(0); - const cudf::size_type left_start_idx = threadIdx.x + blockIdx.x * blockDim.x; - const cudf::size_type left_stride = blockDim.x * gridDim.x; - const cudf::size_type left_num_rows = left_table.num_rows(); - const cudf::size_type right_num_rows = right_table.num_rows(); + cudf::size_type const left_start_idx = threadIdx.x + blockIdx.x * blockDim.x; + cudf::size_type const left_stride = blockDim.x * gridDim.x; + cudf::size_type const left_num_rows = left_table.num_rows(); + cudf::size_type const right_num_rows = right_table.num_rows(); auto evaluator = cudf::ast::detail::expression_evaluator( - left_table, right_table, plan, thread_intermediate_storage, compare_nulls); + left_table, right_table, device_expression_data, thread_intermediate_storage, compare_nulls); for (cudf::size_type left_row_index = left_start_idx; left_row_index < left_num_rows; left_row_index += left_stride) { @@ -118,7 +121,8 @@ __global__ void compute_conditional_join_output_size(table_device_view left_tabl * @param[out] join_output_r The right result of the join operation * @param[in,out] current_idx A global counter used by threads to coordinate * writes to the global output - * @param plan Container of device data required to evaluate the desired expression. + * @param device_expression_data Container of device data required to evaluate the desired + * expression. 
* @param[in] max_size The maximum size of the output */ template @@ -129,8 +133,8 @@ __global__ void conditional_join(table_device_view left_table, cudf::size_type* join_output_l, cudf::size_type* join_output_r, cudf::size_type* current_idx, - cudf::ast::detail::device_ast_plan plan, - const cudf::size_type max_size) + cudf::ast::detail::expression_device_view device_expression_data, + cudf::size_type const max_size) { constexpr int num_warps = block_size / detail::warp_size; __shared__ cudf::size_type current_idx_shared[num_warps]; @@ -144,12 +148,13 @@ __global__ void conditional_join(table_device_view left_table, extern __shared__ char raw_intermediate_storage[]; cudf::ast::detail::IntermediateDataType* intermediate_storage = reinterpret_cast*>(raw_intermediate_storage); - auto thread_intermediate_storage = &intermediate_storage[threadIdx.x * plan.num_intermediates]; + auto thread_intermediate_storage = + &intermediate_storage[threadIdx.x * device_expression_data.num_intermediates]; - const int warp_id = threadIdx.x / detail::warp_size; - const int lane_id = threadIdx.x % detail::warp_size; - const cudf::size_type left_num_rows = left_table.num_rows(); - const cudf::size_type right_num_rows = right_table.num_rows(); + int const warp_id = threadIdx.x / detail::warp_size; + int const lane_id = threadIdx.x % detail::warp_size; + cudf::size_type const left_num_rows = left_table.num_rows(); + cudf::size_type const right_num_rows = right_table.num_rows(); if (0 == lane_id) { current_idx_shared[warp_id] = 0; } @@ -157,10 +162,10 @@ __global__ void conditional_join(table_device_view left_table, cudf::size_type left_row_index = threadIdx.x + blockIdx.x * blockDim.x; - const unsigned int activemask = __ballot_sync(0xffffffff, left_row_index < left_num_rows); + unsigned int const activemask = __ballot_sync(0xffffffff, left_row_index < left_num_rows); auto evaluator = cudf::ast::detail::expression_evaluator( - left_table, right_table, plan, thread_intermediate_storage, compare_nulls); + left_table, right_table, device_expression_data, thread_intermediate_storage, compare_nulls); if (left_row_index < left_num_rows) { bool found_match = false;
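The sketches below are illustrative notes on the machinery this diff renames and relocates; they are standalone host-side examples, not part of the patch. First, the `possibly_null_value` trait that moves into `expression_parser.hpp`: it wraps a type in an optional only when the expression may produce nulls, so non-nullable evaluation pays no storage overhead for validity. A minimal sketch, using `std::optional` in place of `thrust::optional` so it compiles without CUDA:

```cpp
#include <cstdint>
#include <optional>
#include <type_traits>

// Standalone illustration of the possibly_null_value trait; the cudf version
// uses thrust::optional and is what IntermediateDataType<has_nulls> builds on.
template <typename T, bool has_nulls>
struct possibly_null_value;

template <typename T>
struct possibly_null_value<T, true> {
  using type = std::optional<T>;  // nullable path: value plus validity in one object
};

template <typename T>
struct possibly_null_value<T, false> {
  using type = T;  // non-nullable path: the raw value, no overhead
};

template <typename T, bool has_nulls>
using possibly_null_value_t = typename possibly_null_value<T, has_nulls>::type;

static_assert(std::is_same_v<possibly_null_value_t<std::int64_t, false>, std::int64_t>);
static_assert(std::is_same_v<possibly_null_value_t<std::int64_t, true>, std::optional<std::int64_t>>);

int main() {}
```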
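The `intermediate_counter` documentation describes a give/take model for reusing the shared-memory slots that hold intermediate results. The sketch below is a simplified standalone version of that bookkeeping (the class name, `int` indices, and the `main` driver are illustrative, not the cudf class); it shows how returning a slot lets a later `take()` hand out the same index again, keeping the high-water mark small:

```cpp
#include <algorithm>
#include <cassert>
#include <vector>

// Simplified stand-in for expression_parser::intermediate_counter.
class slot_counter {
 public:
  // Claim the lowest index not currently in use and record it (vector stays sorted).
  int take()
  {
    int const slot = find_first_missing();
    used_.insert(used_.cbegin() + slot, slot);
    max_used_ = std::max(max_used_, slot + 1);  // high-water mark of required slots
    return slot;
  }

  // Return a slot so a later take() can reuse the same index.
  void give(int slot)
  {
    auto const it = std::lower_bound(used_.cbegin(), used_.cend(), slot);
    if (it != used_.cend() && *it == slot) { used_.erase(it); }
  }

  int max_used() const { return max_used_; }

 private:
  // First value missing from the sorted sequence: {0, 1, 2, 4} -> 3, {1, 2} -> 0.
  int find_first_missing() const
  {
    for (int i = 0; i < static_cast<int>(used_.size()); ++i) {
      if (used_[i] != i) { return i; }
    }
    return static_cast<int>(used_.size());
  }

  std::vector<int> used_;
  int max_used_ = 0;
};

int main()
{
  slot_counter counter;
  int const a = counter.take();  // 0
  int const b = counter.take();  // 1
  counter.give(a);               // index 0 is free again
  int const c = counter.take();  // reuses 0 instead of growing to 2
  assert(a == 0 && b == 1 && c == 0 && counter.max_used() == 2);
  return 0;
}
```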
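`expression_parser::move_to_device` packs every host-side component (data references, literals, operators, operator source indices) into one contiguous buffer so that a single allocation and a single host-to-device copy suffice, then carves typed spans back out at exclusive-scan offsets. A host-only sketch of that packing scheme, with placeholder element types standing in for the cudf internals and `std::exclusive_scan` standing in for the thrust call used in the patch:

```cpp
#include <cstdint>
#include <cstring>
#include <numeric>
#include <vector>

int main()
{
  std::vector<std::int32_t> data_references{7, 8, 9};   // stand-in for device_data_reference
  std::vector<std::int64_t> operators{1, 2};            // stand-in for ast_operator
  std::vector<std::int32_t> operator_sources{0, 1, 2};  // stand-in for source indices

  std::vector<std::size_t> sizes{data_references.size() * sizeof(std::int32_t),
                                 operators.size() * sizeof(std::int64_t),
                                 operator_sources.size() * sizeof(std::int32_t)};
  std::vector<void const*> pointers{data_references.data(), operators.data(),
                                    operator_sources.data()};

  // Offsets are an exclusive prefix sum of the sizes: {0, 12, 28} for this data.
  std::vector<std::size_t> offsets(sizes.size());
  std::exclusive_scan(sizes.cbegin(), sizes.cend(), offsets.begin(), std::size_t{0});

  auto const total = std::accumulate(sizes.cbegin(), sizes.cend(), std::size_t{0});
  std::vector<char> packed(total);
  for (std::size_t i = 0; i < pointers.size(); ++i) {
    std::memcpy(packed.data() + offsets[i], pointers[i], sizes[i]);
  }
  // In libcudf this packed buffer becomes a single rmm::device_buffer, and the
  // spans of expression_device_view are carved out at packed.data() + offsets[i].
  return 0;
}
```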
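`single_dispatch_binary_operator`, now documented where it is defined in `transform.cuh`, dispatches a binary operation under the assumption that both operands share one type: the single non-deduced template parameter is forwarded as both template arguments of the wrapped functor. A host-only sketch of that idea (the real functor is `CUDA_DEVICE_CALLABLE` and sits at the end of cudf's type-dispatch chain; the example functor here is made up):

```cpp
#include <cstdint>
#include <iostream>
#include <utility>

// Example binary functor that expects two template type parameters.
struct typed_add {
  template <typename LHS, typename RHS>
  void operator()(LHS lhs, RHS rhs) const
  {
    std::cout << "sum = " << (lhs + rhs) << '\n';
  }
};

struct single_dispatch_binary_operator {
  template <typename LHS, typename F, typename... Ts>
  auto operator()(F&& f, Ts&&... args) const
  {
    // Both sides of the binary operator receive the single dispatched type LHS.
    return f.template operator()<LHS, LHS>(std::forward<Ts>(args)...);
  }
};

int main()
{
  single_dispatch_binary_operator dispatcher;
  dispatcher.operator()<std::int32_t>(typed_add{}, 2, 3);  // prints "sum = 5"
  return 0;
}
```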
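Both `compute_column` and the conditional-join path size their kernel launches from `device_expression_data.shmem_per_thread`, which is `sizeof(IntermediateDataType<has_nulls>) * num_intermediates`. The arithmetic below mirrors the block-size selection in `cpp/src/ast/transform.cu`; the 16-byte figure for a nullable intermediate and the 48 KiB per-block shared-memory limit are assumptions chosen for the example, not values taken from the patch:

```cpp
#include <algorithm>
#include <cstdio>

int main()
{
  bool const has_nulls              = true;
  int const num_intermediates       = 3;                  // reported by the parser
  int const intermediate_size       = has_nulls ? 16 : 8; // bytes per shared-memory slot (assumed)
  int const shmem_per_thread        = intermediate_size * num_intermediates;
  int const shmem_limit_per_block   = 48 * 1024;          // assumed device limit
  int const MAX_BLOCK_SIZE          = 128;

  // Same selection rule as the kernel launch code: shrink the block if the
  // per-thread shared-memory demand would exceed the per-block limit.
  int const block_size = shmem_per_thread != 0
                           ? std::min(MAX_BLOCK_SIZE, shmem_limit_per_block / shmem_per_thread)
                           : MAX_BLOCK_SIZE;
  int const shmem_per_block = shmem_per_thread * block_size;

  std::printf("block_size=%d, dynamic shared memory per block=%d bytes\n",
              block_size, shmem_per_block);  // 128 threads, 6144 bytes for these numbers
  return 0;
}
```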
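None of this changes the user-facing AST API: an expression is still built from `column_reference`, `literal`, and `expression` nodes and handed to `cudf::ast::compute_column`, which now constructs an `expression_parser` internally instead of an `ast_plan`/`linearizer`. A usage sketch, assuming the headers and constructor signatures present on the branch this diff targets:

```cpp
#include <cudf/ast/nodes.hpp>
#include <cudf/ast/operators.hpp>
#include <cudf/ast/transform.hpp>
#include <cudf/column/column.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf/table/table_view.hpp>

#include <cstdint>
#include <memory>

// Computes `table.column(0) + 5` for every row. compute_column parses the
// expression on the host, packs the parser output into one device buffer, and
// launches compute_column_kernel.
std::unique_ptr<cudf::column> add_five_to_first_column(cudf::table_view const& table)
{
  auto col_ref = cudf::ast::column_reference(0);           // column 0 of the (left) table
  auto five    = cudf::numeric_scalar<std::int32_t>(5);
  auto lit     = cudf::ast::literal(five);                 // wraps a scalar device view
  auto expr    = cudf::ast::expression(cudf::ast::ast_operator::ADD, col_ref, lit);
  return cudf::ast::compute_column(table, expr);
}
```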