Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Python bindings for string literal support in AST #13073

Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
982af8a
string scalar support in AST - proof of concept
karthikeyann Mar 30, 2023
0a9eb86
Add cudf::ast::generic_scalar_device_view
karthikeyann Apr 4, 2023
50ee55d
remove filter by range example from test code
karthikeyann Apr 4, 2023
9735d51
cleanup docs
karthikeyann Apr 4, 2023
8653e61
Merge branch 'branch-23.06' of github.com:rapidsai/cudf into fea-stri…
karthikeyann Apr 4, 2023
7ad5c5d
add cython bindings, unit tests for string literal support in AST
karthikeyann Apr 5, 2023
3a40c31
Apply suggestions from code review
karthikeyann Apr 18, 2023
24b6589
Merge branch 'branch-23.06' into fea-cython-string_scalar_ast_compare
karthikeyann Apr 18, 2023
3405241
Merge branch 'branch-23.06' into fea-cython-string_scalar_ast_compare
karthikeyann Apr 20, 2023
4c44afb
cleanup cython Literal, update docs
karthikeyann Apr 21, 2023
74cd710
Merge branch 'branch-23.06' of github.com:rapidsai/cudf into fea-cyth…
karthikeyann Apr 21, 2023
5d80737
Merge branch 'branch-23.06' into fea-cython-string_scalar_ast_compare
karthikeyann Apr 25, 2023
c3d8b66
Merge branch 'branch-23.06' into fea-cython-string_scalar_ast_compare
karthikeyann Apr 26, 2023
215d4db
Merge branch 'branch-23.06' into fea-cython-string_scalar_ast_compare
karthikeyann May 2, 2023
fa31360
Update python/cudf/cudf/core/dataframe.py
vyasr May 3, 2023
ab9fd3d
Merge branch 'branch-23.06' into fea-cython-string_scalar_ast_compare
vyasr May 3, 2023
30c4b96
Merge branch 'branch-23.06' into fea-cython-string_scalar_ast_compare
karthikeyann May 5, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions cpp/include/cudf/ast/detail/expression_evaluator.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2022, NVIDIA CORPORATION.
* Copyright (c) 2021-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -21,7 +21,6 @@
#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/utilities/assert.cuh>
#include <cudf/scalar/scalar_device_view.cuh>
#include <cudf/table/table_device_view.cuh>
#include <cudf/table/table_view.hpp>
#include <cudf/types.hpp>
Expand Down
15 changes: 7 additions & 8 deletions cpp/include/cudf/ast/detail/expression_parser.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
* Copyright (c) 2020-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -104,7 +104,7 @@ using IntermediateDataType = possibly_null_value_t<std::int64_t, has_nulls>;
*/
struct expression_device_view {
device_span<detail::device_data_reference const> data_references;
device_span<cudf::detail::fixed_width_scalar_device_view_base const> literals;
device_span<generic_scalar_device_view const> literals;
device_span<ast_operator const> operators;
device_span<cudf::size_type const> operator_source_indices;
cudf::size_type num_intermediates;
Expand Down Expand Up @@ -281,11 +281,10 @@ class expression_parser {
reinterpret_cast<detail::device_data_reference const*>(device_data_buffer_ptr +
buffer_offsets[0]),
_data_references.size());
device_expression_data.literals =
device_span<cudf::detail::fixed_width_scalar_device_view_base const>(
reinterpret_cast<cudf::detail::fixed_width_scalar_device_view_base const*>(
device_data_buffer_ptr + buffer_offsets[1]),
_literals.size());
device_expression_data.literals = device_span<generic_scalar_device_view const>(
reinterpret_cast<generic_scalar_device_view const*>(device_data_buffer_ptr +
buffer_offsets[1]),
_literals.size());
device_expression_data.operators = device_span<ast_operator const>(
reinterpret_cast<ast_operator const*>(device_data_buffer_ptr + buffer_offsets[2]),
_operators.size());
Expand Down Expand Up @@ -335,7 +334,7 @@ class expression_parser {
std::vector<detail::device_data_reference> _data_references;
std::vector<ast_operator> _operators;
std::vector<cudf::size_type> _operator_source_indices;
std::vector<cudf::detail::fixed_width_scalar_device_view_base> _literals;
std::vector<generic_scalar_device_view> _literals;
};

} // namespace detail
Expand Down
115 changes: 103 additions & 12 deletions cpp/include/cudf/ast/expressions.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
* Copyright (c) 2020-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -150,6 +150,96 @@ enum class table_reference {
OUTPUT ///< Column index in the output table
};

/**
* @brief A type-erased scalar_device_view where the value is a fixed width type or a string
*
* The view keeps only an untyped pointer to the value and, for strings, the length in bytes.
* The element type is not stored as a template parameter; it is chosen by the caller each
* time `value<T>()` is invoked.
*/
class generic_scalar_device_view : public cudf::detail::scalar_device_view_base {
public:
/**
* @brief Returns the stored value.
*
* No check is made that `T` matches the type of the scalar this view was built from;
* the stored pointer is simply reinterpreted as `T`.
*
* @tparam T The desired type
* @returns The stored value
*/
template <typename T>
__device__ T const value() const noexcept
{
// Strings are rebuilt from the raw character pointer plus the stored byte length.
if constexpr (std::is_same_v<T, cudf::string_view>) {
return string_view(static_cast<char const*>(_data), _size);
}
// Every other type is read directly through the type-erased pointer.
return *static_cast<T const*>(_data);
}

/** @brief Construct a new generic scalar device view object from a numeric scalar
*
* Forwards the scalar's type, data pointer and validity pointer; no size is recorded.
*
* @param s The numeric scalar to construct from
*/
template <typename T>
generic_scalar_device_view(numeric_scalar<T>& s)
: generic_scalar_device_view(s.type(), s.data(), s.validity_data())
{
}

/** @brief Construct a new generic scalar device view object from a timestamp scalar
*
* Forwards the scalar's type, data pointer and validity pointer; no size is recorded.
*
* @param s The timestamp scalar to construct from
*/
template <typename T>
generic_scalar_device_view(timestamp_scalar<T>& s)
: generic_scalar_device_view(s.type(), s.data(), s.validity_data())
{
}

/** @brief Construct a new generic scalar device view object from a duration scalar
*
* Forwards the scalar's type, data pointer and validity pointer; no size is recorded.
*
* @param s The duration scalar to construct from
*/
template <typename T>
generic_scalar_device_view(duration_scalar<T>& s)
: generic_scalar_device_view(s.type(), s.data(), s.validity_data())
{
}

/** @brief Construct a new generic scalar device view object from a string scalar
*
* In addition to type, data and validity, the string's size is captured so that
* `value<cudf::string_view>()` can rebuild the string.
*
* @param s The string scalar to construct from
*/
generic_scalar_device_view(string_scalar& s)
: generic_scalar_device_view(s.type(), s.data(), s.validity_data(), s.size())
{
}

protected:
void const* _data{}; ///< Pointer to device memory containing the value
size_type const _size{}; ///< Size of the string in bytes for string scalar; stays at its default for other types

/**
* @brief Construct a new fixed width scalar device view object
*
* `_size` is not set by this overload and keeps its default-initialized value.
*
* @param type The data type of the value
* @param data The pointer to the data in device memory
* @param is_valid The pointer to the bool in device memory that indicates the
* validity of the stored value
*/
generic_scalar_device_view(data_type type, void const* data, bool* is_valid)
: cudf::detail::scalar_device_view_base(type, is_valid), _data(data)
{
}

/** @brief Construct a new string scalar device view object
*
* @param type The data type of the value
* @param data The pointer to the data in device memory
* @param is_valid The pointer to the bool in device memory that indicates the
* validity of the stored value
* @param size The size of the string in bytes
*/
generic_scalar_device_view(data_type type, void const* data, bool* is_valid, size_type size)
: cudf::detail::scalar_device_view_base(type, is_valid), _data(data), _size(size)
{
}
};

/**
* @brief A literal value used in an abstract syntax tree.
*/
Expand All @@ -162,8 +252,7 @@ class literal : public expression {
* @param value A numeric scalar value
*/
template <typename T>
literal(cudf::numeric_scalar<T>& value)
: scalar(value), value(cudf::get_scalar_device_view(value))
literal(cudf::numeric_scalar<T>& value) : scalar(value), value(value)
{
}

Expand All @@ -174,8 +263,7 @@ class literal : public expression {
* @param value A timestamp scalar value
*/
template <typename T>
literal(cudf::timestamp_scalar<T>& value)
: scalar(value), value(cudf::get_scalar_device_view(value))
literal(cudf::timestamp_scalar<T>& value) : scalar(value), value(value)
{
}

Expand All @@ -186,11 +274,17 @@ class literal : public expression {
* @param value A duration scalar value
*/
template <typename T>
literal(cudf::duration_scalar<T>& value)
: scalar(value), value(cudf::get_scalar_device_view(value))
literal(cudf::duration_scalar<T>& value) : scalar(value), value(value)
{
}

/**
 * @brief Construct a new literal object from a string scalar.
 *
 * The literal keeps a reference to the scalar together with a device view built from it.
 *
 * @param value A string scalar value
 */
literal(cudf::string_scalar& value)
  : scalar(value), value(value)
{
}

/**
* @brief Get the data type.
*
Expand All @@ -203,10 +297,7 @@ class literal : public expression {
*
* @return The device scalar object
*/
[[nodiscard]] cudf::detail::fixed_width_scalar_device_view_base get_value() const
{
return value;
}
[[nodiscard]] generic_scalar_device_view get_value() const { return value; }

/**
* @brief Accepts a visitor class.
Expand Down Expand Up @@ -236,7 +327,7 @@ class literal : public expression {

private:
cudf::scalar const& scalar;
cudf::detail::fixed_width_scalar_device_view_base const value;
generic_scalar_device_view const value;
};

/**
Expand Down
36 changes: 35 additions & 1 deletion cpp/tests/ast/transform_tests.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
* Copyright (c) 2020-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -397,6 +397,40 @@ TEST_F(TransformTest, StringComparison)
CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view(), verbosity);
}

TEST_F(TransformTest, StringScalarComparison)
{
  // Single string column that is compared against the string scalar "2".
  cudf::test::strings_column_wrapper input_strings({"1", "12", "123", "23"});
  cudf::table_view input_table{{input_strings}};

  // The scalar is a named local because the AST literal is built from a reference to it.
  cudf::string_scalar threshold("2");
  cudf::ast::literal threshold_literal(threshold);

  // Expression under test: column 0 < "2".
  cudf::ast::column_reference first_column(0);
  cudf::ast::operation less_than(
    cudf::ast::ast_operator::LESS, first_column, threshold_literal);

  auto computed = cudf::compute_column(input_table, less_than);
  column_wrapper<bool> expected_mask{true, true, true, false};

  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_mask, computed->view(), verbosity);
}

TEST_F(TransformTest, NumericScalarComparison)
{
  // Single int32 column that is compared against the numeric scalar 2.
  column_wrapper<int32_t> input_values{1, 12, 123, 23};
  cudf::table_view input_table{{input_values}};

  // The scalar is a named local because the AST literal is built from a reference to it.
  cudf::numeric_scalar<int32_t> threshold(2);
  cudf::ast::literal threshold_literal(threshold);

  // Expression under test: column 0 < 2.
  cudf::ast::column_reference first_column(0);
  cudf::ast::operation less_than(
    cudf::ast::ast_operator::LESS, first_column, threshold_literal);

  auto computed = cudf::compute_column(input_table, less_than);
  column_wrapper<bool> expected_mask{true, false, false, false};

  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_mask, computed->view(), verbosity);
}

TEST_F(TransformTest, CopyColumn)
{
auto c_0 = column_wrapper<int32_t>{3, 0, 1, 50};
Expand Down
10 changes: 6 additions & 4 deletions python/cudf/cudf/_lib/expressions.pxd
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2022, NVIDIA CORPORATION.
# Copyright (c) 2022-2023, NVIDIA CORPORATION.

from libc.stdint cimport int32_t, int64_t
from libcpp.memory cimport unique_ptr
Expand All @@ -9,16 +9,18 @@ from cudf._lib.cpp.expressions cimport (
literal,
operation,
)
from cudf._lib.cpp.scalar.scalar cimport numeric_scalar
from cudf._lib.cpp.scalar.scalar cimport numeric_scalar, string_scalar

ctypedef enum scalar_type_t:
INT
DOUBLE
STRING


ctypedef union int_or_double_scalar_ptr:
ctypedef union int_or_double_or_string_scalar_ptr:
unique_ptr[numeric_scalar[int64_t]] int_ptr
unique_ptr[numeric_scalar[double]] double_ptr
unique_ptr[string_scalar] string_ptr


cdef class Expression:
Expand All @@ -27,7 +29,7 @@ cdef class Expression:

cdef class Literal(Expression):
cdef scalar_type_t c_scalar_type
cdef int_or_double_scalar_ptr c_scalar
cdef int_or_double_or_string_scalar_ptr c_scalar


cdef class ColumnReference(Expression):
Expand Down
13 changes: 12 additions & 1 deletion python/cudf/cudf/_lib/expressions.pyx
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
# Copyright (c) 2022, NVIDIA CORPORATION.
# Copyright (c) 2022-2023, NVIDIA CORPORATION.

from enum import Enum

from cython.operator cimport dereference
from libc.stdint cimport int64_t
from libcpp.memory cimport make_unique, unique_ptr
from libcpp.string cimport string

from cudf._lib.cpp cimport expressions as libcudf_exp
from cudf._lib.cpp.types cimport size_type
Expand Down Expand Up @@ -80,6 +81,7 @@ cdef class Literal(Expression):
# TODO: Would love to find a better solution than unions for literals.
cdef int intval
cdef double doubleval
cdef string stringval
vyasr marked this conversation as resolved.
Show resolved Hide resolved

if isinstance(value, int):
self.c_scalar_type = scalar_type_t.INT
Expand All @@ -99,6 +101,15 @@ cdef class Literal(Expression):
self.c_obj = <expression_ptr> make_unique[libcudf_exp.literal](
<numeric_scalar[double] &>dereference(self.c_scalar.double_ptr)
)
elif isinstance(value, str):
self.c_scalar_type = scalar_type_t.STRING
stringval = value.encode()
self.c_scalar.string_ptr = make_unique[string_scalar](
stringval, True
)
self.c_obj = <expression_ptr> make_unique[libcudf_exp.literal](
<string_scalar &>dereference(self.c_scalar.string_ptr)
)


cdef class ColumnReference(Expression):
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/_internals/expressions.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2022, NVIDIA CORPORATION.
# Copyright (c) 2022-2023, NVIDIA CORPORATION.

import ast
import functools
Expand Down Expand Up @@ -115,7 +115,7 @@ def visit_Name(self, node):
self.stack.append(ColumnReference(col_id))

def visit_Constant(self, node):
if not isinstance(node, ast.Num):
if not isinstance(node, ast.Num) and not isinstance(node, ast.Str):
karthikeyann marked this conversation as resolved.
Show resolved Hide resolved
raise ValueError(
f"Unsupported literal {repr(node.value)} of type "
"{type(node.value).__name__}"
Expand Down
3 changes: 3 additions & 0 deletions python/cudf/cudf/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -9806,6 +9806,9 @@ def df_eval():
float,
),
("a_b_are_equal = (a == b)", int),
("a > b", str),
("a < '1'", str),
('a == "1"', str),
vyasr marked this conversation as resolved.
Show resolved Hide resolved
],
)
def test_dataframe_eval(df_eval, expr, dtype):
Expand Down