From 9ecaa599b909ac848e4ae44438d836887460235a Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 20 Apr 2021 09:33:47 -0700 Subject: [PATCH 01/69] Expose ast operator and reference enums to Python. --- python/cudf/cudf/_lib/ast.pyx | 72 +++++++++++++++++++++++++++++++ python/cudf/cudf/_lib/cpp/ast.pxd | 72 +++++++++++++++++++++++++++++++ 2 files changed, 144 insertions(+) create mode 100644 python/cudf/cudf/_lib/ast.pyx create mode 100644 python/cudf/cudf/_lib/cpp/ast.pxd diff --git a/python/cudf/cudf/_lib/ast.pyx b/python/cudf/cudf/_lib/ast.pyx new file mode 100644 index 00000000000..a089d13b042 --- /dev/null +++ b/python/cudf/cudf/_lib/ast.pyx @@ -0,0 +1,72 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. + +import numpy as np +from enum import Enum + +# from libc.stdint cimport uint32_t +# from libcpp cimport bool as cbool +# from libcpp.memory cimport unique_ptr +# from libcpp.utility cimport move +# +# from cudf._lib.column cimport Column +# from cudf._lib.replace import replace_nulls +# +# from cudf._lib.cpp.labeling cimport inclusive +# from cudf._lib.cpp.labeling cimport label_bins as cpp_label_bins +# from cudf._lib.cpp.column.column cimport column +# from cudf._lib.cpp.column.column_view cimport column_view + +cimport cudf._lib.cpp.ast as libcudf_ast + + +class ASTOperator(Enum): + ADD = libcudf_ast.ast_operator.ADD + SUB = libcudf_ast.ast_operator.SUB + MUL = libcudf_ast.ast_operator.MUL + DIV = libcudf_ast.ast_operator.DIV + TRUE_DIV = libcudf_ast.ast_operator.TRUE_DIV + FLOOR_DIV = libcudf_ast.ast_operator.FLOOR_DIV + MOD = libcudf_ast.ast_operator.MOD + PYMOD = libcudf_ast.ast_operator.PYMOD + POW = libcudf_ast.ast_operator.POW + EQUAL = libcudf_ast.ast_operator.EQUAL + NOT_EQUAL = libcudf_ast.ast_operator.NOT_EQUAL + LESS = libcudf_ast.ast_operator.LESS + GREATER = libcudf_ast.ast_operator.GREATER + LESS_EQUAL = libcudf_ast.ast_operator.LESS_EQUAL + GREATER_EQUAL = libcudf_ast.ast_operator.GREATER_EQUAL + BITWISE_AND = libcudf_ast.ast_operator.BITWISE_AND + BITWISE_OR = libcudf_ast.ast_operator.BITWISE_OR + BITWISE_XOR = libcudf_ast.ast_operator.BITWISE_XOR + LOGICAL_AND = libcudf_ast.ast_operator.LOGICAL_AND + LOGICAL_OR = libcudf_ast.ast_operator.LOGICAL_OR + # Unary operators + IDENTITY = libcudf_ast.ast_operator.IDENTITY + SIN = libcudf_ast.ast_operator.SIN + COS = libcudf_ast.ast_operator.COS + TAN = libcudf_ast.ast_operator.TAN + ARCSIN = libcudf_ast.ast_operator.ARCSIN + ARCCOS = libcudf_ast.ast_operator.ARCCOS + ARCTAN = libcudf_ast.ast_operator.ARCTAN + SINH = libcudf_ast.ast_operator.SINH + COSH = libcudf_ast.ast_operator.COSH + TANH = libcudf_ast.ast_operator.TANH + ARCSINH = libcudf_ast.ast_operator.ARCSINH + ARCCOSH = libcudf_ast.ast_operator.ARCCOSH + ARCTANH = libcudf_ast.ast_operator.ARCTANH + EXP = libcudf_ast.ast_operator.EXP + LOG = libcudf_ast.ast_operator.LOG + SQRT = libcudf_ast.ast_operator.SQRT + CBRT = libcudf_ast.ast_operator.CBRT + CEIL = libcudf_ast.ast_operator.CEIL + FLOOR = libcudf_ast.ast_operator.FLOOR + ABS = libcudf_ast.ast_operator.ABS + RINT = libcudf_ast.ast_operator.RINT + BIT_INVERT = libcudf_ast.ast_operator.BIT_INVERT + NOT = libcudf_ast.ast_operator.NOT + + +class TableReference(Enum): + LEFT = libcudf_ast.table_reference.LEFT + RIGHT = libcudf_ast.table_reference.RIGHT + OUTPUT = libcudf_ast.table_reference.OUTPUT diff --git a/python/cudf/cudf/_lib/cpp/ast.pxd b/python/cudf/cudf/_lib/cpp/ast.pxd new file mode 100644 index 00000000000..a8f130a17cf --- /dev/null +++ b/python/cudf/cudf/_lib/cpp/ast.pxd @@ -0,0 +1,72 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr + +from libc.stdint cimport ( + int32_t, int64_t +) +from libcpp cimport bool +from libcpp.string cimport string + +from cudf._lib.cpp.types cimport data_type +from cudf._lib.cpp.wrappers.decimals cimport scale_type + +from cudf._lib.cpp.scalar.scalar cimport ( + numeric_scalar, + timestamp_scalar, + duration_scalar +) + +cdef extern from "cudf/ast/operators.hpp" namespace "cudf::ast" nogil: + ctypedef enum ast_operator: + # Binary operators + ADD "cudf::ast::ast_operator::ADD" + SUB "cudf::ast::ast_operator::SUB" + MUL "cudf::ast::ast_operator::MUL" + DIV "cudf::ast::ast_operator::DIV" + TRUE_DIV "cudf::ast::ast_operator::TRUE_DIV" + FLOOR_DIV "cudf::ast::ast_operator::FLOOR_DIV" + MOD "cudf::ast::ast_operator::MOD" + PYMOD "cudf::ast::ast_operator::PYMOD" + POW "cudf::ast::ast_operator::POW" + EQUAL "cudf::ast::ast_operator::EQUAL" + NOT_EQUAL "cudf::ast::ast_operator::NOT_EQUAL" + LESS "cudf::ast::ast_operator::LESS" + GREATER "cudf::ast::ast_operator::GREATER" + LESS_EQUAL "cudf::ast::ast_operator::LESS_EQUAL" + GREATER_EQUAL "cudf::ast::ast_operator::GREATER_EQUAL" + BITWISE_AND "cudf::ast::ast_operator::BITWISE_AND" + BITWISE_OR "cudf::ast::ast_operator::BITWISE_OR" + BITWISE_XOR "cudf::ast::ast_operator::BITWISE_XOR" + LOGICAL_AND "cudf::ast::ast_operator::LOGICAL_AND" + LOGICAL_OR "cudf::ast::ast_operator::LOGICAL_OR" + # Unary operators + IDENTITY "cudf::ast::ast_operator::IDENTITY" + SIN "cudf::ast::ast_operator::SIN" + COS "cudf::ast::ast_operator::COS" + TAN "cudf::ast::ast_operator::TAN" + ARCSIN "cudf::ast::ast_operator::ARCSIN" + ARCCOS "cudf::ast::ast_operator::ARCCOS" + ARCTAN "cudf::ast::ast_operator::ARCTAN" + SINH "cudf::ast::ast_operator::SINH" + COSH "cudf::ast::ast_operator::COSH" + TANH "cudf::ast::ast_operator::TANH" + ARCSINH "cudf::ast::ast_operator::ARCSINH" + ARCCOSH "cudf::ast::ast_operator::ARCCOSH" + ARCTANH "cudf::ast::ast_operator::ARCTANH" + EXP "cudf::ast::ast_operator::EXP" + LOG "cudf::ast::ast_operator::LOG" + SQRT "cudf::ast::ast_operator::SQRT" + CBRT "cudf::ast::ast_operator::CBRT" + CEIL "cudf::ast::ast_operator::CEIL" + FLOOR "cudf::ast::ast_operator::FLOOR" + ABS "cudf::ast::ast_operator::ABS" + RINT "cudf::ast::ast_operator::RINT" + BIT_INVERT "cudf::ast::ast_operator::BIT_INVERT" + NOT "cudf::ast::ast_operator::NOT" + +cdef extern from "cudf/ast/linearizer.hpp" namespace "cudf::ast" nogil: + ctypedef enum table_reference: + LEFT "cudf::ast::table_reference::LEFT" + RIGHT "cudf::ast::table_reference::RIGHT" + OUTPUT "cudf::ast::table_reference::OUTPUT" From 8ad2866fa90a48582004b4bf7c42f12e2989fef8 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 20 Apr 2021 11:25:42 -0700 Subject: [PATCH 02/69] Expose limited usage of literal to Python. --- python/cudf/cudf/_lib/ast.pyx | 23 +++++++++++------------ python/cudf/cudf/_lib/cpp/ast.pxd | 7 +++++++ 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/python/cudf/cudf/_lib/ast.pyx b/python/cudf/cudf/_lib/ast.pyx index a089d13b042..504725d6cc5 100644 --- a/python/cudf/cudf/_lib/ast.pyx +++ b/python/cudf/cudf/_lib/ast.pyx @@ -3,18 +3,7 @@ import numpy as np from enum import Enum -# from libc.stdint cimport uint32_t -# from libcpp cimport bool as cbool -# from libcpp.memory cimport unique_ptr -# from libcpp.utility cimport move -# -# from cudf._lib.column cimport Column -# from cudf._lib.replace import replace_nulls -# -# from cudf._lib.cpp.labeling cimport inclusive -# from cudf._lib.cpp.labeling cimport label_bins as cpp_label_bins -# from cudf._lib.cpp.column.column cimport column -# from cudf._lib.cpp.column.column_view cimport column_view +from cython.operator cimport dereference cimport cudf._lib.cpp.ast as libcudf_ast @@ -70,3 +59,13 @@ class TableReference(Enum): LEFT = libcudf_ast.table_reference.LEFT RIGHT = libcudf_ast.table_reference.RIGHT OUTPUT = libcudf_ast.table_reference.OUTPUT + + +cdef class Literal: + def __cinit__(self, value): + self.c_scalar = new numeric_scalar[float](value, True) + self.c_literal = new libcudf_ast.literal( + dereference(self.c_scalar)) + + def __dealloc__(self): + del self.c_literal diff --git a/python/cudf/cudf/_lib/cpp/ast.pxd b/python/cudf/cudf/_lib/cpp/ast.pxd index a8f130a17cf..d3859c654e4 100644 --- a/python/cudf/cudf/_lib/cpp/ast.pxd +++ b/python/cudf/cudf/_lib/cpp/ast.pxd @@ -70,3 +70,10 @@ cdef extern from "cudf/ast/linearizer.hpp" namespace "cudf::ast" nogil: LEFT "cudf::ast::table_reference::LEFT" RIGHT "cudf::ast::table_reference::RIGHT" OUTPUT "cudf::ast::table_reference::OUTPUT" + + cdef cppclass literal: + # Due to https://github.com/cython/cython/issues/3198, we need to + # specify a return type for templated constructors. + literal literal[T](numeric_scalar[T] &) + literal literal[T](timestamp_scalar[T] &) + literal literal[T](duration_scalar[T] &) From cdba8e9784f0318ed8bc08d57536eef7a6fd6030 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 20 Apr 2021 11:42:28 -0700 Subject: [PATCH 03/69] Add column reference to Cython and make all constructors pass through exceptions. --- python/cudf/cudf/_lib/ast.pyx | 10 ++++++++++ python/cudf/cudf/_lib/cpp/ast.pxd | 14 ++++++++++---- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/_lib/ast.pyx b/python/cudf/cudf/_lib/ast.pyx index 504725d6cc5..9cea96b5145 100644 --- a/python/cudf/cudf/_lib/ast.pyx +++ b/python/cudf/cudf/_lib/ast.pyx @@ -4,6 +4,7 @@ import numpy as np from enum import Enum from cython.operator cimport dereference +from cudf._lib.cpp.types cimport size_type cimport cudf._lib.cpp.ast as libcudf_ast @@ -69,3 +70,12 @@ cdef class Literal: def __dealloc__(self): del self.c_literal + + +cdef class ColumnReference: + def __cinit__(self, index): + cdef size_type idx = index + self.c_column_reference = new libcudf_ast.column_reference(idx) + + def __dealloc__(self): + del self.c_column_reference diff --git a/python/cudf/cudf/_lib/cpp/ast.pxd b/python/cudf/cudf/_lib/cpp/ast.pxd index d3859c654e4..144ca8d4d41 100644 --- a/python/cudf/cudf/_lib/cpp/ast.pxd +++ b/python/cudf/cudf/_lib/cpp/ast.pxd @@ -8,7 +8,7 @@ from libc.stdint cimport ( from libcpp cimport bool from libcpp.string cimport string -from cudf._lib.cpp.types cimport data_type +from cudf._lib.cpp.types cimport data_type, size_type from cudf._lib.cpp.wrappers.decimals cimport scale_type from cudf._lib.cpp.scalar.scalar cimport ( @@ -74,6 +74,12 @@ cdef extern from "cudf/ast/linearizer.hpp" namespace "cudf::ast" nogil: cdef cppclass literal: # Due to https://github.com/cython/cython/issues/3198, we need to # specify a return type for templated constructors. - literal literal[T](numeric_scalar[T] &) - literal literal[T](timestamp_scalar[T] &) - literal literal[T](duration_scalar[T] &) + literal literal[T](numeric_scalar[T] &) except + + literal literal[T](timestamp_scalar[T] &) except + + literal literal[T](duration_scalar[T] &) except + + + cdef cppclass column_reference: + # Allow for default C++ parameters by declaring multiple constructors + # with the default parameters optionally omitted. + column_reference(size_type) except + + column_reference(size_type, table_reference) except + From dfd9e256e8b01029741b4c5a6829a6024fdb2f84 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 20 Apr 2021 11:46:34 -0700 Subject: [PATCH 04/69] Switch raw pointers to unique pointers and remove manual deallocation. --- python/cudf/cudf/_lib/ast.pyx | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/_lib/ast.pyx b/python/cudf/cudf/_lib/ast.pyx index 9cea96b5145..3e5db7c12f2 100644 --- a/python/cudf/cudf/_lib/ast.pyx +++ b/python/cudf/cudf/_lib/ast.pyx @@ -5,6 +5,7 @@ from enum import Enum from cython.operator cimport dereference from cudf._lib.cpp.types cimport size_type +from libcpp.memory cimport make_unique cimport cudf._lib.cpp.ast as libcudf_ast @@ -64,18 +65,14 @@ class TableReference(Enum): cdef class Literal: def __cinit__(self, value): - self.c_scalar = new numeric_scalar[float](value, True) - self.c_literal = new libcudf_ast.literal( + cdef float val = value + self.c_scalar = make_unique[numeric_scalar[float]](val, True) + self.c_literal = make_unique[libcudf_ast.literal]( dereference(self.c_scalar)) - def __dealloc__(self): - del self.c_literal - cdef class ColumnReference: def __cinit__(self, index): cdef size_type idx = index - self.c_column_reference = new libcudf_ast.column_reference(idx) - - def __dealloc__(self): - del self.c_column_reference + self.c_column_reference = make_unique[libcudf_ast.column_reference]( + idx) From 7fc023e6e7be294087b1a10968247b19a23f9f7e Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 20 Apr 2021 14:22:26 -0700 Subject: [PATCH 05/69] Switch unique pointers to shared pointers and add expression to Cython, using the shared pointer as an entrypoint for passing the underlying object through to C++. --- cpp/include/cudf/ast/expressions.hpp | 4 ++- python/cudf/cudf/_lib/ast.pxd | 34 ++++++++++++++++++++++ python/cudf/cudf/_lib/ast.pyx | 43 ++++++++++++++++++++++++---- python/cudf/cudf/_lib/cpp/ast.pxd | 13 +++++++-- 4 files changed, 85 insertions(+), 9 deletions(-) create mode 100644 python/cudf/cudf/_lib/ast.pxd diff --git a/cpp/include/cudf/ast/expressions.hpp b/cpp/include/cudf/ast/expressions.hpp index eb98e0e0bee..5a176ebacf3 100644 --- a/cpp/include/cudf/ast/expressions.hpp +++ b/cpp/include/cudf/ast/expressions.hpp @@ -21,6 +21,8 @@ #include #include +#include + namespace cudf { namespace ast { @@ -53,7 +55,7 @@ struct expression { /** * @brief Enum of supported operators. */ -enum class ast_operator { +enum class ast_operator : int32_t { // Binary operators ADD, ///< operator + SUB, ///< operator - diff --git a/python/cudf/cudf/_lib/ast.pxd b/python/cudf/cudf/_lib/ast.pxd new file mode 100644 index 00000000000..fbc6a214d50 --- /dev/null +++ b/python/cudf/cudf/_lib/ast.pxd @@ -0,0 +1,34 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. + +from libcpp.memory cimport shared_ptr + +from libc.stdint cimport int32_t +from cudf._lib.cpp.scalar.scalar cimport numeric_scalar +from cudf._lib.cpp.ast cimport literal, column_reference, expression, node + + +# Since Cython <3 doesn't support scoped enumerations but attempts to work with +# the underlying value of an enum, typedefing this to cast seems unavoidable. +ctypedef int32_t underlying_type_ast_operator + + +cdef class Node: + # Using a shared pointer here to standardize getting the underlying raw + # pointer for passing to C++ across all subclasses. + # TODO: Consider making this a weak pointer. Probably not worth the extra + # effort though since these classes are part of a hierarchy anyway. + cdef shared_ptr[node] c_node + cdef node * _get_ptr(self) + + +cdef class Literal(Node): + cdef shared_ptr[numeric_scalar[float]] c_scalar + cdef shared_ptr[literal] c_obj + + +cdef class ColumnReference(Node): + cdef shared_ptr[column_reference] c_obj + + +cdef class Expression(Node): + cdef shared_ptr[expression] c_obj diff --git a/python/cudf/cudf/_lib/ast.pyx b/python/cudf/cudf/_lib/ast.pyx index 3e5db7c12f2..c817fc6ac50 100644 --- a/python/cudf/cudf/_lib/ast.pyx +++ b/python/cudf/cudf/_lib/ast.pyx @@ -5,7 +5,8 @@ from enum import Enum from cython.operator cimport dereference from cudf._lib.cpp.types cimport size_type -from libcpp.memory cimport make_unique +from libcpp.memory cimport make_shared, shared_ptr +from cudf._lib.ast cimport underlying_type_ast_operator cimport cudf._lib.cpp.ast as libcudf_ast @@ -63,16 +64,46 @@ class TableReference(Enum): OUTPUT = libcudf_ast.table_reference.OUTPUT -cdef class Literal: +cdef class Node: + cdef libcudf_ast.node * _get_ptr(self): + return self.c_node.get() + + +cdef class Literal(Node): def __cinit__(self, value): cdef float val = value - self.c_scalar = make_unique[numeric_scalar[float]](val, True) - self.c_literal = make_unique[libcudf_ast.literal]( + self.c_scalar = make_shared[numeric_scalar[float]](val, True) + self.c_obj = make_shared[libcudf_ast.literal]( dereference(self.c_scalar)) + self.c_node = self.c_obj -cdef class ColumnReference: +cdef class ColumnReference(Node): def __cinit__(self, index): cdef size_type idx = index - self.c_column_reference = make_unique[libcudf_ast.column_reference]( + self.c_obj = make_shared[libcudf_ast.column_reference]( idx) + self.c_node = self.c_obj + + +cdef class Expression(Node): + def __cinit__(self, op, Node left, Node right=None): + # This awkward double casting appears to be the only way to get Cython + # to generate valid C++ that doesn't try to apply the shift operator + # directly to values of the enum (which is invalid). + cdef libcudf_ast.ast_operator op_value = ( + op.value) + + # left and right are either Expression, ColumnReference, or Literal + cdef const libcudf_ast.node *c_left = left._get_ptr() + cdef const libcudf_ast.node *c_right = right._get_ptr() + + if right is None: + self.c_obj = make_shared[libcudf_ast.expression]( + op_value, dereference(c_left)) + else: + self.c_obj = make_shared[libcudf_ast.expression]( + op_value, dereference(c_left), + dereference(c_right)) + + self.c_node = self.c_obj diff --git a/python/cudf/cudf/_lib/cpp/ast.pxd b/python/cudf/cudf/_lib/cpp/ast.pxd index 144ca8d4d41..47b6177fb3e 100644 --- a/python/cudf/cudf/_lib/cpp/ast.pxd +++ b/python/cudf/cudf/_lib/cpp/ast.pxd @@ -65,21 +65,30 @@ cdef extern from "cudf/ast/operators.hpp" namespace "cudf::ast" nogil: BIT_INVERT "cudf::ast::ast_operator::BIT_INVERT" NOT "cudf::ast::ast_operator::NOT" +cdef extern from "cudf/ast/detail/linearizer.hpp" \ + namespace "cudf::ast::detail" nogil: + cdef cppclass node: + pass + cdef extern from "cudf/ast/linearizer.hpp" namespace "cudf::ast" nogil: ctypedef enum table_reference: LEFT "cudf::ast::table_reference::LEFT" RIGHT "cudf::ast::table_reference::RIGHT" OUTPUT "cudf::ast::table_reference::OUTPUT" - cdef cppclass literal: + cdef cppclass literal(node): # Due to https://github.com/cython/cython/issues/3198, we need to # specify a return type for templated constructors. literal literal[T](numeric_scalar[T] &) except + literal literal[T](timestamp_scalar[T] &) except + literal literal[T](duration_scalar[T] &) except + - cdef cppclass column_reference: + cdef cppclass column_reference(node): # Allow for default C++ parameters by declaring multiple constructors # with the default parameters optionally omitted. column_reference(size_type) except + column_reference(size_type, table_reference) except + + + cdef cppclass expression(node): + expression(ast_operator, const node &) + expression(ast_operator, const node&, const node&) From 0e716e04e079394932e6314b01e66100c2585ee2 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 20 Apr 2021 14:47:00 -0700 Subject: [PATCH 06/69] Export compute_column to Python. --- python/cudf/cudf/_lib/ast.pyx | 1 + python/cudf/cudf/_lib/cpp/ast.pxd | 18 ++++++++++-------- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/python/cudf/cudf/_lib/ast.pyx b/python/cudf/cudf/_lib/ast.pyx index c817fc6ac50..2c792a860ea 100644 --- a/python/cudf/cudf/_lib/ast.pyx +++ b/python/cudf/cudf/_lib/ast.pyx @@ -71,6 +71,7 @@ cdef class Node: cdef class Literal(Node): def __cinit__(self, value): + # TODO: Generalize this to other types of literals. cdef float val = value self.c_scalar = make_shared[numeric_scalar[float]](val, True) self.c_obj = make_shared[libcudf_ast.literal]( diff --git a/python/cudf/cudf/_lib/cpp/ast.pxd b/python/cudf/cudf/_lib/cpp/ast.pxd index 47b6177fb3e..79ba2927970 100644 --- a/python/cudf/cudf/_lib/cpp/ast.pxd +++ b/python/cudf/cudf/_lib/cpp/ast.pxd @@ -2,20 +2,16 @@ from libcpp.memory cimport unique_ptr -from libc.stdint cimport ( - int32_t, int64_t -) -from libcpp cimport bool -from libcpp.string cimport string - -from cudf._lib.cpp.types cimport data_type, size_type -from cudf._lib.cpp.wrappers.decimals cimport scale_type +from cudf._lib.cpp.types cimport size_type from cudf._lib.cpp.scalar.scalar cimport ( numeric_scalar, timestamp_scalar, duration_scalar ) +from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.table.table_view cimport table_view + cdef extern from "cudf/ast/operators.hpp" namespace "cudf::ast" nogil: ctypedef enum ast_operator: @@ -92,3 +88,9 @@ cdef extern from "cudf/ast/linearizer.hpp" namespace "cudf::ast" nogil: cdef cppclass expression(node): expression(ast_operator, const node &) expression(ast_operator, const node&, const node&) + +cdef extern from "cudf/ast/transform.hpp" namespace "cudf::ast" nogil: + cdef unique_ptr[column] compute_column( + const table_view table, + const expression &expr + ) From f5db47f0242530c1cb03d1671473f0c87538c607 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 20 Apr 2021 14:51:01 -0700 Subject: [PATCH 07/69] Minor Cython cleanup. --- python/cudf/cudf/_lib/ast.pyx | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/_lib/ast.pyx b/python/cudf/cudf/_lib/ast.pyx index 2c792a860ea..e51a388296b 100644 --- a/python/cudf/cudf/_lib/ast.pyx +++ b/python/cudf/cudf/_lib/ast.pyx @@ -1,6 +1,5 @@ # Copyright (c) 2021, NVIDIA CORPORATION. -import numpy as np from enum import Enum from cython.operator cimport dereference @@ -82,8 +81,7 @@ cdef class Literal(Node): cdef class ColumnReference(Node): def __cinit__(self, index): cdef size_type idx = index - self.c_obj = make_shared[libcudf_ast.column_reference]( - idx) + self.c_obj = make_shared[libcudf_ast.column_reference](idx) self.c_node = self.c_obj @@ -95,16 +93,16 @@ cdef class Expression(Node): cdef libcudf_ast.ast_operator op_value = ( op.value) - # left and right are either Expression, ColumnReference, or Literal - cdef const libcudf_ast.node *c_left = left._get_ptr() - cdef const libcudf_ast.node *c_right = right._get_ptr() - if right is None: self.c_obj = make_shared[libcudf_ast.expression]( - op_value, dereference(c_left)) + op_value, + dereference(left._get_ptr()) + ) else: self.c_obj = make_shared[libcudf_ast.expression]( - op_value, dereference(c_left), - dereference(c_right)) + op_value, + dereference(left._get_ptr()), + dereference(right._get_ptr()) + ) self.c_node = self.c_obj From 98049b97d55a7b91fba24e3fa642245eea5cde4f Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 20 Apr 2021 16:01:33 -0700 Subject: [PATCH 08/69] Add basic Python API. --- python/cudf/cudf/_lib/ast.pyx | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/ast.pyx b/python/cudf/cudf/_lib/ast.pyx index e51a388296b..c31c9cf25f4 100644 --- a/python/cudf/cudf/_lib/ast.pyx +++ b/python/cudf/cudf/_lib/ast.pyx @@ -2,10 +2,17 @@ from enum import Enum +from cudf.core.column_accessor import ColumnAccessor +from cudf.core.dataframe import DataFrame + from cython.operator cimport dereference from cudf._lib.cpp.types cimport size_type -from libcpp.memory cimport make_shared, shared_ptr +from libcpp.memory cimport make_shared, shared_ptr, unique_ptr +from libcpp.utility cimport move from cudf._lib.ast cimport underlying_type_ast_operator +from cudf._lib.column cimport Column +from cudf._lib.cpp.column.column cimport column +from cudf._lib.table cimport Table cimport cudf._lib.cpp.ast as libcudf_ast @@ -106,3 +113,18 @@ cdef class Expression(Node): ) self.c_node = self.c_obj + + +cdef evaluate_expression_internal(Table values, Expression expr): + result_data = ColumnAccessor() + cdef unique_ptr[column] col = libcudf_ast.compute_column( + values.view(), + dereference(expr.c_obj.get()) + ) + result_data['result'] = Column.from_unique_ptr(move(col)) + result_table = Table(data=result_data) + return DataFrame._from_table(result_table) + + +def evaluate_expression(df, expr): + return evaluate_expression_internal(df, expr) From ea8d39e7b13c0f7a131bd396f83587230000a88a Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 21 Apr 2021 10:43:14 -0700 Subject: [PATCH 09/69] Add Python AST parsing functionality to simplify expression construction. --- python/cudf/cudf/_lib/ast.pyx | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/python/cudf/cudf/_lib/ast.pyx b/python/cudf/cudf/_lib/ast.pyx index c31c9cf25f4..cf60638cc95 100644 --- a/python/cudf/cudf/_lib/ast.pyx +++ b/python/cudf/cudf/_lib/ast.pyx @@ -1,6 +1,7 @@ # Copyright (c) 2021, NVIDIA CORPORATION. from enum import Enum +import ast from cudf.core.column_accessor import ColumnAccessor from cudf.core.dataframe import DataFrame @@ -128,3 +129,32 @@ cdef evaluate_expression_internal(Table values, Expression expr): def evaluate_expression(df, expr): return evaluate_expression_internal(df, expr) + + +cpdef ast_visit(node, df, list stack, list temporaries): + # Base cases: Name + if isinstance(node, ast.Name): + stack.append(ColumnReference(df.columns.get_loc(node.id) + 1)) + else: + for field, value in ast.iter_fields(node): + if isinstance(value, ast.UnaryOp): + # TODO: Make actual map, for now just replacing with IDENTITy + # Should be mapping value.op -> ASTOperator.OPERATOR + ast_visit(value.operand, df, stack, temporaries) + temporaries.append(stack.pop()) + stack.append(Expression(ASTOperator.IDENTITY, temporaries[-1])) + elif isinstance(value, ast.BinOp): + # TODO: Make actual map, for now just replacing with + + # pass + ast_visit(value.left, df, stack, temporaries) + ast_visit(value.right, df, stack, temporaries) + temporaries.append(stack.pop()) + temporaries.append(stack.pop()) + stack.append(Expression( + ASTOperator.ADD, temporaries[-1], temporaries[-2])) + elif isinstance(value, list): + for item in value: + if isinstance(item, ast.AST): + ast_visit(item, df, stack, temporaries) + elif isinstance(value, ast.AST): + ast_visit(value, df, stack, temporaries) From d75b5e9cb30ef37d76e7eaadd9f1b1d0009cecce Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 21 Apr 2021 14:04:43 -0700 Subject: [PATCH 10/69] Add map for proper operator mapping from Python AST. --- python/cudf/cudf/_lib/ast.pyx | 60 ++++++++++++++++++++++++++++--- python/cudf/cudf/_lib/cpp/ast.pxd | 2 +- 2 files changed, 56 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/_lib/ast.pyx b/python/cudf/cudf/_lib/ast.pyx index cf60638cc95..5e01c425a41 100644 --- a/python/cudf/cudf/_lib/ast.pyx +++ b/python/cudf/cudf/_lib/ast.pyx @@ -131,6 +131,58 @@ def evaluate_expression(df, expr): return evaluate_expression_internal(df, expr) +# This dictionary encodes the mapping from Python AST operators to their cudf +# counterparts. +python_cudf_ast_map = { + # TODO: Mapping TBD for commented out operators. + # Binary operators + ast.Add: ASTOperator.ADD, + ast.Sub: ASTOperator.SUB, + ast.Mult: ASTOperator.MUL, + ast.Div: ASTOperator.DIV, + # ast.True: ASTOperator.TRUE_DIV, + ast.FloorDiv: ASTOperator.FLOOR_DIV, + ast.Mod: ASTOperator.PYMOD, + # ast.Pymod: ASTOperator.PYMOD, + ast.Pow: ASTOperator.POW, + ast.Eq: ASTOperator.EQUAL, + ast.NotEq: ASTOperator.NOT_EQUAL, + ast.Lt: ASTOperator.LESS, + ast.Gt: ASTOperator.GREATER, + ast.LtE: ASTOperator.LESS_EQUAL, + ast.GtE: ASTOperator.GREATER_EQUAL, + ast.BitAnd: ASTOperator.BITWISE_AND, + ast.BitOr: ASTOperator.BITWISE_OR, + ast.BitXor: ASTOperator.BITWISE_XOR, + ast.And: ASTOperator.LOGICAL_AND, + ast.Or: ASTOperator.LOGICAL_OR, + # Unary operators + # ast.Identity: ASTOperator.IDENTITY, + # ast.Sin: ASTOperator.SIN, + # ast.Cos: ASTOperator.COS, + # ast.Tan: ASTOperator.TAN, + # ast.Arcsin: ASTOperator.ARCSIN, + # ast.Arccos: ASTOperator.ARCCOS, + # ast.Arctan: ASTOperator.ARCTAN, + # ast.Sinh: ASTOperator.SINH, + # ast.Cosh: ASTOperator.COSH, + # ast.Tanh: ASTOperator.TANH, + # ast.Arcsinh: ASTOperator.ARCSINH, + # ast.Arccosh: ASTOperator.ARCCOSH, + # ast.Arctanh: ASTOperator.ARCTANH, + # ast.Exp: ASTOperator.EXP, + # ast.Log: ASTOperator.LOG, + # ast.Sqrt: ASTOperator.SQRT, + # ast.Cbrt: ASTOperator.CBRT, + # ast.Ceil: ASTOperator.CEIL, + # ast.Floor: ASTOperator.FLOOR, + # ast.Abs: ASTOperator.ABS, + # ast.Rint: ASTOperator.RINT, + # ast.Bit: ASTOperator.BIT_INVERT, + # ast.Not: ASTOperator.NOT, +} + + cpdef ast_visit(node, df, list stack, list temporaries): # Base cases: Name if isinstance(node, ast.Name): @@ -138,18 +190,16 @@ cpdef ast_visit(node, df, list stack, list temporaries): else: for field, value in ast.iter_fields(node): if isinstance(value, ast.UnaryOp): - # TODO: Make actual map, for now just replacing with IDENTITy - # Should be mapping value.op -> ASTOperator.OPERATOR ast_visit(value.operand, df, stack, temporaries) + op = python_cudf_ast_map[type(value.op)] temporaries.append(stack.pop()) - stack.append(Expression(ASTOperator.IDENTITY, temporaries[-1])) + stack.append(Expression(op, temporaries[-1])) elif isinstance(value, ast.BinOp): - # TODO: Make actual map, for now just replacing with + - # pass ast_visit(value.left, df, stack, temporaries) ast_visit(value.right, df, stack, temporaries) temporaries.append(stack.pop()) temporaries.append(stack.pop()) + op = python_cudf_ast_map[type(value.op)] stack.append(Expression( ASTOperator.ADD, temporaries[-1], temporaries[-2])) elif isinstance(value, list): diff --git a/python/cudf/cudf/_lib/cpp/ast.pxd b/python/cudf/cudf/_lib/cpp/ast.pxd index 79ba2927970..479f42e8df1 100644 --- a/python/cudf/cudf/_lib/cpp/ast.pxd +++ b/python/cudf/cudf/_lib/cpp/ast.pxd @@ -66,7 +66,7 @@ cdef extern from "cudf/ast/detail/linearizer.hpp" \ cdef cppclass node: pass -cdef extern from "cudf/ast/linearizer.hpp" namespace "cudf::ast" nogil: +cdef extern from "cudf/ast/nodes.hpp" namespace "cudf::ast" nogil: ctypedef enum table_reference: LEFT "cudf::ast::table_reference::LEFT" RIGHT "cudf::ast::table_reference::RIGHT" From ab935a60aebed514c08c026f35a648a2eb59de59 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 21 Apr 2021 14:06:34 -0700 Subject: [PATCH 11/69] Remove unnecessary push/pop ops and add beginnings of new API. --- python/cudf/cudf/_lib/ast.pyx | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/_lib/ast.pyx b/python/cudf/cudf/_lib/ast.pyx index 5e01c425a41..9506696d8ed 100644 --- a/python/cudf/cudf/_lib/ast.pyx +++ b/python/cudf/cudf/_lib/ast.pyx @@ -186,25 +186,33 @@ python_cudf_ast_map = { cpdef ast_visit(node, df, list stack, list temporaries): # Base cases: Name if isinstance(node, ast.Name): - stack.append(ColumnReference(df.columns.get_loc(node.id) + 1)) + temporaries.append(ColumnReference(df.columns.get_loc(node.id) + 1)) else: for field, value in ast.iter_fields(node): if isinstance(value, ast.UnaryOp): ast_visit(value.operand, df, stack, temporaries) op = python_cudf_ast_map[type(value.op)] - temporaries.append(stack.pop()) stack.append(Expression(op, temporaries[-1])) elif isinstance(value, ast.BinOp): ast_visit(value.left, df, stack, temporaries) ast_visit(value.right, df, stack, temporaries) - temporaries.append(stack.pop()) - temporaries.append(stack.pop()) op = python_cudf_ast_map[type(value.op)] stack.append(Expression( - ASTOperator.ADD, temporaries[-1], temporaries[-2])) + ASTOperator.ADD, temporaries[-2], temporaries[-1])) elif isinstance(value, list): for item in value: if isinstance(item, ast.AST): ast_visit(item, df, stack, temporaries) elif isinstance(value, ast.AST): ast_visit(value, df, stack, temporaries) + + +def make_and_evaluate_expression(expr, df): + """Create a cudf evaluable expression from a string and evaluate it.""" + # Important: both make and evaluate must be coupled to guarantee that the + # temporaries created (the owning ColumnReferences and Literals) remain in + # scope. + stack = [] + temporaries = [] + ast_visit(ast.parse(expr), df, stack, temporaries) + return evaluate_expression(df, stack[-1]) From 423d47c461205bb11afef25bc9bc84ff93471191 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 21 Apr 2021 14:28:36 -0700 Subject: [PATCH 12/69] Use column names directly rather than passing the df through. --- python/cudf/cudf/_lib/ast.pyx | 42 ++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 16 deletions(-) diff --git a/python/cudf/cudf/_lib/ast.pyx b/python/cudf/cudf/_lib/ast.pyx index 9506696d8ed..03fce2418af 100644 --- a/python/cudf/cudf/_lib/ast.pyx +++ b/python/cudf/cudf/_lib/ast.pyx @@ -183,36 +183,46 @@ python_cudf_ast_map = { } -cpdef ast_visit(node, df, list stack, list temporaries): +cdef ast_visit(node, list col_names, list expressions, list column_references): # Base cases: Name if isinstance(node, ast.Name): - temporaries.append(ColumnReference(df.columns.get_loc(node.id) + 1)) + column_references.append(ColumnReference(col_names.index(node.id))) else: for field, value in ast.iter_fields(node): if isinstance(value, ast.UnaryOp): - ast_visit(value.operand, df, stack, temporaries) + ast_visit(value.operand, col_names, expressions, + column_references) op = python_cudf_ast_map[type(value.op)] - stack.append(Expression(op, temporaries[-1])) + expressions.append(Expression(op, column_references[-1])) elif isinstance(value, ast.BinOp): - ast_visit(value.left, df, stack, temporaries) - ast_visit(value.right, df, stack, temporaries) + ast_visit(value.left, col_names, expressions, + column_references) + ast_visit(value.right, col_names, expressions, + column_references) op = python_cudf_ast_map[type(value.op)] - stack.append(Expression( - ASTOperator.ADD, temporaries[-2], temporaries[-1])) + expressions.append(Expression( + ASTOperator.ADD, column_references[-2], + column_references[-1])) elif isinstance(value, list): for item in value: if isinstance(item, ast.AST): - ast_visit(item, df, stack, temporaries) + ast_visit(item, col_names, expressions, + column_references) elif isinstance(value, ast.AST): - ast_visit(value, df, stack, temporaries) + ast_visit(value, col_names, expressions, column_references) + + +def ast_visit_external(node, df, list expressions, list column_references): + ast_visit(node, df.columns.tolist(), expressions, column_references) def make_and_evaluate_expression(expr, df): """Create a cudf evaluable expression from a string and evaluate it.""" # Important: both make and evaluate must be coupled to guarantee that the - # temporaries created (the owning ColumnReferences and Literals) remain in - # scope. - stack = [] - temporaries = [] - ast_visit(ast.parse(expr), df, stack, temporaries) - return evaluate_expression(df, stack[-1]) + # column_references created (the owning ColumnReferences and Literals) + # remain in scope. + expressions = [] + column_references = [] + ast_visit(ast.parse(expr), df.columns.tolist(), expressions, + column_references) + return evaluate_expression(df, expressions[-1]) From 163b1fc2452e74f8ea8a07887ed68eb7337ab79f Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 21 Apr 2021 15:00:56 -0700 Subject: [PATCH 13/69] Add better API and implement a non-recursive version of the tree-parsing to test performance. --- python/cudf/cudf/_lib/ast.pyx | 53 +++++++++++++++++++++++++++++++++-- 1 file changed, 51 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/_lib/ast.pyx b/python/cudf/cudf/_lib/ast.pyx index 03fce2418af..b6a9c133fb9 100644 --- a/python/cudf/cudf/_lib/ast.pyx +++ b/python/cudf/cudf/_lib/ast.pyx @@ -188,7 +188,7 @@ cdef ast_visit(node, list col_names, list expressions, list column_references): if isinstance(node, ast.Name): column_references.append(ColumnReference(col_names.index(node.id))) else: - for field, value in ast.iter_fields(node): + for value in ast.iter_child_nodes(node): if isinstance(value, ast.UnaryOp): ast_visit(value.operand, col_names, expressions, column_references) @@ -201,7 +201,7 @@ cdef ast_visit(node, list col_names, list expressions, list column_references): column_references) op = python_cudf_ast_map[type(value.op)] expressions.append(Expression( - ASTOperator.ADD, column_references[-2], + op, column_references[-2], column_references[-1])) elif isinstance(value, list): for item in value: @@ -212,6 +212,43 @@ cdef ast_visit(node, list col_names, list expressions, list column_references): ast_visit(value, col_names, expressions, column_references) +cdef ast_visit_manual_stack(node, list col_names, list expressions, list + column_references): + """Perform a non-recursive tree traversal using a manual approach.""" + stack = list(ast.iter_child_nodes(node)) + while stack: + node = stack.pop() + # Base cases: Name + if isinstance(node, ast.Name): + column_references.append(ColumnReference(col_names.index(node.id))) + else: + # This bit is tricky. I need to parse the operand, then come + # back to create the unary op once the operand has been + # translated. Same applies for binary ops. To handle this, I + # need to push back a sentinel of sorts that can be used to + # trigger the new behavior _after_ the operands have been + # parsed. + if isinstance(node, ast.UnaryOp): + stack.append(node.op) + stack.append(node.operand) + elif isinstance(node, ast.unaryop): + op = python_cudf_ast_map[type(node)] + expressions.append(Expression(op, column_references[-1])) + elif isinstance(node, ast.BinOp): + stack.append(node.op) + stack.append(node.left) + stack.append(node.right) + elif isinstance(node, (ast.operator, ast.cmpop, ast.boolop)): + op = python_cudf_ast_map[type(node)] + expressions.append(Expression( + op, column_references[-2], + column_references[-1])) + elif isinstance(node, list): + stack.extend(node) + elif isinstance(node, ast.AST): + stack.extend(ast.iter_child_nodes(node)) + + def ast_visit_external(node, df, list expressions, list column_references): ast_visit(node, df.columns.tolist(), expressions, column_references) @@ -226,3 +263,15 @@ def make_and_evaluate_expression(expr, df): ast_visit(ast.parse(expr), df.columns.tolist(), expressions, column_references) return evaluate_expression(df, expressions[-1]) + + +def make_and_evaluate_expression_nostack(expr, df): + """Create a cudf evaluable expression from a string and evaluate it.""" + # Important: both make and evaluate must be coupled to guarantee that the + # column_references created (the owning ColumnReferences and Literals) + # remain in scope. + expressions = [] + column_references = [] + ast_visit_manual_stack(ast.parse(expr), df.columns.tolist(), expressions, + column_references) + return evaluate_expression(df, expressions[-1]) From eb26e86c30bce11631d7004e18b2921556ae4bf8 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 22 Apr 2021 08:39:42 -0700 Subject: [PATCH 14/69] Remove extra list being passed around. --- python/cudf/cudf/_lib/ast.pyx | 70 ++++++++++++++++------------------- 1 file changed, 31 insertions(+), 39 deletions(-) diff --git a/python/cudf/cudf/_lib/ast.pyx b/python/cudf/cudf/_lib/ast.pyx index b6a9c133fb9..ffb66f2c070 100644 --- a/python/cudf/cudf/_lib/ast.pyx +++ b/python/cudf/cudf/_lib/ast.pyx @@ -183,44 +183,48 @@ python_cudf_ast_map = { } -cdef ast_visit(node, list col_names, list expressions, list column_references): +cpdef ast_visit(node, list col_names, list nodes): # Base cases: Name if isinstance(node, ast.Name): - column_references.append(ColumnReference(col_names.index(node.id))) + nodes.append(ColumnReference(col_names.index(node.id))) else: for value in ast.iter_child_nodes(node): if isinstance(value, ast.UnaryOp): - ast_visit(value.operand, col_names, expressions, - column_references) + ast_visit(value.operand, col_names, nodes) op = python_cudf_ast_map[type(value.op)] - expressions.append(Expression(op, column_references[-1])) + nodes.append(Expression(op, nodes[-1])) elif isinstance(value, ast.BinOp): - ast_visit(value.left, col_names, expressions, - column_references) - ast_visit(value.right, col_names, expressions, - column_references) + ast_visit(value.left, col_names, nodes) + ast_visit(value.right, col_names, nodes) op = python_cudf_ast_map[type(value.op)] - expressions.append(Expression( - op, column_references[-2], - column_references[-1])) + nodes.append(Expression(op, nodes[-2], nodes[-1])) elif isinstance(value, list): for item in value: if isinstance(item, ast.AST): - ast_visit(item, col_names, expressions, - column_references) + ast_visit(item, col_names, nodes) elif isinstance(value, ast.AST): - ast_visit(value, col_names, expressions, column_references) + ast_visit(value, col_names, nodes) + + +def make_and_evaluate_expression(expr, df): + """Create a cudf evaluable expression from a string and evaluate it.""" + # Important: both make and evaluate must be coupled to guarantee that the + # nodes created (the owning ColumnReferences and Literals) + # remain in scope. + nodes = [] + ast_visit(ast.parse(expr), df.columns.tolist(), nodes) + return evaluate_expression(df, nodes[-1]) cdef ast_visit_manual_stack(node, list col_names, list expressions, list - column_references): + nodes): """Perform a non-recursive tree traversal using a manual approach.""" stack = list(ast.iter_child_nodes(node)) while stack: node = stack.pop() # Base cases: Name if isinstance(node, ast.Name): - column_references.append(ColumnReference(col_names.index(node.id))) + nodes.append(ColumnReference(col_names.index(node.id))) else: # This bit is tricky. I need to parse the operand, then come # back to create the unary op once the operand has been @@ -233,7 +237,7 @@ cdef ast_visit_manual_stack(node, list col_names, list expressions, list stack.append(node.operand) elif isinstance(node, ast.unaryop): op = python_cudf_ast_map[type(node)] - expressions.append(Expression(op, column_references[-1])) + expressions.append(Expression(op, nodes[-1])) elif isinstance(node, ast.BinOp): stack.append(node.op) stack.append(node.left) @@ -241,37 +245,25 @@ cdef ast_visit_manual_stack(node, list col_names, list expressions, list elif isinstance(node, (ast.operator, ast.cmpop, ast.boolop)): op = python_cudf_ast_map[type(node)] expressions.append(Expression( - op, column_references[-2], - column_references[-1])) + op, nodes[-2], + nodes[-1])) elif isinstance(node, list): stack.extend(node) elif isinstance(node, ast.AST): stack.extend(ast.iter_child_nodes(node)) -def ast_visit_external(node, df, list expressions, list column_references): - ast_visit(node, df.columns.tolist(), expressions, column_references) - - -def make_and_evaluate_expression(expr, df): - """Create a cudf evaluable expression from a string and evaluate it.""" - # Important: both make and evaluate must be coupled to guarantee that the - # column_references created (the owning ColumnReferences and Literals) - # remain in scope. - expressions = [] - column_references = [] - ast_visit(ast.parse(expr), df.columns.tolist(), expressions, - column_references) - return evaluate_expression(df, expressions[-1]) - - def make_and_evaluate_expression_nostack(expr, df): """Create a cudf evaluable expression from a string and evaluate it.""" # Important: both make and evaluate must be coupled to guarantee that the - # column_references created (the owning ColumnReferences and Literals) + # nodes created (the owning ColumnReferences and Literals) # remain in scope. expressions = [] - column_references = [] + nodes = [] ast_visit_manual_stack(ast.parse(expr), df.columns.tolist(), expressions, - column_references) + nodes) return evaluate_expression(df, expressions[-1]) + + +def ast_visit_external(node, df, list nodes): + ast_visit(node, df.columns.tolist(), nodes) From c16c3be36e55326497430f8c5f15cdb7b5b56de5 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 22 Apr 2021 11:42:25 -0700 Subject: [PATCH 15/69] Explicitly separate the parse stack from the stored list of nodes to enable compound expressions. --- python/cudf/cudf/_lib/ast.pyx | 43 ++++++++++++++++++++++------------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/python/cudf/cudf/_lib/ast.pyx b/python/cudf/cudf/_lib/ast.pyx index ffb66f2c070..cffb544ea6a 100644 --- a/python/cudf/cudf/_lib/ast.pyx +++ b/python/cudf/cudf/_lib/ast.pyx @@ -183,37 +183,48 @@ python_cudf_ast_map = { } -cpdef ast_visit(node, list col_names, list nodes): +cpdef ast_visit(node, list col_names, list stack, list nodes): # Base cases: Name if isinstance(node, ast.Name): - nodes.append(ColumnReference(col_names.index(node.id))) + stack.append(ColumnReference(col_names.index(node.id) + 1)) else: for value in ast.iter_child_nodes(node): if isinstance(value, ast.UnaryOp): - ast_visit(value.operand, col_names, nodes) + # TODO: I think here we can optimize by just calling on + # value.operand, need to verify. + ast_visit(value.operand, col_names, stack, nodes) op = python_cudf_ast_map[type(value.op)] - nodes.append(Expression(op, nodes[-1])) + operand = stack.pop() + nodes.append(operand) + stack.append(Expression(op, operand)) elif isinstance(value, ast.BinOp): - ast_visit(value.left, col_names, nodes) - ast_visit(value.right, col_names, nodes) + ast_visit(value, col_names, stack, nodes) op = python_cudf_ast_map[type(value.op)] - nodes.append(Expression(op, nodes[-2], nodes[-1])) + # TODO: This assumes that left is parsed before right + # (alphabetically). + right = stack.pop() + left = stack.pop() + nodes.append(right) + nodes.append(left) + stack.append(Expression(op, left, right)) elif isinstance(value, list): for item in value: if isinstance(item, ast.AST): - ast_visit(item, col_names, nodes) + ast_visit(item, col_names, stack, nodes) elif isinstance(value, ast.AST): - ast_visit(value, col_names, nodes) + ast_visit(value, col_names, stack, nodes) def make_and_evaluate_expression(expr, df): """Create a cudf evaluable expression from a string and evaluate it.""" # Important: both make and evaluate must be coupled to guarantee that the - # nodes created (the owning ColumnReferences and Literals) - # remain in scope. + # nodes created (the owning ColumnReferences and Literals) remain in scope. + stack = [] nodes = [] - ast_visit(ast.parse(expr), df.columns.tolist(), nodes) - return evaluate_expression(df, nodes[-1]) + ast_visit(ast.parse(expr), df.columns.tolist(), stack, nodes) + # At the end, all the stack contains is the expression to evaluate. + expr = stack[0] + return evaluate_expression(df, stack[-1]) cdef ast_visit_manual_stack(node, list col_names, list expressions, list @@ -224,7 +235,7 @@ cdef ast_visit_manual_stack(node, list col_names, list expressions, list node = stack.pop() # Base cases: Name if isinstance(node, ast.Name): - nodes.append(ColumnReference(col_names.index(node.id))) + nodes.append(ColumnReference(col_names.index(node.id) + 1)) else: # This bit is tricky. I need to parse the operand, then come # back to create the unary op once the operand has been @@ -265,5 +276,5 @@ def make_and_evaluate_expression_nostack(expr, df): return evaluate_expression(df, expressions[-1]) -def ast_visit_external(node, df, list nodes): - ast_visit(node, df.columns.tolist(), nodes) +def ast_visit_external(node, df, list stack, list nodes): + ast_visit(node, df.columns.tolist(), stack, nodes) From e3748aaa698da3ec6af480f0501348560013bde1 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 22 Apr 2021 12:09:00 -0700 Subject: [PATCH 16/69] Separate all logic into individual lines for profiling. --- python/cudf/cudf/_lib/ast.pyx | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/_lib/ast.pyx b/python/cudf/cudf/_lib/ast.pyx index cffb544ea6a..efadf084cf9 100644 --- a/python/cudf/cudf/_lib/ast.pyx +++ b/python/cudf/cudf/_lib/ast.pyx @@ -186,9 +186,12 @@ python_cudf_ast_map = { cpdef ast_visit(node, list col_names, list stack, list nodes): # Base cases: Name if isinstance(node, ast.Name): - stack.append(ColumnReference(col_names.index(node.id) + 1)) + idx = col_names.index(node.id) + 1 + ref = ColumnReference(idx) + stack.append(ref) else: - for value in ast.iter_child_nodes(node): + for field in node._fields: + value = getattr(node, field) if isinstance(value, ast.UnaryOp): # TODO: I think here we can optimize by just calling on # value.operand, need to verify. @@ -196,7 +199,8 @@ cpdef ast_visit(node, list col_names, list stack, list nodes): op = python_cudf_ast_map[type(value.op)] operand = stack.pop() nodes.append(operand) - stack.append(Expression(op, operand)) + expr = Expression(op, operand) + stack.append(expr) elif isinstance(value, ast.BinOp): ast_visit(value, col_names, stack, nodes) op = python_cudf_ast_map[type(value.op)] @@ -206,7 +210,8 @@ cpdef ast_visit(node, list col_names, list stack, list nodes): left = stack.pop() nodes.append(right) nodes.append(left) - stack.append(Expression(op, left, right)) + expr = Expression(op, left, right) + stack.append(expr) elif isinstance(value, list): for item in value: if isinstance(item, ast.AST): From 225ff96e55eb91ffafe1a9e603c9384313bd7d84 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 22 Apr 2021 12:41:01 -0700 Subject: [PATCH 17/69] Simplify logic by only storing a single pointer and rely on casting it at the end. --- python/cudf/cudf/_lib/ast.pxd | 12 +++--------- python/cudf/cudf/_lib/ast.pyx | 31 ++++++++++++------------------- 2 files changed, 15 insertions(+), 28 deletions(-) diff --git a/python/cudf/cudf/_lib/ast.pxd b/python/cudf/cudf/_lib/ast.pxd index fbc6a214d50..d30cdcc79c7 100644 --- a/python/cudf/cudf/_lib/ast.pxd +++ b/python/cudf/cudf/_lib/ast.pxd @@ -13,22 +13,16 @@ ctypedef int32_t underlying_type_ast_operator cdef class Node: - # Using a shared pointer here to standardize getting the underlying raw - # pointer for passing to C++ across all subclasses. - # TODO: Consider making this a weak pointer. Probably not worth the extra - # effort though since these classes are part of a hierarchy anyway. - cdef shared_ptr[node] c_node - cdef node * _get_ptr(self) + cdef shared_ptr[node] c_obj cdef class Literal(Node): cdef shared_ptr[numeric_scalar[float]] c_scalar - cdef shared_ptr[literal] c_obj cdef class ColumnReference(Node): - cdef shared_ptr[column_reference] c_obj + pass cdef class Expression(Node): - cdef shared_ptr[expression] c_obj + pass diff --git a/python/cudf/cudf/_lib/ast.pyx b/python/cudf/cudf/_lib/ast.pyx index efadf084cf9..8bb295ac5e6 100644 --- a/python/cudf/cudf/_lib/ast.pyx +++ b/python/cudf/cudf/_lib/ast.pyx @@ -1,3 +1,4 @@ +# cython: binding=True, linetrace=True # Copyright (c) 2021, NVIDIA CORPORATION. from enum import Enum @@ -8,7 +9,7 @@ from cudf.core.dataframe import DataFrame from cython.operator cimport dereference from cudf._lib.cpp.types cimport size_type -from libcpp.memory cimport make_shared, shared_ptr, unique_ptr +from libcpp.memory cimport make_shared, shared_ptr, unique_ptr, static_pointer_cast from libcpp.utility cimport move from cudf._lib.ast cimport underlying_type_ast_operator from cudf._lib.column cimport Column @@ -72,8 +73,7 @@ class TableReference(Enum): cdef class Node: - cdef libcudf_ast.node * _get_ptr(self): - return self.c_node.get() + pass cdef class Literal(Node): @@ -81,16 +81,13 @@ cdef class Literal(Node): # TODO: Generalize this to other types of literals. cdef float val = value self.c_scalar = make_shared[numeric_scalar[float]](val, True) - self.c_obj = make_shared[libcudf_ast.literal]( + self.c_obj = make_shared[libcudf_ast.literal]( dereference(self.c_scalar)) - self.c_node = self.c_obj cdef class ColumnReference(Node): - def __cinit__(self, index): - cdef size_type idx = index - self.c_obj = make_shared[libcudf_ast.column_reference](idx) - self.c_node = self.c_obj + def __cinit__(self, size_type index): + self.c_obj = make_shared[libcudf_ast.column_reference](index) cdef class Expression(Node): @@ -102,25 +99,21 @@ cdef class Expression(Node): op.value) if right is None: - self.c_obj = make_shared[libcudf_ast.expression]( - op_value, - dereference(left._get_ptr()) + self.c_obj = make_shared[libcudf_ast.expression]( + op_value, dereference(left.c_obj) ) else: - self.c_obj = make_shared[libcudf_ast.expression]( - op_value, - dereference(left._get_ptr()), - dereference(right._get_ptr()) + self.c_obj = make_shared[libcudf_ast.expression]( + op_value, dereference(left.c_obj), dereference(right.c_obj) ) - self.c_node = self.c_obj - cdef evaluate_expression_internal(Table values, Expression expr): result_data = ColumnAccessor() + cdef shared_ptr[libcudf_ast.expression] c_expr_ptr = static_pointer_cast[libcudf_ast.expression, libcudf_ast.node](expr.c_obj) cdef unique_ptr[column] col = libcudf_ast.compute_column( values.view(), - dereference(expr.c_obj.get()) + dereference(c_expr_ptr.get()) ) result_data['result'] = Column.from_unique_ptr(move(col)) result_table = Table(data=result_data) From 77b871e7dab8479b7f41a1fe72731998c9cc0f0c Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 22 Apr 2021 12:49:42 -0700 Subject: [PATCH 18/69] Switch back from shared_ptr to unique_ptr. --- python/cudf/cudf/_lib/ast.pxd | 6 +++--- python/cudf/cudf/_lib/ast.pyx | 25 +++++++++++++------------ 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/python/cudf/cudf/_lib/ast.pxd b/python/cudf/cudf/_lib/ast.pxd index d30cdcc79c7..21d38fd1aba 100644 --- a/python/cudf/cudf/_lib/ast.pxd +++ b/python/cudf/cudf/_lib/ast.pxd @@ -1,6 +1,6 @@ # Copyright (c) 2021, NVIDIA CORPORATION. -from libcpp.memory cimport shared_ptr +from libcpp.memory cimport unique_ptr from libc.stdint cimport int32_t from cudf._lib.cpp.scalar.scalar cimport numeric_scalar @@ -13,11 +13,11 @@ ctypedef int32_t underlying_type_ast_operator cdef class Node: - cdef shared_ptr[node] c_obj + cdef unique_ptr[node] c_obj cdef class Literal(Node): - cdef shared_ptr[numeric_scalar[float]] c_scalar + cdef unique_ptr[numeric_scalar[float]] c_scalar cdef class ColumnReference(Node): diff --git a/python/cudf/cudf/_lib/ast.pyx b/python/cudf/cudf/_lib/ast.pyx index 8bb295ac5e6..7ff33aedecc 100644 --- a/python/cudf/cudf/_lib/ast.pyx +++ b/python/cudf/cudf/_lib/ast.pyx @@ -9,7 +9,7 @@ from cudf.core.dataframe import DataFrame from cython.operator cimport dereference from cudf._lib.cpp.types cimport size_type -from libcpp.memory cimport make_shared, shared_ptr, unique_ptr, static_pointer_cast +from libcpp.memory cimport make_unique, unique_ptr from libcpp.utility cimport move from cudf._lib.ast cimport underlying_type_ast_operator from cudf._lib.column cimport Column @@ -80,14 +80,16 @@ cdef class Literal(Node): def __cinit__(self, value): # TODO: Generalize this to other types of literals. cdef float val = value - self.c_scalar = make_shared[numeric_scalar[float]](val, True) - self.c_obj = make_shared[libcudf_ast.literal]( - dereference(self.c_scalar)) + self.c_scalar = make_unique[numeric_scalar[float]](val, True) + self.c_obj = make_unique[ + libcudf_ast.literal]( + dereference(self.c_scalar)) cdef class ColumnReference(Node): def __cinit__(self, size_type index): - self.c_obj = make_shared[libcudf_ast.column_reference](index) + self.c_obj = make_unique[ + libcudf_ast.column_reference](index) cdef class Expression(Node): @@ -99,21 +101,20 @@ cdef class Expression(Node): op.value) if right is None: - self.c_obj = make_shared[libcudf_ast.expression]( - op_value, dereference(left.c_obj) - ) + self.c_obj = make_unique[ + libcudf_ast.expression](op_value, dereference(left.c_obj)) else: - self.c_obj = make_shared[libcudf_ast.expression]( - op_value, dereference(left.c_obj), dereference(right.c_obj) + self.c_obj = make_unique[ + libcudf_ast.expression]( + op_value, dereference(left.c_obj), dereference(right.c_obj) ) cdef evaluate_expression_internal(Table values, Expression expr): result_data = ColumnAccessor() - cdef shared_ptr[libcudf_ast.expression] c_expr_ptr = static_pointer_cast[libcudf_ast.expression, libcudf_ast.node](expr.c_obj) cdef unique_ptr[column] col = libcudf_ast.compute_column( values.view(), - dereference(c_expr_ptr.get()) + dereference(expr.c_obj.get()) ) result_data['result'] = Column.from_unique_ptr(move(col)) result_table = Table(data=result_data) From d2da70345693bc2c7549b9f70f079f7490d30a93 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 22 Apr 2021 14:27:32 -0700 Subject: [PATCH 19/69] Substitute df._column_names for df.columns.tolist(). --- python/cudf/cudf/_lib/ast.pyx | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/_lib/ast.pyx b/python/cudf/cudf/_lib/ast.pyx index 7ff33aedecc..4d578a30fe2 100644 --- a/python/cudf/cudf/_lib/ast.pyx +++ b/python/cudf/cudf/_lib/ast.pyx @@ -220,9 +220,10 @@ def make_and_evaluate_expression(expr, df): # nodes created (the owning ColumnReferences and Literals) remain in scope. stack = [] nodes = [] - ast_visit(ast.parse(expr), df.columns.tolist(), stack, nodes) + parsed_expr = ast.parse(expr) + col_list = list(df._column_names) + ast_visit(parsed_expr, col_list, stack, nodes) # At the end, all the stack contains is the expression to evaluate. - expr = stack[0] return evaluate_expression(df, stack[-1]) From b37e5051450fbe2388ddddfacc30cf1d1ebe3583 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 22 Apr 2021 15:04:51 -0700 Subject: [PATCH 20/69] Delete older unnecessary versions. --- python/cudf/cudf/_lib/ast.pyx | 57 ----------------------------------- 1 file changed, 57 deletions(-) diff --git a/python/cudf/cudf/_lib/ast.pyx b/python/cudf/cudf/_lib/ast.pyx index 4d578a30fe2..8d62ea29bc4 100644 --- a/python/cudf/cudf/_lib/ast.pyx +++ b/python/cudf/cudf/_lib/ast.pyx @@ -72,10 +72,6 @@ class TableReference(Enum): OUTPUT = libcudf_ast.table_reference.OUTPUT -cdef class Node: - pass - - cdef class Literal(Node): def __cinit__(self, value): # TODO: Generalize this to other types of literals. @@ -225,56 +221,3 @@ def make_and_evaluate_expression(expr, df): ast_visit(parsed_expr, col_list, stack, nodes) # At the end, all the stack contains is the expression to evaluate. return evaluate_expression(df, stack[-1]) - - -cdef ast_visit_manual_stack(node, list col_names, list expressions, list - nodes): - """Perform a non-recursive tree traversal using a manual approach.""" - stack = list(ast.iter_child_nodes(node)) - while stack: - node = stack.pop() - # Base cases: Name - if isinstance(node, ast.Name): - nodes.append(ColumnReference(col_names.index(node.id) + 1)) - else: - # This bit is tricky. I need to parse the operand, then come - # back to create the unary op once the operand has been - # translated. Same applies for binary ops. To handle this, I - # need to push back a sentinel of sorts that can be used to - # trigger the new behavior _after_ the operands have been - # parsed. - if isinstance(node, ast.UnaryOp): - stack.append(node.op) - stack.append(node.operand) - elif isinstance(node, ast.unaryop): - op = python_cudf_ast_map[type(node)] - expressions.append(Expression(op, nodes[-1])) - elif isinstance(node, ast.BinOp): - stack.append(node.op) - stack.append(node.left) - stack.append(node.right) - elif isinstance(node, (ast.operator, ast.cmpop, ast.boolop)): - op = python_cudf_ast_map[type(node)] - expressions.append(Expression( - op, nodes[-2], - nodes[-1])) - elif isinstance(node, list): - stack.extend(node) - elif isinstance(node, ast.AST): - stack.extend(ast.iter_child_nodes(node)) - - -def make_and_evaluate_expression_nostack(expr, df): - """Create a cudf evaluable expression from a string and evaluate it.""" - # Important: both make and evaluate must be coupled to guarantee that the - # nodes created (the owning ColumnReferences and Literals) - # remain in scope. - expressions = [] - nodes = [] - ast_visit_manual_stack(ast.parse(expr), df.columns.tolist(), expressions, - nodes) - return evaluate_expression(df, expressions[-1]) - - -def ast_visit_external(node, df, list stack, list nodes): - ast_visit(node, df.columns.tolist(), stack, nodes) From f7048d5a88c4164676e0e50aad3377b99da172fa Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 22 Apr 2021 15:10:35 -0700 Subject: [PATCH 21/69] Inline more of the object construction for brevity. --- python/cudf/cudf/_lib/ast.pyx | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/python/cudf/cudf/_lib/ast.pyx b/python/cudf/cudf/_lib/ast.pyx index 8d62ea29bc4..189d1a30edf 100644 --- a/python/cudf/cudf/_lib/ast.pyx +++ b/python/cudf/cudf/_lib/ast.pyx @@ -176,9 +176,7 @@ python_cudf_ast_map = { cpdef ast_visit(node, list col_names, list stack, list nodes): # Base cases: Name if isinstance(node, ast.Name): - idx = col_names.index(node.id) + 1 - ref = ColumnReference(idx) - stack.append(ref) + stack.append(ColumnReference(col_names.index(node.id) + 1)) else: for field in node._fields: value = getattr(node, field) @@ -187,21 +185,16 @@ cpdef ast_visit(node, list col_names, list stack, list nodes): # value.operand, need to verify. ast_visit(value.operand, col_names, stack, nodes) op = python_cudf_ast_map[type(value.op)] - operand = stack.pop() - nodes.append(operand) - expr = Expression(op, operand) - stack.append(expr) + nodes.append(stack.pop()) + stack.append(Expression(op, nodes[-1])) elif isinstance(value, ast.BinOp): ast_visit(value, col_names, stack, nodes) op = python_cudf_ast_map[type(value.op)] # TODO: This assumes that left is parsed before right # (alphabetically). - right = stack.pop() - left = stack.pop() - nodes.append(right) - nodes.append(left) - expr = Expression(op, left, right) - stack.append(expr) + nodes.append(stack.pop()) + nodes.append(stack.pop()) + stack.append(Expression(op, nodes[-1], nodes[-2])) elif isinstance(value, list): for item in value: if isinstance(item, ast.AST): From e27a2a38308dd4ebf8b4166deda55da995374cb9 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 22 Apr 2021 15:21:04 -0700 Subject: [PATCH 22/69] Some further simplification and removal of extra functions. --- python/cudf/cudf/_lib/ast.pyx | 33 +++++++++++++-------------------- 1 file changed, 13 insertions(+), 20 deletions(-) diff --git a/python/cudf/cudf/_lib/ast.pyx b/python/cudf/cudf/_lib/ast.pyx index 189d1a30edf..6613838b624 100644 --- a/python/cudf/cudf/_lib/ast.pyx +++ b/python/cudf/cudf/_lib/ast.pyx @@ -1,4 +1,3 @@ -# cython: binding=True, linetrace=True # Copyright (c) 2021, NVIDIA CORPORATION. from enum import Enum @@ -106,21 +105,6 @@ cdef class Expression(Node): ) -cdef evaluate_expression_internal(Table values, Expression expr): - result_data = ColumnAccessor() - cdef unique_ptr[column] col = libcudf_ast.compute_column( - values.view(), - dereference(expr.c_obj.get()) - ) - result_data['result'] = Column.from_unique_ptr(move(col)) - result_table = Table(data=result_data) - return DataFrame._from_table(result_table) - - -def evaluate_expression(df, expr): - return evaluate_expression_internal(df, expr) - - # This dictionary encodes the mapping from Python AST operators to their cudf # counterparts. python_cudf_ast_map = { @@ -173,7 +157,7 @@ python_cudf_ast_map = { } -cpdef ast_visit(node, list col_names, list stack, list nodes): +cpdef ast_visit(node, tuple col_names, list stack, list nodes): # Base cases: Name if isinstance(node, ast.Name): stack.append(ColumnReference(col_names.index(node.id) + 1)) @@ -203,14 +187,23 @@ cpdef ast_visit(node, list col_names, list stack, list nodes): ast_visit(value, col_names, stack, nodes) +def evaluate_expression(Table df, Expression expr): + result_data = ColumnAccessor() + cdef unique_ptr[column] col = libcudf_ast.compute_column( + df.view(), + dereference(expr.c_obj.get()) + ) + result_data['result'] = Column.from_unique_ptr(move(col)) + result_table = Table(data=result_data) + return DataFrame._from_table(result_table) + + def make_and_evaluate_expression(expr, df): """Create a cudf evaluable expression from a string and evaluate it.""" # Important: both make and evaluate must be coupled to guarantee that the # nodes created (the owning ColumnReferences and Literals) remain in scope. stack = [] nodes = [] - parsed_expr = ast.parse(expr) - col_list = list(df._column_names) - ast_visit(parsed_expr, col_list, stack, nodes) + ast_visit(ast.parse(expr), df._column_names, stack, nodes) # At the end, all the stack contains is the expression to evaluate. return evaluate_expression(df, stack[-1]) From 22a6a0e9e851cdc56982c4bf65769e4d972d2e20 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 23 Apr 2021 13:08:45 -0700 Subject: [PATCH 23/69] Basic working version of literals. --- python/cudf/cudf/_lib/ast.pxd | 4 ++-- python/cudf/cudf/_lib/ast.pyx | 15 +++++++++++---- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/_lib/ast.pxd b/python/cudf/cudf/_lib/ast.pxd index 21d38fd1aba..b0b2f03388f 100644 --- a/python/cudf/cudf/_lib/ast.pxd +++ b/python/cudf/cudf/_lib/ast.pxd @@ -2,7 +2,7 @@ from libcpp.memory cimport unique_ptr -from libc.stdint cimport int32_t +from libc.stdint cimport int32_t, int64_t from cudf._lib.cpp.scalar.scalar cimport numeric_scalar from cudf._lib.cpp.ast cimport literal, column_reference, expression, node @@ -17,7 +17,7 @@ cdef class Node: cdef class Literal(Node): - cdef unique_ptr[numeric_scalar[float]] c_scalar + cdef unique_ptr[numeric_scalar[int64_t]] c_scalar cdef class ColumnReference(Node): diff --git a/python/cudf/cudf/_lib/ast.pyx b/python/cudf/cudf/_lib/ast.pyx index 6613838b624..3bf747272d2 100644 --- a/python/cudf/cudf/_lib/ast.pyx +++ b/python/cudf/cudf/_lib/ast.pyx @@ -8,6 +8,7 @@ from cudf.core.dataframe import DataFrame from cython.operator cimport dereference from cudf._lib.cpp.types cimport size_type +from libc.stdint cimport int64_t from libcpp.memory cimport make_unique, unique_ptr from libcpp.utility cimport move from cudf._lib.ast cimport underlying_type_ast_operator @@ -74,11 +75,11 @@ class TableReference(Enum): cdef class Literal(Node): def __cinit__(self, value): # TODO: Generalize this to other types of literals. - cdef float val = value - self.c_scalar = make_unique[numeric_scalar[float]](val, True) + cdef int val = value + self.c_scalar = make_unique[numeric_scalar[int64_t]](val, True) self.c_obj = make_unique[ libcudf_ast.literal]( - dereference(self.c_scalar)) + dereference(self.c_scalar)) cdef class ColumnReference(Node): @@ -157,10 +158,16 @@ python_cudf_ast_map = { } -cpdef ast_visit(node, tuple col_names, list stack, list nodes): +cdef ast_visit(node, tuple col_names, list stack, list nodes): # Base cases: Name if isinstance(node, ast.Name): stack.append(ColumnReference(col_names.index(node.id) + 1)) + # elif isinstance(node, (ast.Constant, ast.Num)): + elif isinstance(node, ast.Num): + # TODO: When we drop support for Python 3.7, we can just use + # ast.Constant as a catch-all above. + # stack.append(Literal(node.value)) + stack.append(Literal(node.n)) else: for field in node._fields: value = getattr(node, field) From 567d138f4749e85c7727b02c6cc630d9f36ad03f Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 23 Apr 2021 14:53:21 -0700 Subject: [PATCH 24/69] Add docstring, rename ast_visit to ast_traverse, and cleanup code. --- python/cudf/cudf/_lib/ast.pyx | 64 ++++++++++++++++++++++++++--------- 1 file changed, 48 insertions(+), 16 deletions(-) diff --git a/python/cudf/cudf/_lib/ast.pyx b/python/cudf/cudf/_lib/ast.pyx index 3bf747272d2..057aaad6730 100644 --- a/python/cudf/cudf/_lib/ast.pyx +++ b/python/cudf/cudf/_lib/ast.pyx @@ -158,28 +158,60 @@ python_cudf_ast_map = { } -cdef ast_visit(node, tuple col_names, list stack, list nodes): +cdef ast_traverse(root, tuple col_names, list stack, list nodes): + """Construct an evaluable libcudf expression by traversing Python AST. + + This function performs a recursive traversal of the provided root + node, constructing column references from names and literal values + from constants, then building up expressions. The final result is + the expression contained in the ``stack`` list once the function has + terminated: this list will always have length one at the end of parsing + a valid expression. + + Parameters + ---------- + root : ast.AST + An ast node generated by :py:func:`ast.parse`. + col_names : tuple + The column names in the data frame, which are used to generate indices + for column references to named columns in the expression. + stack : list + The current set of nodes to process. This list is empty on the initial + call to this function. New elements are added whenever new nodes are + created. When parsing the current root requires creating an Expression + node, a suitable number of elements (corresponding to the arity of the + operator) are popped from the stack as the operands for the operation. + When the recursive traversal is complete, the stack will have length + exactly one and contain the expression to evaluate. + nodes : list + The set of all nodes created while parsing the expression. This + argument is necessary because all C++ node types are non-owning + objects, so if the Python Nodes corresponding to nodes in the + expression go out of scope and are garbage-collected the final + expression will contain references to invalid data and seg fault upon + evaluation. This list must remain in scope until the expression has + been evaluated. + """ # Base cases: Name - if isinstance(node, ast.Name): - stack.append(ColumnReference(col_names.index(node.id) + 1)) - # elif isinstance(node, (ast.Constant, ast.Num)): - elif isinstance(node, ast.Num): - # TODO: When we drop support for Python 3.7, we can just use - # ast.Constant as a catch-all above. - # stack.append(Literal(node.value)) - stack.append(Literal(node.n)) + if isinstance(root, ast.Name): + stack.append(ColumnReference(col_names.index(root.id) + 1)) + # Note: in Python > 3.7 ast.Num is a subclass of ast.Constant. We may need + # to generalize this code eventually if that inheritance is removed. + elif isinstance(root, ast.Num): + stack.append(Literal(root.n)) else: - for field in node._fields: - value = getattr(node, field) + # for value in ast.iter_child_nodes(root): + for field in root._fields: + value = getattr(root, field) if isinstance(value, ast.UnaryOp): # TODO: I think here we can optimize by just calling on # value.operand, need to verify. - ast_visit(value.operand, col_names, stack, nodes) + ast_traverse(value.operand, col_names, stack, nodes) op = python_cudf_ast_map[type(value.op)] nodes.append(stack.pop()) stack.append(Expression(op, nodes[-1])) elif isinstance(value, ast.BinOp): - ast_visit(value, col_names, stack, nodes) + ast_traverse(value, col_names, stack, nodes) op = python_cudf_ast_map[type(value.op)] # TODO: This assumes that left is parsed before right # (alphabetically). @@ -189,9 +221,9 @@ cdef ast_visit(node, tuple col_names, list stack, list nodes): elif isinstance(value, list): for item in value: if isinstance(item, ast.AST): - ast_visit(item, col_names, stack, nodes) + ast_traverse(item, col_names, stack, nodes) elif isinstance(value, ast.AST): - ast_visit(value, col_names, stack, nodes) + ast_traverse(value, col_names, stack, nodes) def evaluate_expression(Table df, Expression expr): @@ -211,6 +243,6 @@ def make_and_evaluate_expression(expr, df): # nodes created (the owning ColumnReferences and Literals) remain in scope. stack = [] nodes = [] - ast_visit(ast.parse(expr), df._column_names, stack, nodes) + ast_traverse(ast.parse(expr), df._column_names, stack, nodes) # At the end, all the stack contains is the expression to evaluate. return evaluate_expression(df, stack[-1]) From eacce8a6d4f05c17e16744e51875faf005a4af04 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 23 Apr 2021 15:28:33 -0700 Subject: [PATCH 25/69] Add support for comparison operators. --- python/cudf/cudf/_lib/ast.pyx | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/_lib/ast.pyx b/python/cudf/cudf/_lib/ast.pyx index 057aaad6730..26f492c7e06 100644 --- a/python/cudf/cudf/_lib/ast.pyx +++ b/python/cudf/cudf/_lib/ast.pyx @@ -213,8 +213,20 @@ cdef ast_traverse(root, tuple col_names, list stack, list nodes): elif isinstance(value, ast.BinOp): ast_traverse(value, col_names, stack, nodes) op = python_cudf_ast_map[type(value.op)] - # TODO: This assumes that left is parsed before right - # (alphabetically). + # TODO: This assumes that left is parsed before right, should + # maybe handle this more explicitly. + nodes.append(stack.pop()) + nodes.append(stack.pop()) + stack.append(Expression(op, nodes[-1], nodes[-2])) + elif isinstance(value, ast.Compare): + if len(value.comparators) != 1: + # TODO: Can relax this comparison by unpacking the + # comparison into multiple. + raise ValueError("Only binary comparisons are supported.") + ast_traverse(value, col_names, stack, nodes) + op = python_cudf_ast_map[type(value.ops[0])] + # TODO: This assumes that left is parsed before comparators, + # should maybe handle this more explicitly. nodes.append(stack.pop()) nodes.append(stack.pop()) stack.append(Expression(op, nodes[-1], nodes[-2])) From 2b2928fa2ca6002fcb68740e84c70553d8f9b04b Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 23 Apr 2021 15:46:20 -0700 Subject: [PATCH 26/69] Enable binops, note that we may need to stack a compatibility API over it for pandas query. --- python/cudf/cudf/_lib/ast.pyx | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/ast.pyx b/python/cudf/cudf/_lib/ast.pyx index 26f492c7e06..0415c9672b6 100644 --- a/python/cudf/cudf/_lib/ast.pyx +++ b/python/cudf/cudf/_lib/ast.pyx @@ -210,7 +210,12 @@ cdef ast_traverse(root, tuple col_names, list stack, list nodes): op = python_cudf_ast_map[type(value.op)] nodes.append(stack.pop()) stack.append(Expression(op, nodes[-1])) - elif isinstance(value, ast.BinOp): + # TODO: The expected behavior of the below is questionable. pandas + # converts bitwise operators in query to logical operators, which I + # don't think we want. This might have to be a preprocessing of the + # expression string in df.query that replaces `|` with `or` and `&` + # with `and`. + elif isinstance(value, (ast.BinOp, ast.BoolOp)): ast_traverse(value, col_names, stack, nodes) op = python_cudf_ast_map[type(value.op)] # TODO: This assumes that left is parsed before right, should From e040aa2ef92375695b734014828474704b8d45e4 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 23 Apr 2021 15:47:39 -0700 Subject: [PATCH 27/69] Add docstring for evaluate_expression. --- python/cudf/cudf/_lib/ast.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/python/cudf/cudf/_lib/ast.pyx b/python/cudf/cudf/_lib/ast.pyx index 0415c9672b6..d6d2c255a23 100644 --- a/python/cudf/cudf/_lib/ast.pyx +++ b/python/cudf/cudf/_lib/ast.pyx @@ -244,6 +244,7 @@ cdef ast_traverse(root, tuple col_names, list stack, list nodes): def evaluate_expression(Table df, Expression expr): + """Evaluate an Expression on a Table.""" result_data = ColumnAccessor() cdef unique_ptr[column] col = libcudf_ast.compute_column( df.view(), From 6f543b28d424314b0591f9083672fa1d2afbd8a4 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 25 Aug 2021 10:58:07 -0700 Subject: [PATCH 28/69] Update all Python bindings to new C++ APIs. --- python/cudf/cudf/_lib/ast.pxd | 15 +++++++------- python/cudf/cudf/_lib/ast.pyx | 33 +++++++++++++++++-------------- python/cudf/cudf/_lib/cpp/ast.pxd | 29 +++++++++++++-------------- 3 files changed, 39 insertions(+), 38 deletions(-) diff --git a/python/cudf/cudf/_lib/ast.pxd b/python/cudf/cudf/_lib/ast.pxd index b0b2f03388f..9920f45cd2a 100644 --- a/python/cudf/cudf/_lib/ast.pxd +++ b/python/cudf/cudf/_lib/ast.pxd @@ -1,28 +1,27 @@ # Copyright (c) 2021, NVIDIA CORPORATION. +from libc.stdint cimport int32_t, int64_t from libcpp.memory cimport unique_ptr -from libc.stdint cimport int32_t, int64_t +from cudf._lib.cpp.ast cimport column_reference, expression, literal, operation from cudf._lib.cpp.scalar.scalar cimport numeric_scalar -from cudf._lib.cpp.ast cimport literal, column_reference, expression, node - # Since Cython <3 doesn't support scoped enumerations but attempts to work with # the underlying value of an enum, typedefing this to cast seems unavoidable. ctypedef int32_t underlying_type_ast_operator -cdef class Node: - cdef unique_ptr[node] c_obj +cdef class Expression: + cdef unique_ptr[expression] c_obj -cdef class Literal(Node): +cdef class Literal(Expression): cdef unique_ptr[numeric_scalar[int64_t]] c_scalar -cdef class ColumnReference(Node): +cdef class ColumnReference(Expression): pass -cdef class Expression(Node): +cdef class Operation(Expression): pass diff --git a/python/cudf/cudf/_lib/ast.pyx b/python/cudf/cudf/_lib/ast.pyx index d6d2c255a23..7cdb5a46916 100644 --- a/python/cudf/cudf/_lib/ast.pyx +++ b/python/cudf/cudf/_lib/ast.pyx @@ -1,23 +1,23 @@ # Copyright (c) 2021, NVIDIA CORPORATION. -from enum import Enum import ast +from enum import Enum from cudf.core.column_accessor import ColumnAccessor from cudf.core.dataframe import DataFrame from cython.operator cimport dereference -from cudf._lib.cpp.types cimport size_type from libc.stdint cimport int64_t from libcpp.memory cimport make_unique, unique_ptr from libcpp.utility cimport move + +cimport cudf._lib.cpp.ast as libcudf_ast from cudf._lib.ast cimport underlying_type_ast_operator from cudf._lib.column cimport Column from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.types cimport size_type from cudf._lib.table cimport Table -cimport cudf._lib.cpp.ast as libcudf_ast - class ASTOperator(Enum): ADD = libcudf_ast.ast_operator.ADD @@ -30,6 +30,7 @@ class ASTOperator(Enum): PYMOD = libcudf_ast.ast_operator.PYMOD POW = libcudf_ast.ast_operator.POW EQUAL = libcudf_ast.ast_operator.EQUAL + NULL_EQUAL = libcudf_ast.ast_operator.NULL_EQUAL NOT_EQUAL = libcudf_ast.ast_operator.NOT_EQUAL LESS = libcudf_ast.ast_operator.LESS GREATER = libcudf_ast.ast_operator.GREATER @@ -39,7 +40,9 @@ class ASTOperator(Enum): BITWISE_OR = libcudf_ast.ast_operator.BITWISE_OR BITWISE_XOR = libcudf_ast.ast_operator.BITWISE_XOR LOGICAL_AND = libcudf_ast.ast_operator.LOGICAL_AND + NULL_LOGICAL_AND = libcudf_ast.ast_operator.NULL_LOGICAL_AND LOGICAL_OR = libcudf_ast.ast_operator.LOGICAL_OR + NULL_LOGICAL_OR = libcudf_ast.ast_operator.NULL_LOGICAL_OR # Unary operators IDENTITY = libcudf_ast.ast_operator.IDENTITY SIN = libcudf_ast.ast_operator.SIN @@ -72,7 +75,7 @@ class TableReference(Enum): OUTPUT = libcudf_ast.table_reference.OUTPUT -cdef class Literal(Node): +cdef class Literal(Expression): def __cinit__(self, value): # TODO: Generalize this to other types of literals. cdef int val = value @@ -82,14 +85,14 @@ cdef class Literal(Node): dereference(self.c_scalar)) -cdef class ColumnReference(Node): +cdef class ColumnReference(Expression): def __cinit__(self, size_type index): self.c_obj = make_unique[ libcudf_ast.column_reference](index) -cdef class Expression(Node): - def __cinit__(self, op, Node left, Node right=None): +cdef class Operation(Expression): + def __cinit__(self, op, Expression left, Expression right=None): # This awkward double casting appears to be the only way to get Cython # to generate valid C++ that doesn't try to apply the shift operator # directly to values of the enum (which is invalid). @@ -178,7 +181,7 @@ cdef ast_traverse(root, tuple col_names, list stack, list nodes): stack : list The current set of nodes to process. This list is empty on the initial call to this function. New elements are added whenever new nodes are - created. When parsing the current root requires creating an Expression + created. When parsing the current root requires creating an Operation node, a suitable number of elements (corresponding to the arity of the operator) are popped from the stack as the operands for the operation. When the recursive traversal is complete, the stack will have length @@ -186,7 +189,7 @@ cdef ast_traverse(root, tuple col_names, list stack, list nodes): nodes : list The set of all nodes created while parsing the expression. This argument is necessary because all C++ node types are non-owning - objects, so if the Python Nodes corresponding to nodes in the + objects, so if the Python Expressions corresponding to nodes in the expression go out of scope and are garbage-collected the final expression will contain references to invalid data and seg fault upon evaluation. This list must remain in scope until the expression has @@ -209,7 +212,7 @@ cdef ast_traverse(root, tuple col_names, list stack, list nodes): ast_traverse(value.operand, col_names, stack, nodes) op = python_cudf_ast_map[type(value.op)] nodes.append(stack.pop()) - stack.append(Expression(op, nodes[-1])) + stack.append(Operation(op, nodes[-1])) # TODO: The expected behavior of the below is questionable. pandas # converts bitwise operators in query to logical operators, which I # don't think we want. This might have to be a preprocessing of the @@ -222,7 +225,7 @@ cdef ast_traverse(root, tuple col_names, list stack, list nodes): # maybe handle this more explicitly. nodes.append(stack.pop()) nodes.append(stack.pop()) - stack.append(Expression(op, nodes[-1], nodes[-2])) + stack.append(Operation(op, nodes[-1], nodes[-2])) elif isinstance(value, ast.Compare): if len(value.comparators) != 1: # TODO: Can relax this comparison by unpacking the @@ -234,7 +237,7 @@ cdef ast_traverse(root, tuple col_names, list stack, list nodes): # should maybe handle this more explicitly. nodes.append(stack.pop()) nodes.append(stack.pop()) - stack.append(Expression(op, nodes[-1], nodes[-2])) + stack.append(Operation(op, nodes[-1], nodes[-2])) elif isinstance(value, list): for item in value: if isinstance(item, ast.AST): @@ -243,8 +246,8 @@ cdef ast_traverse(root, tuple col_names, list stack, list nodes): ast_traverse(value, col_names, stack, nodes) -def evaluate_expression(Table df, Expression expr): - """Evaluate an Expression on a Table.""" +def evaluate_expression(Table df, Operation expr): + """Evaluate an Operation on a Table.""" result_data = ColumnAccessor() cdef unique_ptr[column] col = libcudf_ast.compute_column( df.view(), diff --git a/python/cudf/cudf/_lib/cpp/ast.pxd b/python/cudf/cudf/_lib/cpp/ast.pxd index 479f42e8df1..dc5e20aa395 100644 --- a/python/cudf/cudf/_lib/cpp/ast.pxd +++ b/python/cudf/cudf/_lib/cpp/ast.pxd @@ -2,18 +2,17 @@ from libcpp.memory cimport unique_ptr -from cudf._lib.cpp.types cimport size_type - +from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.scalar.scalar cimport ( + duration_scalar, numeric_scalar, timestamp_scalar, - duration_scalar ) -from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.table.table_view cimport table_view +from cudf._lib.cpp.types cimport size_type -cdef extern from "cudf/ast/operators.hpp" namespace "cudf::ast" nogil: +cdef extern from "cudf/ast/expressions.hpp" namespace "cudf::ast" nogil: ctypedef enum ast_operator: # Binary operators ADD "cudf::ast::ast_operator::ADD" @@ -26,6 +25,7 @@ cdef extern from "cudf/ast/operators.hpp" namespace "cudf::ast" nogil: PYMOD "cudf::ast::ast_operator::PYMOD" POW "cudf::ast::ast_operator::POW" EQUAL "cudf::ast::ast_operator::EQUAL" + NULL_EQUAL "cudf::ast::ast_operator::NULL_EQUAL" NOT_EQUAL "cudf::ast::ast_operator::NOT_EQUAL" LESS "cudf::ast::ast_operator::LESS" GREATER "cudf::ast::ast_operator::GREATER" @@ -34,7 +34,9 @@ cdef extern from "cudf/ast/operators.hpp" namespace "cudf::ast" nogil: BITWISE_AND "cudf::ast::ast_operator::BITWISE_AND" BITWISE_OR "cudf::ast::ast_operator::BITWISE_OR" BITWISE_XOR "cudf::ast::ast_operator::BITWISE_XOR" + NULL_LOGICAL_AND "cudf::ast::ast_operator::NULL_LOGICAL_AND" LOGICAL_AND "cudf::ast::ast_operator::LOGICAL_AND" + NULL_LOGICAL_OR "cudf::ast::ast_operator::NULL_LOGICAL_OR" LOGICAL_OR "cudf::ast::ast_operator::LOGICAL_OR" # Unary operators IDENTITY "cudf::ast::ast_operator::IDENTITY" @@ -61,35 +63,32 @@ cdef extern from "cudf/ast/operators.hpp" namespace "cudf::ast" nogil: BIT_INVERT "cudf::ast::ast_operator::BIT_INVERT" NOT "cudf::ast::ast_operator::NOT" -cdef extern from "cudf/ast/detail/linearizer.hpp" \ - namespace "cudf::ast::detail" nogil: - cdef cppclass node: + cdef cppclass expression: pass -cdef extern from "cudf/ast/nodes.hpp" namespace "cudf::ast" nogil: ctypedef enum table_reference: LEFT "cudf::ast::table_reference::LEFT" RIGHT "cudf::ast::table_reference::RIGHT" OUTPUT "cudf::ast::table_reference::OUTPUT" - cdef cppclass literal(node): + cdef cppclass literal(expression): # Due to https://github.com/cython/cython/issues/3198, we need to # specify a return type for templated constructors. literal literal[T](numeric_scalar[T] &) except + literal literal[T](timestamp_scalar[T] &) except + literal literal[T](duration_scalar[T] &) except + - cdef cppclass column_reference(node): + cdef cppclass column_reference(expression): # Allow for default C++ parameters by declaring multiple constructors # with the default parameters optionally omitted. column_reference(size_type) except + column_reference(size_type, table_reference) except + - cdef cppclass expression(node): - expression(ast_operator, const node &) - expression(ast_operator, const node&, const node&) + cdef cppclass expression(expression): + expression(ast_operator, const expression &) + expression(ast_operator, const expression&, const expression&) -cdef extern from "cudf/ast/transform.hpp" namespace "cudf::ast" nogil: +cdef extern from "cudf/transform.hpp" namespace "cudf" nogil: cdef unique_ptr[column] compute_column( const table_view table, const expression &expr From 971a5fd1d2693a92998cb6f249549fdf5c5607be Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 25 Aug 2021 15:11:23 -0700 Subject: [PATCH 29/69] Fix compilation errors. --- python/cudf/cudf/_lib/ast.pyx | 25 ++++++++++++------------- python/cudf/cudf/_lib/cpp/ast.pxd | 12 ++++++------ 2 files changed, 18 insertions(+), 19 deletions(-) diff --git a/python/cudf/cudf/_lib/ast.pyx b/python/cudf/cudf/_lib/ast.pyx index 7cdb5a46916..60e1a017912 100644 --- a/python/cudf/cudf/_lib/ast.pyx +++ b/python/cudf/cudf/_lib/ast.pyx @@ -30,7 +30,7 @@ class ASTOperator(Enum): PYMOD = libcudf_ast.ast_operator.PYMOD POW = libcudf_ast.ast_operator.POW EQUAL = libcudf_ast.ast_operator.EQUAL - NULL_EQUAL = libcudf_ast.ast_operator.NULL_EQUAL + # NULL_EQUAL = libcudf_ast.ast_operator.NULL_EQUAL NOT_EQUAL = libcudf_ast.ast_operator.NOT_EQUAL LESS = libcudf_ast.ast_operator.LESS GREATER = libcudf_ast.ast_operator.GREATER @@ -40,9 +40,9 @@ class ASTOperator(Enum): BITWISE_OR = libcudf_ast.ast_operator.BITWISE_OR BITWISE_XOR = libcudf_ast.ast_operator.BITWISE_XOR LOGICAL_AND = libcudf_ast.ast_operator.LOGICAL_AND - NULL_LOGICAL_AND = libcudf_ast.ast_operator.NULL_LOGICAL_AND + # NULL_LOGICAL_AND = libcudf_ast.ast_operator.NULL_LOGICAL_AND LOGICAL_OR = libcudf_ast.ast_operator.LOGICAL_OR - NULL_LOGICAL_OR = libcudf_ast.ast_operator.NULL_LOGICAL_OR + # NULL_LOGICAL_OR = libcudf_ast.ast_operator.NULL_LOGICAL_OR # Unary operators IDENTITY = libcudf_ast.ast_operator.IDENTITY SIN = libcudf_ast.ast_operator.SIN @@ -80,14 +80,14 @@ cdef class Literal(Expression): # TODO: Generalize this to other types of literals. cdef int val = value self.c_scalar = make_unique[numeric_scalar[int64_t]](val, True) - self.c_obj = make_unique[ + self.c_obj = make_unique[ libcudf_ast.literal]( dereference(self.c_scalar)) cdef class ColumnReference(Expression): def __cinit__(self, size_type index): - self.c_obj = make_unique[ + self.c_obj = make_unique[ libcudf_ast.column_reference](index) @@ -100,11 +100,11 @@ cdef class Operation(Expression): op.value) if right is None: - self.c_obj = make_unique[ - libcudf_ast.expression](op_value, dereference(left.c_obj)) + self.c_obj = make_unique[ + libcudf_ast.operation](op_value, dereference(left.c_obj)) else: - self.c_obj = make_unique[ - libcudf_ast.expression]( + self.c_obj = make_unique[ + libcudf_ast.operation]( op_value, dereference(left.c_obj), dereference(right.c_obj) ) @@ -248,14 +248,13 @@ cdef ast_traverse(root, tuple col_names, list stack, list nodes): def evaluate_expression(Table df, Operation expr): """Evaluate an Operation on a Table.""" - result_data = ColumnAccessor() cdef unique_ptr[column] col = libcudf_ast.compute_column( df.view(), dereference(expr.c_obj.get()) ) - result_data['result'] = Column.from_unique_ptr(move(col)) - result_table = Table(data=result_data) - return DataFrame._from_table(result_table) + return DataFrame._from_data( + {'result': Column.from_unique_ptr(move(col))} + ) def make_and_evaluate_expression(expr, df): diff --git a/python/cudf/cudf/_lib/cpp/ast.pxd b/python/cudf/cudf/_lib/cpp/ast.pxd index dc5e20aa395..fa1305809e5 100644 --- a/python/cudf/cudf/_lib/cpp/ast.pxd +++ b/python/cudf/cudf/_lib/cpp/ast.pxd @@ -25,7 +25,7 @@ cdef extern from "cudf/ast/expressions.hpp" namespace "cudf::ast" nogil: PYMOD "cudf::ast::ast_operator::PYMOD" POW "cudf::ast::ast_operator::POW" EQUAL "cudf::ast::ast_operator::EQUAL" - NULL_EQUAL "cudf::ast::ast_operator::NULL_EQUAL" + # NULL_EQUAL "cudf::ast::ast_operator::NULL_EQUAL" NOT_EQUAL "cudf::ast::ast_operator::NOT_EQUAL" LESS "cudf::ast::ast_operator::LESS" GREATER "cudf::ast::ast_operator::GREATER" @@ -34,9 +34,9 @@ cdef extern from "cudf/ast/expressions.hpp" namespace "cudf::ast" nogil: BITWISE_AND "cudf::ast::ast_operator::BITWISE_AND" BITWISE_OR "cudf::ast::ast_operator::BITWISE_OR" BITWISE_XOR "cudf::ast::ast_operator::BITWISE_XOR" - NULL_LOGICAL_AND "cudf::ast::ast_operator::NULL_LOGICAL_AND" + # NULL_LOGICAL_AND "cudf::ast::ast_operator::NULL_LOGICAL_AND" LOGICAL_AND "cudf::ast::ast_operator::LOGICAL_AND" - NULL_LOGICAL_OR "cudf::ast::ast_operator::NULL_LOGICAL_OR" + # NULL_LOGICAL_OR "cudf::ast::ast_operator::NULL_LOGICAL_OR" LOGICAL_OR "cudf::ast::ast_operator::LOGICAL_OR" # Unary operators IDENTITY "cudf::ast::ast_operator::IDENTITY" @@ -84,9 +84,9 @@ cdef extern from "cudf/ast/expressions.hpp" namespace "cudf::ast" nogil: column_reference(size_type) except + column_reference(size_type, table_reference) except + - cdef cppclass expression(expression): - expression(ast_operator, const expression &) - expression(ast_operator, const expression&, const expression&) + cdef cppclass operation(expression): + operation(ast_operator, const expression &) + operation(ast_operator, const expression&, const expression&) cdef extern from "cudf/transform.hpp" namespace "cudf" nogil: cdef unique_ptr[column] compute_column( From a99d9b61ae1e90cccb29c87b79b358af16dc6396 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 25 Aug 2021 15:51:05 -0700 Subject: [PATCH 30/69] Various cleanup tasks. --- python/cudf/cudf/_lib/ast.pyx | 35 ++++++++++--------------------- python/cudf/cudf/_lib/cpp/ast.pxd | 1 - 2 files changed, 11 insertions(+), 25 deletions(-) diff --git a/python/cudf/cudf/_lib/ast.pyx b/python/cudf/cudf/_lib/ast.pyx index 60e1a017912..63a7e3d75e7 100644 --- a/python/cudf/cudf/_lib/ast.pyx +++ b/python/cudf/cudf/_lib/ast.pyx @@ -3,7 +3,6 @@ import ast from enum import Enum -from cudf.core.column_accessor import ColumnAccessor from cudf.core.dataframe import DataFrame from cython.operator cimport dereference @@ -72,7 +71,6 @@ class ASTOperator(Enum): class TableReference(Enum): LEFT = libcudf_ast.table_reference.LEFT RIGHT = libcudf_ast.table_reference.RIGHT - OUTPUT = libcudf_ast.table_reference.OUTPUT cdef class Literal(Expression): @@ -207,18 +205,19 @@ cdef ast_traverse(root, tuple col_names, list stack, list nodes): for field in root._fields: value = getattr(root, field) if isinstance(value, ast.UnaryOp): - # TODO: I think here we can optimize by just calling on - # value.operand, need to verify. + # Faster to directly parse the operand and skip the op. ast_traverse(value.operand, col_names, stack, nodes) op = python_cudf_ast_map[type(value.op)] nodes.append(stack.pop()) stack.append(Operation(op, nodes[-1])) - # TODO: The expected behavior of the below is questionable. pandas - # converts bitwise operators in query to logical operators, which I - # don't think we want. This might have to be a preprocessing of the - # expression string in df.query that replaces `|` with `or` and `&` - # with `and`. - elif isinstance(value, (ast.BinOp, ast.BoolOp)): + elif isinstance(value, (ast.BinOp, ast.BoolOp, ast.Compare)): + if ( + isinstance(value, ast.Compare) + and len(value.comparators) != 1 + ): + # TODO: Can relax this requirement by unpacking the + # comparison into multiple. + raise ValueError("Only binary comparisons are supported.") ast_traverse(value, col_names, stack, nodes) op = python_cudf_ast_map[type(value.op)] # TODO: This assumes that left is parsed before right, should @@ -226,18 +225,6 @@ cdef ast_traverse(root, tuple col_names, list stack, list nodes): nodes.append(stack.pop()) nodes.append(stack.pop()) stack.append(Operation(op, nodes[-1], nodes[-2])) - elif isinstance(value, ast.Compare): - if len(value.comparators) != 1: - # TODO: Can relax this comparison by unpacking the - # comparison into multiple. - raise ValueError("Only binary comparisons are supported.") - ast_traverse(value, col_names, stack, nodes) - op = python_cudf_ast_map[type(value.ops[0])] - # TODO: This assumes that left is parsed before comparators, - # should maybe handle this more explicitly. - nodes.append(stack.pop()) - nodes.append(stack.pop()) - stack.append(Operation(op, nodes[-1], nodes[-2])) elif isinstance(value, list): for item in value: if isinstance(item, ast.AST): @@ -246,8 +233,8 @@ cdef ast_traverse(root, tuple col_names, list stack, list nodes): ast_traverse(value, col_names, stack, nodes) -def evaluate_expression(Table df, Operation expr): - """Evaluate an Operation on a Table.""" +def evaluate_expression(Table df, Expression expr): + """Evaluate an Expression on a Table.""" cdef unique_ptr[column] col = libcudf_ast.compute_column( df.view(), dereference(expr.c_obj.get()) diff --git a/python/cudf/cudf/_lib/cpp/ast.pxd b/python/cudf/cudf/_lib/cpp/ast.pxd index fa1305809e5..3b4944371a5 100644 --- a/python/cudf/cudf/_lib/cpp/ast.pxd +++ b/python/cudf/cudf/_lib/cpp/ast.pxd @@ -69,7 +69,6 @@ cdef extern from "cudf/ast/expressions.hpp" namespace "cudf::ast" nogil: ctypedef enum table_reference: LEFT "cudf::ast::table_reference::LEFT" RIGHT "cudf::ast::table_reference::RIGHT" - OUTPUT "cudf::ast::table_reference::OUTPUT" cdef cppclass literal(expression): # Due to https://github.com/cython/cython/issues/3198, we need to From 963683fcdecb4145ecff8e1df22681d3ba9d0c26 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 7 Apr 2022 17:53:28 -0700 Subject: [PATCH 31/69] Update to stop using deprecated APIs and update copyright years. --- cpp/include/cudf/ast/expressions.hpp | 2 +- python/cudf/cudf/_lib/ast.pxd | 2 +- python/cudf/cudf/_lib/ast.pyx | 17 +++++++++-------- python/cudf/cudf/_lib/cpp/ast.pxd | 2 +- 4 files changed, 12 insertions(+), 11 deletions(-) diff --git a/cpp/include/cudf/ast/expressions.hpp b/cpp/include/cudf/ast/expressions.hpp index 5a176ebacf3..96c99e054a5 100644 --- a/cpp/include/cudf/ast/expressions.hpp +++ b/cpp/include/cudf/ast/expressions.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/python/cudf/cudf/_lib/ast.pxd b/python/cudf/cudf/_lib/ast.pxd index 9920f45cd2a..e71245d957d 100644 --- a/python/cudf/cudf/_lib/ast.pxd +++ b/python/cudf/cudf/_lib/ast.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2022, NVIDIA CORPORATION. from libc.stdint cimport int32_t, int64_t from libcpp.memory cimport unique_ptr diff --git a/python/cudf/cudf/_lib/ast.pyx b/python/cudf/cudf/_lib/ast.pyx index 63a7e3d75e7..1a7e85510d1 100644 --- a/python/cudf/cudf/_lib/ast.pyx +++ b/python/cudf/cudf/_lib/ast.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2022, NVIDIA CORPORATION. import ast from enum import Enum @@ -14,8 +14,9 @@ cimport cudf._lib.cpp.ast as libcudf_ast from cudf._lib.ast cimport underlying_type_ast_operator from cudf._lib.column cimport Column from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport size_type -from cudf._lib.table cimport Table +from cudf._lib.utils cimport table_view_from_table class ASTOperator(Enum): @@ -91,9 +92,9 @@ cdef class ColumnReference(Expression): cdef class Operation(Expression): def __cinit__(self, op, Expression left, Expression right=None): - # This awkward double casting appears to be the only way to get Cython - # to generate valid C++ that doesn't try to apply the shift operator - # directly to values of the enum (which is invalid). + # This awkward double casting is the only way to get Cython to generate + # valid C++ that doesn't try to apply the shift operator directly to + # values of the enum (which is invalid). cdef libcudf_ast.ast_operator op_value = ( op.value) @@ -233,10 +234,10 @@ cdef ast_traverse(root, tuple col_names, list stack, list nodes): ast_traverse(value, col_names, stack, nodes) -def evaluate_expression(Table df, Expression expr): - """Evaluate an Expression on a Table.""" +def evaluate_expression(object df, Expression expr): + """Evaluate an Expression on a DataFrame.""" cdef unique_ptr[column] col = libcudf_ast.compute_column( - df.view(), + table_view_from_table(df), dereference(expr.c_obj.get()) ) return DataFrame._from_data( diff --git a/python/cudf/cudf/_lib/cpp/ast.pxd b/python/cudf/cudf/_lib/cpp/ast.pxd index 3b4944371a5..6cd4532c539 100644 --- a/python/cudf/cudf/_lib/cpp/ast.pxd +++ b/python/cudf/cudf/_lib/cpp/ast.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2022, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr From 14b45bbd717c259fddd86d327cec8e7137ded8a7 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 7 Apr 2022 18:19:46 -0700 Subject: [PATCH 32/69] Implement basic version of eval. --- python/cudf/cudf/_lib/__init__.py | 3 +- python/cudf/cudf/_lib/ast.pyx | 10 +-- python/cudf/cudf/core/dataframe.py | 102 +++++++++++++++++++++++++++++ 3 files changed, 107 insertions(+), 8 deletions(-) diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index bd25aa53405..37f515a8f33 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -1,7 +1,8 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. import numpy as np from . import ( + ast, avro, binaryop, concat, diff --git a/python/cudf/cudf/_lib/ast.pyx b/python/cudf/cudf/_lib/ast.pyx index 1a7e85510d1..3ac22b63bb6 100644 --- a/python/cudf/cudf/_lib/ast.pyx +++ b/python/cudf/cudf/_lib/ast.pyx @@ -3,16 +3,14 @@ import ast from enum import Enum -from cudf.core.dataframe import DataFrame - from cython.operator cimport dereference from libc.stdint cimport int64_t from libcpp.memory cimport make_unique, unique_ptr from libcpp.utility cimport move -cimport cudf._lib.cpp.ast as libcudf_ast from cudf._lib.ast cimport underlying_type_ast_operator from cudf._lib.column cimport Column +from cudf._lib.cpp cimport ast as libcudf_ast from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport size_type @@ -240,12 +238,10 @@ def evaluate_expression(object df, Expression expr): table_view_from_table(df), dereference(expr.c_obj.get()) ) - return DataFrame._from_data( - {'result': Column.from_unique_ptr(move(col))} - ) + return {'result': Column.from_unique_ptr(move(col))} -def make_and_evaluate_expression(expr, df): +def make_and_evaluate_expression(df, expr): """Create a cudf evaluable expression from a string and evaluate it.""" # Important: both make and evaluate must be coupled to guarantee that the # nodes created (the owning ColumnReferences and Literals) remain in scope. diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index d87cb788a7e..a432077f5fe 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6237,6 +6237,108 @@ def interleave_columns(self): return self._constructor_sliced._from_data( {None: libcudf.reshape.interleave_columns([*self._columns])} + + @_cudf_nvtx_annotate + def eval(self, expr: "str", inplace: "bool" = False, **kwargs): + """Evaluate a string describing operations on DataFrame columns. + + Operates on columns only, not specific rows or elements. This allows + `eval` to run arbitrary code, which can make you vulnerable to code + injection if you pass user input to this function. + + Parameters + ---------- + expr : str + The expression string to evaluate. + inplace : bool, default False + If the expression contains an assignment, whether to perform the + operation inplace and mutate the existing DataFrame. Otherwise, + a new DataFrame is returned. + **kwargs + See the documentation for :func:`eval` for complete details + on the keyword arguments accepted by + :meth:`~pandas.DataFrame.query`. + + Returns + ------- + ndarray, scalar, pandas object, or None + The result of the evaluation or None if ``inplace=True``. + + See Also + -------- + DataFrame.query : Evaluates a boolean expression to query the columns + of a frame. + + Notes + ----- + For more details see the API documentation for :func:`~eval`. + For detailed examples see :ref:`enhancing performance with eval + `. + + Examples + -------- + >>> df = pd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)}) + >>> df + A B + 0 1 10 + 1 2 8 + 2 3 6 + 3 4 4 + 4 5 2 + >>> df.eval('A + B') + 0 11 + 1 10 + 2 9 + 3 8 + 4 7 + dtype: int64 + + Assignment is allowed though by default the original DataFrame is not + modified. + + >>> df.eval('C = A + B') + A B C + 0 1 10 11 + 1 2 8 10 + 2 3 6 9 + 3 4 4 8 + 4 5 2 7 + >>> df + A B + 0 1 10 + 1 2 8 + 2 3 6 + 3 4 4 + 4 5 2 + + Use ``inplace=True`` to modify the original DataFrame. + + >>> df.eval('C = A + B', inplace=True) + >>> df + A B C + 0 1 10 11 + 1 2 8 10 + 2 3 6 9 + 3 4 4 8 + 4 5 2 7 + + Multiple columns can be assigned to using multi-line expressions: + + >>> df.eval( + ... ''' + ... C = A + B + ... D = A - B + ... ''' + ... ) + A B C D + 0 1 10 11 -9 + 1 2 8 10 -6 + 2 3 6 9 -3 + 3 4 4 8 0 + 4 5 2 7 3 + """ + return self._from_data( + libcudf.ast.make_and_evaluate_expression(self, expr) ) From 2c8b5dce0a3062e6ae50f1fbe880c1f539a8871d Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 7 Apr 2022 18:33:29 -0700 Subject: [PATCH 33/69] Fix implementation of comparison ops. --- python/cudf/cudf/_lib/ast.pyx | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/_lib/ast.pyx b/python/cudf/cudf/_lib/ast.pyx index 3ac22b63bb6..7c2e490fa46 100644 --- a/python/cudf/cudf/_lib/ast.pyx +++ b/python/cudf/cudf/_lib/ast.pyx @@ -209,16 +209,22 @@ cdef ast_traverse(root, tuple col_names, list stack, list nodes): op = python_cudf_ast_map[type(value.op)] nodes.append(stack.pop()) stack.append(Operation(op, nodes[-1])) - elif isinstance(value, (ast.BinOp, ast.BoolOp, ast.Compare)): - if ( - isinstance(value, ast.Compare) - and len(value.comparators) != 1 - ): + elif isinstance(value, (ast.BinOp, ast.BoolOp)): + ast_traverse(value, col_names, stack, nodes) + op = python_cudf_ast_map[type(value.op)] + # TODO: This assumes that left is parsed before right, should + # maybe handle this more explicitly. + nodes.append(stack.pop()) + nodes.append(stack.pop()) + stack.append(Operation(op, nodes[-1], nodes[-2])) + elif isinstance(value, ast.Compare): + if len(value.ops) != 1: # TODO: Can relax this requirement by unpacking the # comparison into multiple. raise ValueError("Only binary comparisons are supported.") - ast_traverse(value, col_names, stack, nodes) - op = python_cudf_ast_map[type(value.op)] + ast_traverse(value.left, col_names, stack, nodes) + ast_traverse(value.comparators[0], col_names, stack, nodes) + op = python_cudf_ast_map[type(value.ops[0])] # TODO: This assumes that left is parsed before right, should # maybe handle this more explicitly. nodes.append(stack.pop()) From cdb3b25ecdb96918cbcaea77207eafa83e492a6c Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 8 Apr 2022 14:57:33 -0700 Subject: [PATCH 34/69] Add some basic tests of eval. --- python/cudf/cudf/_lib/ast.pyx | 8 +++---- python/cudf/cudf/tests/test_dataframe.py | 27 ++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/_lib/ast.pyx b/python/cudf/cudf/_lib/ast.pyx index 7c2e490fa46..c894521ca3c 100644 --- a/python/cudf/cudf/_lib/ast.pyx +++ b/python/cudf/cudf/_lib/ast.pyx @@ -222,11 +222,11 @@ cdef ast_traverse(root, tuple col_names, list stack, list nodes): # TODO: Can relax this requirement by unpacking the # comparison into multiple. raise ValueError("Only binary comparisons are supported.") - ast_traverse(value.left, col_names, stack, nodes) - ast_traverse(value.comparators[0], col_names, stack, nodes) + ast_traverse(value, col_names, stack, nodes) op = python_cudf_ast_map[type(value.ops[0])] - # TODO: This assumes that left is parsed before right, should - # maybe handle this more explicitly. + # TODO: This makes assumptions about the field ordering (left, + # ops, comparators) in the above parsing, should maybe handle + # this more explicitly. nodes.append(stack.pop()) nodes.append(stack.pop()) stack.append(Operation(op, nodes[-1], nodes[-2])) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 957277d7f9b..2d68f3e9f37 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -9268,3 +9268,30 @@ def test_empty_numeric_only(data): expected = pdf.prod(numeric_only=True) actual = gdf.prod(numeric_only=True) assert_eq(expected, actual) + + +@pytest.fixture +def df_eval(): + N = int(10) + rng = cupy.random.default_rng(0) + return cudf.DataFrame( + { + "a": rng.random(N), + "b": rng.random(N), + "c": rng.random(N), + "d": rng.random(N), + } + ) + + +@pytest.mark.parametrize( + "expr", + [ + "df.a + df.b", + "df.a * df.b", + "df.a > df.b", + ], +) +def test_dataframe_eval(df_eval, expr): + expr_stripped = expr.replace("df.", "") + assert eval(expr, {"df": df_eval}) == df_eval.eval(expr_stripped) From cce59839ed939e43eddf9f1d63efa5baad913bba Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 8 Apr 2022 15:42:58 -0700 Subject: [PATCH 35/69] Add support for chained operators. --- python/cudf/cudf/_lib/ast.pyx | 51 ++++++++++++++++++------ python/cudf/cudf/tests/test_dataframe.py | 5 ++- 2 files changed, 43 insertions(+), 13 deletions(-) diff --git a/python/cudf/cudf/_lib/ast.pyx b/python/cudf/cudf/_lib/ast.pyx index c894521ca3c..015e795aff8 100644 --- a/python/cudf/cudf/_lib/ast.pyx +++ b/python/cudf/cudf/_lib/ast.pyx @@ -1,6 +1,7 @@ # Copyright (c) 2022, NVIDIA CORPORATION. import ast +import functools from enum import Enum from cython.operator cimport dereference @@ -218,18 +219,44 @@ cdef ast_traverse(root, tuple col_names, list stack, list nodes): nodes.append(stack.pop()) stack.append(Operation(op, nodes[-1], nodes[-2])) elif isinstance(value, ast.Compare): - if len(value.ops) != 1: - # TODO: Can relax this requirement by unpacking the - # comparison into multiple. - raise ValueError("Only binary comparisons are supported.") - ast_traverse(value, col_names, stack, nodes) - op = python_cudf_ast_map[type(value.ops[0])] - # TODO: This makes assumptions about the field ordering (left, - # ops, comparators) in the above parsing, should maybe handle - # this more explicitly. - nodes.append(stack.pop()) - nodes.append(stack.pop()) - stack.append(Operation(op, nodes[-1], nodes[-2])) + if len(value.ops) > 1: + # Chained comparators should be split into multiple sets of + # comparators. Parsing occurs left to right. + operands = (value.left, *value.comparators) + inner_ops = [] + for i, op in enumerate(value.ops): + # Note that this will lead to duplicate nodes, e.g. if + # the comparison is `a < b < c` that will be encoded as + # `a < b and b < c`. + comp = ast.Compare(operands[i], op, operands[i+1]) + ast_traverse(comp, col_names, stack, nodes) + op = python_cudf_ast_map[type(op)] + nodes.append(stack.pop()) + nodes.append(stack.pop()) + inner_ops.append(Operation(op, nodes[-1], nodes[-2])) + nodes.append(inner_ops[-1]) + + # Need to make sure the first element is also included. + op = ASTOperator.LOGICAL_AND + + def _combine_compare_ops(left, right): + nodes.append(Operation(op, left, right)) + return nodes[-1] + + # Note that the return value has already been added to the + # list of nodes, so we don't need to save it here and can + # just push `nodes[-1]` onto the stack. + functools.reduce(_combine_compare_ops, inner_ops) + stack.append(nodes[-1]) + else: + ast_traverse(value, col_names, stack, nodes) + op = python_cudf_ast_map[type(value.ops[0])] + # TODO: This makes assumptions about the field ordering + # (left, ops, comparators) in the above parsing, should + # maybe handle this more explicitly. + nodes.append(stack.pop()) + nodes.append(stack.pop()) + stack.append(Operation(op, nodes[-1], nodes[-2])) elif isinstance(value, list): for item in value: if isinstance(item, ast.AST): diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 2d68f3e9f37..f64aaf3a643 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -9290,8 +9290,11 @@ def df_eval(): "df.a + df.b", "df.a * df.b", "df.a > df.b", + "df.a > df.b > df.c", ], ) def test_dataframe_eval(df_eval, expr): expr_stripped = expr.replace("df.", "") - assert eval(expr, {"df": df_eval}) == df_eval.eval(expr_stripped) + assert pd.eval(expr, local_dict={"df": df_eval}) == df_eval.eval( + expr_stripped + ) From dc2f73563c0968b4fe058a712b7306cb44df375f Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 8 Apr 2022 15:52:36 -0700 Subject: [PATCH 36/69] Remove reliance on implicit field ordering. --- python/cudf/cudf/_lib/ast.pyx | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/python/cudf/cudf/_lib/ast.pyx b/python/cudf/cudf/_lib/ast.pyx index 015e795aff8..a8c0464450c 100644 --- a/python/cudf/cudf/_lib/ast.pyx +++ b/python/cudf/cudf/_lib/ast.pyx @@ -211,10 +211,17 @@ cdef ast_traverse(root, tuple col_names, list stack, list nodes): nodes.append(stack.pop()) stack.append(Operation(op, nodes[-1])) elif isinstance(value, (ast.BinOp, ast.BoolOp)): - ast_traverse(value, col_names, stack, nodes) op = python_cudf_ast_map[type(value.op)] - # TODO: This assumes that left is parsed before right, should - # maybe handle this more explicitly. + + # Note that since the fields of `value` are ordered, a single + # call to `ast_traverse(value, ...)` would have the same effect + # as explicitly invoking it on `left` and `right` (the call + # will have no effect on the operator), but we have no + # guarantee that the fields or their ordering will not change + # in future Python versions so it's best to be explicit. + ast_traverse(value.left, col_names, stack, nodes) + ast_traverse(value.right, col_names, stack, nodes) + nodes.append(stack.pop()) nodes.append(stack.pop()) stack.append(Operation(op, nodes[-1], nodes[-2])) @@ -229,8 +236,11 @@ cdef ast_traverse(root, tuple col_names, list stack, list nodes): # the comparison is `a < b < c` that will be encoded as # `a < b and b < c`. comp = ast.Compare(operands[i], op, operands[i+1]) - ast_traverse(comp, col_names, stack, nodes) op = python_cudf_ast_map[type(op)] + + ast_traverse(comp.left, col_names, stack, nodes) + ast_traverse(comp.comparators, col_names, stack, nodes) + nodes.append(stack.pop()) nodes.append(stack.pop()) inner_ops.append(Operation(op, nodes[-1], nodes[-2])) @@ -249,11 +259,11 @@ cdef ast_traverse(root, tuple col_names, list stack, list nodes): functools.reduce(_combine_compare_ops, inner_ops) stack.append(nodes[-1]) else: - ast_traverse(value, col_names, stack, nodes) op = python_cudf_ast_map[type(value.ops[0])] - # TODO: This makes assumptions about the field ordering - # (left, ops, comparators) in the above parsing, should - # maybe handle this more explicitly. + + ast_traverse(value.left, col_names, stack, nodes) + ast_traverse(value.comparators[0], col_names, stack, nodes) + nodes.append(stack.pop()) nodes.append(stack.pop()) stack.append(Operation(op, nodes[-1], nodes[-2])) From d298e352d3d95596ca16d8b64d991e855b112fac Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 8 Apr 2022 16:23:40 -0700 Subject: [PATCH 37/69] Unify comparison operator logic to work identically for one or multiple comparators. --- python/cudf/cudf/_lib/ast.pyx | 59 +++++++++++++++-------------------- 1 file changed, 26 insertions(+), 33 deletions(-) diff --git a/python/cudf/cudf/_lib/ast.pyx b/python/cudf/cudf/_lib/ast.pyx index a8c0464450c..1fe20ddd0db 100644 --- a/python/cudf/cudf/_lib/ast.pyx +++ b/python/cudf/cudf/_lib/ast.pyx @@ -201,6 +201,9 @@ cdef ast_traverse(root, tuple col_names, list stack, list nodes): elif isinstance(root, ast.Num): stack.append(Literal(root.n)) else: + # Iterating over `_fields` is _much_ faster than calling + # `iter_child_nodes`. That may not matter for practical data sizes + # though, so we'll need to benchmark. # for value in ast.iter_child_nodes(root): for field in root._fields: value = getattr(root, field) @@ -226,47 +229,37 @@ cdef ast_traverse(root, tuple col_names, list stack, list nodes): nodes.append(stack.pop()) stack.append(Operation(op, nodes[-1], nodes[-2])) elif isinstance(value, ast.Compare): + # Chained comparators should be split into multiple sets of + # comparators. Parsing occurs left to right. + operands = (value.left, *value.comparators) + inner_ops = [] + for i, op in enumerate(value.ops): + # Note that this will lead to duplicate nodes, e.g. if + # the comparison is `a < b < c` that will be encoded as + # `a < b and b < c`. + comp = ast.Compare(operands[i], op, operands[i+1]) + op = python_cudf_ast_map[type(op)] + + ast_traverse(comp.left, col_names, stack, nodes) + ast_traverse(comp.comparators, col_names, stack, nodes) + + nodes.append(stack.pop()) + nodes.append(stack.pop()) + inner_ops.append(Operation(op, nodes[-1], nodes[-2])) + nodes.append(inner_ops[-1]) + + # If we have more than one comparator, we need to link them + # together with a bunch of LOGICAL_AND operators. if len(value.ops) > 1: - # Chained comparators should be split into multiple sets of - # comparators. Parsing occurs left to right. - operands = (value.left, *value.comparators) - inner_ops = [] - for i, op in enumerate(value.ops): - # Note that this will lead to duplicate nodes, e.g. if - # the comparison is `a < b < c` that will be encoded as - # `a < b and b < c`. - comp = ast.Compare(operands[i], op, operands[i+1]) - op = python_cudf_ast_map[type(op)] - - ast_traverse(comp.left, col_names, stack, nodes) - ast_traverse(comp.comparators, col_names, stack, nodes) - - nodes.append(stack.pop()) - nodes.append(stack.pop()) - inner_ops.append(Operation(op, nodes[-1], nodes[-2])) - nodes.append(inner_ops[-1]) - - # Need to make sure the first element is also included. op = ASTOperator.LOGICAL_AND def _combine_compare_ops(left, right): - nodes.append(Operation(op, left, right)) + nodes.append(Operation(, left, right)) return nodes[-1] - # Note that the return value has already been added to the - # list of nodes, so we don't need to save it here and can - # just push `nodes[-1]` onto the stack. functools.reduce(_combine_compare_ops, inner_ops) - stack.append(nodes[-1]) - else: - op = python_cudf_ast_map[type(value.ops[0])] - ast_traverse(value.left, col_names, stack, nodes) - ast_traverse(value.comparators[0], col_names, stack, nodes) - - nodes.append(stack.pop()) - nodes.append(stack.pop()) - stack.append(Operation(op, nodes[-1], nodes[-2])) + stack.append(nodes[-1]) elif isinstance(value, list): for item in value: if isinstance(item, ast.AST): From dfdb91ed20706cbef7a90afcfefa4424d6b36d82 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 8 Apr 2022 17:08:26 -0700 Subject: [PATCH 38/69] Handle bool ops correctly relative to binops, fix the return type of eval, and amend tests to work correctly. --- python/cudf/cudf/_lib/ast.pyx | 77 +++++++++++++++++++----- python/cudf/cudf/core/dataframe.py | 2 +- python/cudf/cudf/tests/test_dataframe.py | 20 +++--- 3 files changed, 74 insertions(+), 25 deletions(-) diff --git a/python/cudf/cudf/_lib/ast.pyx b/python/cudf/cudf/_lib/ast.pyx index 1fe20ddd0db..7ad58b39cd0 100644 --- a/python/cudf/cudf/_lib/ast.pyx +++ b/python/cudf/cudf/_lib/ast.pyx @@ -130,8 +130,14 @@ python_cudf_ast_map = { ast.BitAnd: ASTOperator.BITWISE_AND, ast.BitOr: ASTOperator.BITWISE_OR, ast.BitXor: ASTOperator.BITWISE_XOR, - ast.And: ASTOperator.LOGICAL_AND, - ast.Or: ASTOperator.LOGICAL_OR, + # TODO: These maps are wrong, but for pandas compatibility they actually + # need to be dtype-specific so this is just for testing the AST parsing + # logic. Eventually the lookup for these will need to be updated. + ast.And: ASTOperator.BITWISE_AND, + ast.Or: ASTOperator.BITWISE_OR, + # ast.And: ASTOperator.LOGICAL_AND, + # ast.Or: ASTOperator.LOGICAL_OR, + # Unary operators # ast.Identity: ASTOperator.IDENTITY, # ast.Sin: ASTOperator.SIN, @@ -193,6 +199,8 @@ cdef ast_traverse(root, tuple col_names, list stack, list nodes): evaluation. This list must remain in scope until the expression has been evaluated. """ + # TODO: We'll eventually need to find a way to support operations on mixed + # but compatible dtypes (e.g. adding int to float). # Base cases: Name if isinstance(root, ast.Name): stack.append(ColumnReference(col_names.index(root.id) + 1)) @@ -206,6 +214,14 @@ cdef ast_traverse(root, tuple col_names, list stack, list nodes): # though, so we'll need to benchmark. # for value in ast.iter_child_nodes(root): for field in root._fields: + # Note that since the fields of `value` are ordered, in some cases + # (e.g. `ast.BinOp`) a single call to `ast_traverse(value, ...)` + # would have the same effect as explicitly invoking it on the + # fields (e.g. `left` and `right` for `ast.BinOp`). However, this + # relies on the fields and their ordering not changing in future + # Python versions, and that we won't change the parsing logic for + # e.g. the operators, so it's best to be explicit in all the + # branches below. value = getattr(root, field) if isinstance(value, ast.UnaryOp): # Faster to directly parse the operand and skip the op. @@ -213,35 +229,64 @@ cdef ast_traverse(root, tuple col_names, list stack, list nodes): op = python_cudf_ast_map[type(value.op)] nodes.append(stack.pop()) stack.append(Operation(op, nodes[-1])) - elif isinstance(value, (ast.BinOp, ast.BoolOp)): + elif isinstance(value, ast.BinOp): op = python_cudf_ast_map[type(value.op)] - # Note that since the fields of `value` are ordered, a single - # call to `ast_traverse(value, ...)` would have the same effect - # as explicitly invoking it on `left` and `right` (the call - # will have no effect on the operator), but we have no - # guarantee that the fields or their ordering will not change - # in future Python versions so it's best to be explicit. ast_traverse(value.left, col_names, stack, nodes) ast_traverse(value.right, col_names, stack, nodes) nodes.append(stack.pop()) nodes.append(stack.pop()) stack.append(Operation(op, nodes[-1], nodes[-2])) + + # TODO: Whether And/Or and BitAnd/BitOr actually correspond to + # logical or bitwise operators depends on the data types that they + # are applied to. We'll need to add logic to map to that. + elif isinstance(value, ast.BoolOp): + # Chained comparators should be split into multiple sets of + # comparators. Parsing occurs left to right. + inner_ops = [] + for left, right in zip(value.values[:-1], value.values[1:]): + # Note that this will lead to duplicate nodes, e.g. if + # the comparison is `a < b < c` that will be encoded as + # `a < b and b < c`. + op = python_cudf_ast_map[type(value.op)] + + ast_traverse(left, col_names, stack, nodes) + ast_traverse(right, col_names, stack, nodes) + + nodes.append(stack.pop()) + nodes.append(stack.pop()) + inner_ops.append(Operation(op, nodes[-1], nodes[-2])) + nodes.append(inner_ops[-1]) + + # If we have more than one comparator, we need to link them + # together with a bunch of LOGICAL_AND operators. + if len(value.values) > 2: + op = ASTOperator.LOGICAL_AND + + def _combine_compare_ops(left, right): + nodes.append(Operation(op, left, right)) + return nodes[-1] + + functools.reduce(_combine_compare_ops, inner_ops) + + stack.append(nodes[-1]) elif isinstance(value, ast.Compare): # Chained comparators should be split into multiple sets of # comparators. Parsing occurs left to right. operands = (value.left, *value.comparators) inner_ops = [] - for i, op in enumerate(value.ops): + for op, (left, right) in zip( + value.ops, zip(operands[:-1], operands[1:]) + ): # Note that this will lead to duplicate nodes, e.g. if # the comparison is `a < b < c` that will be encoded as # `a < b and b < c`. - comp = ast.Compare(operands[i], op, operands[i+1]) op = python_cudf_ast_map[type(op)] - ast_traverse(comp.left, col_names, stack, nodes) - ast_traverse(comp.comparators, col_names, stack, nodes) + ast_traverse(left, col_names, stack, nodes) + ast_traverse(right, col_names, stack, nodes) nodes.append(stack.pop()) nodes.append(stack.pop()) @@ -250,11 +295,11 @@ cdef ast_traverse(root, tuple col_names, list stack, list nodes): # If we have more than one comparator, we need to link them # together with a bunch of LOGICAL_AND operators. - if len(value.ops) > 1: + if len(operands) > 2: op = ASTOperator.LOGICAL_AND def _combine_compare_ops(left, right): - nodes.append(Operation(, left, right)) + nodes.append(Operation(op, left, right)) return nodes[-1] functools.reduce(_combine_compare_ops, inner_ops) @@ -274,7 +319,7 @@ def evaluate_expression(object df, Expression expr): table_view_from_table(df), dereference(expr.c_obj.get()) ) - return {'result': Column.from_unique_ptr(move(col))} + return {'None': Column.from_unique_ptr(move(col))} def make_and_evaluate_expression(df, expr): diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index a432077f5fe..45a1b7143bf 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6337,7 +6337,7 @@ def eval(self, expr: "str", inplace: "bool" = False, **kwargs): 3 4 4 8 0 4 5 2 7 3 """ - return self._from_data( + return Series._from_data( libcudf.ast.make_and_evaluate_expression(self, expr) ) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index f64aaf3a643..cab2ab3c8db 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -9272,14 +9272,15 @@ def test_empty_numeric_only(data): @pytest.fixture def df_eval(): - N = int(10) + N = 10 + int_max = 10 rng = cupy.random.default_rng(0) return cudf.DataFrame( { - "a": rng.random(N), - "b": rng.random(N), - "c": rng.random(N), - "d": rng.random(N), + "a": rng.integers(N, size=int_max), + "b": rng.integers(N, size=int_max), + "c": rng.integers(N, size=int_max), + "d": rng.integers(N, size=int_max), } ) @@ -9291,10 +9292,13 @@ def df_eval(): "df.a * df.b", "df.a > df.b", "df.a > df.b > df.c", + "df.a > df.b < df.c", + "df.a and df.b", + # "df.a and df.b or df.c", ], ) def test_dataframe_eval(df_eval, expr): expr_stripped = expr.replace("df.", "") - assert pd.eval(expr, local_dict={"df": df_eval}) == df_eval.eval( - expr_stripped - ) + expect = pd.eval(expr, local_dict={"df": df_eval.to_pandas()}) + got = df_eval.eval(expr_stripped) + assert got.to_pandas().equals(expect) From 8575706152e61943e6b1aea603c7994dd763576f Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 8 Apr 2022 17:18:50 -0700 Subject: [PATCH 39/69] Unify CompareOp and BoolOp branches. --- python/cudf/cudf/_lib/ast.pyx | 50 +++++++++-------------------------- 1 file changed, 13 insertions(+), 37 deletions(-) diff --git a/python/cudf/cudf/_lib/ast.pyx b/python/cudf/cudf/_lib/ast.pyx index 7ad58b39cd0..44a1943af33 100644 --- a/python/cudf/cudf/_lib/ast.pyx +++ b/python/cudf/cudf/_lib/ast.pyx @@ -3,6 +3,7 @@ import ast import functools from enum import Enum +from itertools import repeat from cython.operator cimport dereference from libc.stdint cimport int64_t @@ -242,44 +243,19 @@ cdef ast_traverse(root, tuple col_names, list stack, list nodes): # TODO: Whether And/Or and BitAnd/BitOr actually correspond to # logical or bitwise operators depends on the data types that they # are applied to. We'll need to add logic to map to that. - elif isinstance(value, ast.BoolOp): - # Chained comparators should be split into multiple sets of - # comparators. Parsing occurs left to right. - inner_ops = [] - for left, right in zip(value.values[:-1], value.values[1:]): - # Note that this will lead to duplicate nodes, e.g. if - # the comparison is `a < b < c` that will be encoded as - # `a < b and b < c`. - op = python_cudf_ast_map[type(value.op)] - - ast_traverse(left, col_names, stack, nodes) - ast_traverse(right, col_names, stack, nodes) - - nodes.append(stack.pop()) - nodes.append(stack.pop()) - inner_ops.append(Operation(op, nodes[-1], nodes[-2])) - nodes.append(inner_ops[-1]) + elif isinstance(value, (ast.BoolOp, ast.Compare)): + if isinstance(value, ast.BoolOp): + operators = repeat(value.op) + operands = zip(value.values[:-1], value.values[1:]) + multiple_ops = len(value.values) > 2 + else: + operators = value.ops + operands = (value.left, *value.comparators) + multiple_ops = len(operands) > 2 + operands = zip(operands[:-1], operands[1:]) - # If we have more than one comparator, we need to link them - # together with a bunch of LOGICAL_AND operators. - if len(value.values) > 2: - op = ASTOperator.LOGICAL_AND - - def _combine_compare_ops(left, right): - nodes.append(Operation(op, left, right)) - return nodes[-1] - - functools.reduce(_combine_compare_ops, inner_ops) - - stack.append(nodes[-1]) - elif isinstance(value, ast.Compare): - # Chained comparators should be split into multiple sets of - # comparators. Parsing occurs left to right. - operands = (value.left, *value.comparators) inner_ops = [] - for op, (left, right) in zip( - value.ops, zip(operands[:-1], operands[1:]) - ): + for op, (left, right) in zip(operators, operands): # Note that this will lead to duplicate nodes, e.g. if # the comparison is `a < b < c` that will be encoded as # `a < b and b < c`. @@ -295,7 +271,7 @@ cdef ast_traverse(root, tuple col_names, list stack, list nodes): # If we have more than one comparator, we need to link them # together with a bunch of LOGICAL_AND operators. - if len(operands) > 2: + if multiple_ops: op = ASTOperator.LOGICAL_AND def _combine_compare_ops(left, right): From 977a31862c6c8d62713111f8167a9e8daff7be26 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 11 Apr 2022 10:22:25 -0700 Subject: [PATCH 40/69] Enable C++ exception propagation and execution without the GIL. --- python/cudf/cudf/_lib/ast.pyx | 13 +++++++++---- python/cudf/cudf/_lib/cpp/ast.pxd | 2 +- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/_lib/ast.pyx b/python/cudf/cudf/_lib/ast.pyx index 44a1943af33..1ff598895fa 100644 --- a/python/cudf/cudf/_lib/ast.pyx +++ b/python/cudf/cudf/_lib/ast.pyx @@ -291,10 +291,15 @@ cdef ast_traverse(root, tuple col_names, list stack, list nodes): def evaluate_expression(object df, Expression expr): """Evaluate an Expression on a DataFrame.""" - cdef unique_ptr[column] col = libcudf_ast.compute_column( - table_view_from_table(df), - dereference(expr.c_obj.get()) - ) + cdef unique_ptr[column] col + cdef table_view tbl = table_view_from_table(df) + with nogil: + col = move( + libcudf_ast.compute_column( + tbl, + dereference(expr.c_obj.get()) + ) + ) return {'None': Column.from_unique_ptr(move(col))} diff --git a/python/cudf/cudf/_lib/cpp/ast.pxd b/python/cudf/cudf/_lib/cpp/ast.pxd index 6cd4532c539..426bbba6be4 100644 --- a/python/cudf/cudf/_lib/cpp/ast.pxd +++ b/python/cudf/cudf/_lib/cpp/ast.pxd @@ -91,4 +91,4 @@ cdef extern from "cudf/transform.hpp" namespace "cudf" nogil: cdef unique_ptr[column] compute_column( const table_view table, const expression &expr - ) + ) except + From 5c162f7bb1a89e3f0e78eeb109db4958c7bacc86 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 11 Apr 2022 10:51:47 -0700 Subject: [PATCH 41/69] Add support for unary functions. --- python/cudf/cudf/_lib/ast.pyx | 79 ++++++++++++++++-------- python/cudf/cudf/tests/test_dataframe.py | 22 ++++--- 2 files changed, 66 insertions(+), 35 deletions(-) diff --git a/python/cudf/cudf/_lib/ast.pyx b/python/cudf/cudf/_lib/ast.pyx index 1ff598895fa..66483f727a6 100644 --- a/python/cudf/cudf/_lib/ast.pyx +++ b/python/cudf/cudf/_lib/ast.pyx @@ -110,7 +110,7 @@ cdef class Operation(Expression): # This dictionary encodes the mapping from Python AST operators to their cudf # counterparts. -python_cudf_ast_map = { +python_cudf_operator_map = { # TODO: Mapping TBD for commented out operators. # Binary operators ast.Add: ASTOperator.ADD, @@ -141,31 +141,45 @@ python_cudf_ast_map = { # Unary operators # ast.Identity: ASTOperator.IDENTITY, - # ast.Sin: ASTOperator.SIN, - # ast.Cos: ASTOperator.COS, - # ast.Tan: ASTOperator.TAN, - # ast.Arcsin: ASTOperator.ARCSIN, - # ast.Arccos: ASTOperator.ARCCOS, - # ast.Arctan: ASTOperator.ARCTAN, - # ast.Sinh: ASTOperator.SINH, - # ast.Cosh: ASTOperator.COSH, - # ast.Tanh: ASTOperator.TANH, - # ast.Arcsinh: ASTOperator.ARCSINH, - # ast.Arccosh: ASTOperator.ARCCOSH, - # ast.Arctanh: ASTOperator.ARCTANH, - # ast.Exp: ASTOperator.EXP, - # ast.Log: ASTOperator.LOG, - # ast.Sqrt: ASTOperator.SQRT, - # ast.Cbrt: ASTOperator.CBRT, - # ast.Ceil: ASTOperator.CEIL, - # ast.Floor: ASTOperator.FLOOR, - # ast.Abs: ASTOperator.ABS, - # ast.Rint: ASTOperator.RINT, # ast.Bit: ASTOperator.BIT_INVERT, # ast.Not: ASTOperator.NOT, } +# Mapping between Python function names encode in an ast.Call node and the +# corresponding libcudf C++ AST operators. +python_cudf_function_map = { + # TODO: Operators listed on + # https://pandas.pydata.org/pandas-docs/stable/user_guide/enhancingperf.html#expression-evaluation-via-eval + # that we don't support yet: + # expm1, log1p, arctan2 and log10. + 'sin': ASTOperator.SIN, + 'cos': ASTOperator.COS, + 'tan': ASTOperator.TAN, + 'arcsin': ASTOperator.ARCSIN, + 'arccos': ASTOperator.ARCCOS, + 'arctan': ASTOperator.ARCTAN, + 'sinh': ASTOperator.SINH, + 'cosh': ASTOperator.COSH, + 'tanh': ASTOperator.TANH, + 'arcsinh': ASTOperator.ARCSINH, + 'arccosh': ASTOperator.ARCCOSH, + 'arctanh': ASTOperator.ARCTANH, + 'exp': ASTOperator.EXP, + 'log': ASTOperator.LOG, + 'sqrt': ASTOperator.SQRT, + 'abs': ASTOperator.ABS, + + # TODO: Operators supported by libcudf with no Python analog. + # ast.rint: ASTOperator.RINT, + # ast.cbrt: ASTOperator.CBRT, + # ast.ceil: ASTOperator.CEIL, + # ast.floor: ASTOperator.FLOOR, +} + + +# TODO: Consider wrapping this in another function that caches the result along +# with the nodes so that we can avoid multiple parsings. cdef ast_traverse(root, tuple col_names, list stack, list nodes): """Construct an evaluable libcudf expression by traversing Python AST. @@ -227,11 +241,11 @@ cdef ast_traverse(root, tuple col_names, list stack, list nodes): if isinstance(value, ast.UnaryOp): # Faster to directly parse the operand and skip the op. ast_traverse(value.operand, col_names, stack, nodes) - op = python_cudf_ast_map[type(value.op)] + op = python_cudf_operator_map[type(value.op)] nodes.append(stack.pop()) stack.append(Operation(op, nodes[-1])) elif isinstance(value, ast.BinOp): - op = python_cudf_ast_map[type(value.op)] + op = python_cudf_operator_map[type(value.op)] ast_traverse(value.left, col_names, stack, nodes) ast_traverse(value.right, col_names, stack, nodes) @@ -259,13 +273,13 @@ cdef ast_traverse(root, tuple col_names, list stack, list nodes): # Note that this will lead to duplicate nodes, e.g. if # the comparison is `a < b < c` that will be encoded as # `a < b and b < c`. - op = python_cudf_ast_map[type(op)] - ast_traverse(left, col_names, stack, nodes) ast_traverse(right, col_names, stack, nodes) nodes.append(stack.pop()) nodes.append(stack.pop()) + + op = python_cudf_operator_map[type(op)] inner_ops.append(Operation(op, nodes[-1], nodes[-2])) nodes.append(inner_ops[-1]) @@ -281,6 +295,21 @@ cdef ast_traverse(root, tuple col_names, list stack, list nodes): functools.reduce(_combine_compare_ops, inner_ops) stack.append(nodes[-1]) + elif isinstance(value, ast.Call): + if len(value.args) != 1 or value.keywords: + raise ValueError( + f"Function {value.func} only accepts one " + "positional argument." + ) + try: + op = python_cudf_function_map[value.func.id] + except KeyError: + raise ValueError(f"Unsupported function {value.func}.") + # Assuming only unary functions are supported (checked above). + ast_traverse(value.args[0], col_names, stack, nodes) + + nodes.append(stack.pop()) + stack.append(Operation(op, nodes[-1])) elif isinstance(value, list): for item in value: if isinstance(item, ast.AST): diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index cab2ab3c8db..6dc5bd61fa9 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -9286,19 +9286,21 @@ def df_eval(): @pytest.mark.parametrize( - "expr", + "expr, dtype", [ - "df.a + df.b", - "df.a * df.b", - "df.a > df.b", - "df.a > df.b > df.c", - "df.a > df.b < df.c", - "df.a and df.b", - # "df.a and df.b or df.c", + ("df.a + df.b", int), + ("df.a * df.b", int), + ("df.a > df.b", int), + ("df.a > df.b > df.c", int), + ("df.a > df.b < df.c", int), + ("df.a and df.b", int), + # ("df.a and df.b or df.c", int), + ("sin(df.a)", float), ], ) -def test_dataframe_eval(df_eval, expr): +def test_dataframe_eval(df_eval, expr, dtype): + df_eval = df_eval.astype(dtype) expr_stripped = expr.replace("df.", "") expect = pd.eval(expr, local_dict={"df": df_eval.to_pandas()}) got = df_eval.eval(expr_stripped) - assert got.to_pandas().equals(expect) + assert_eq(expect, got, check_names=False) From b11924dcbb8b88e4fcbe7b56fecc52e2b77d43e8 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 11 Apr 2022 11:23:46 -0700 Subject: [PATCH 42/69] Improve error handling for constants. --- python/cudf/cudf/_lib/ast.pyx | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/_lib/ast.pyx b/python/cudf/cudf/_lib/ast.pyx index 66483f727a6..f70433afad4 100644 --- a/python/cudf/cudf/_lib/ast.pyx +++ b/python/cudf/cudf/_lib/ast.pyx @@ -221,8 +221,14 @@ cdef ast_traverse(root, tuple col_names, list stack, list nodes): stack.append(ColumnReference(col_names.index(root.id) + 1)) # Note: in Python > 3.7 ast.Num is a subclass of ast.Constant. We may need # to generalize this code eventually if that inheritance is removed. - elif isinstance(root, ast.Num): - stack.append(Literal(root.n)) + elif isinstance(root, ast.Constant): + try: + stack.append(Literal(root.n)) + except TypeError: + raise ValueError( + f"Only numeric constants are supported, received constant " + f"{repr(root.value)} of type {type(root.value)}." + ) else: # Iterating over `_fields` is _much_ faster than calling # `iter_child_nodes`. That may not matter for practical data sizes From 2379c4d397dc78a9baedc3e2db3bd0f5c36ce195 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 11 Apr 2022 11:45:23 -0700 Subject: [PATCH 43/69] Simplify testing. --- python/cudf/cudf/tests/test_dataframe.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 6dc5bd61fa9..a967fd953b9 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -9288,19 +9288,21 @@ def df_eval(): @pytest.mark.parametrize( "expr, dtype", [ - ("df.a + df.b", int), - ("df.a * df.b", int), - ("df.a > df.b", int), - ("df.a > df.b > df.c", int), - ("df.a > df.b < df.c", int), - ("df.a and df.b", int), - # ("df.a and df.b or df.c", int), - ("sin(df.a)", float), + ("a + b", int), + ("a * b", int), + ("a > b", int), + ("a > b > c", int), + ("a > b < c", int), + ("a and b", int), + # ("a and b or c", int), + ("sin(a)", float), ], ) def test_dataframe_eval(df_eval, expr, dtype): df_eval = df_eval.astype(dtype) - expr_stripped = expr.replace("df.", "") - expect = pd.eval(expr, local_dict={"df": df_eval.to_pandas()}) - got = df_eval.eval(expr_stripped) + expect = df_eval.to_pandas().eval(expr) + got = df_eval.eval(expr) + # In the specific case where the evaluated expression is a unary function + # of a single column with no nesting, pandas will retain the name. This + # level of compatibility is out of scope for now. assert_eq(expect, got, check_names=False) From 53ed0fdd7cf8cd540062fc70262d53b8d722ddaf Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 11 Apr 2022 12:17:13 -0700 Subject: [PATCH 44/69] Some cleanup. --- python/cudf/cudf/_lib/ast.pyx | 25 ++++++++------ python/cudf/cudf/core/dataframe.py | 42 +++++------------------- python/cudf/cudf/tests/test_dataframe.py | 4 +-- 3 files changed, 25 insertions(+), 46 deletions(-) diff --git a/python/cudf/cudf/_lib/ast.pyx b/python/cudf/cudf/_lib/ast.pyx index f70433afad4..78f5cdfbd51 100644 --- a/python/cudf/cudf/_lib/ast.pyx +++ b/python/cudf/cudf/_lib/ast.pyx @@ -74,9 +74,11 @@ class TableReference(Enum): RIGHT = libcudf_ast.table_reference.RIGHT +# Note that this function only currently supports numeric literals. libcudf +# expressions don't really support other types yet though, so this isn't +# restrictive at the moment. cdef class Literal(Expression): def __cinit__(self, value): - # TODO: Generalize this to other types of literals. cdef int val = value self.c_scalar = make_unique[numeric_scalar[int64_t]](val, True) self.c_obj = make_unique[ @@ -128,16 +130,19 @@ python_cudf_operator_map = { ast.Gt: ASTOperator.GREATER, ast.LtE: ASTOperator.LESS_EQUAL, ast.GtE: ASTOperator.GREATER_EQUAL, + ast.BitXor: ASTOperator.BITWISE_XOR, + # TODO: The mapping of logical/bitwise operators here is inconsistent with + # pandas. In pandas, Both `BitAnd` and `And` map to + # `ASTOperator.LOGICAL_AND` for booleans, while they map to + # `ASTOperator.BITWISE_AND` for integers. However, there is no good way to + # encode this at present because expressions can be arbitrarily nested so + # we won't know the dtype of the input without inserting a much more + # complex traversal of the expression tree to determine the output types at + # each node. For now, we'll rely on users to use the appropriate operator. ast.BitAnd: ASTOperator.BITWISE_AND, ast.BitOr: ASTOperator.BITWISE_OR, - ast.BitXor: ASTOperator.BITWISE_XOR, - # TODO: These maps are wrong, but for pandas compatibility they actually - # need to be dtype-specific so this is just for testing the AST parsing - # logic. Eventually the lookup for these will need to be updated. - ast.And: ASTOperator.BITWISE_AND, - ast.Or: ASTOperator.BITWISE_OR, - # ast.And: ASTOperator.LOGICAL_AND, - # ast.Or: ASTOperator.LOGICAL_OR, + ast.And: ASTOperator.LOGICAL_AND, + ast.Or: ASTOperator.LOGICAL_OR, # Unary operators # ast.Identity: ASTOperator.IDENTITY, @@ -335,7 +340,7 @@ def evaluate_expression(object df, Expression expr): dereference(expr.c_obj.get()) ) ) - return {'None': Column.from_unique_ptr(move(col))} + return {None: Column.from_unique_ptr(move(col))} def make_and_evaluate_expression(df, expr): diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 45a1b7143bf..3fd3ebf5463 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6264,16 +6264,16 @@ def eval(self, expr: "str", inplace: "bool" = False, **kwargs): ndarray, scalar, pandas object, or None The result of the evaluation or None if ``inplace=True``. - See Also - -------- - DataFrame.query : Evaluates a boolean expression to query the columns - of a frame. - Notes ----- - For more details see the API documentation for :func:`~eval`. - For detailed examples see :ref:`enhancing performance with eval - `. + Difference from pandas: + * The inplace argument or additional kwargs are not supported. + * Bitwise and logical operators are not dtype-dependent. + Specifically, `&` must be used for bitwise operators on integers, + not `and`, which is specifically for the logical and between + booleans. + * String columns are not yet supported. + * Only numerical literals are supported. Examples -------- @@ -6310,32 +6310,6 @@ def eval(self, expr: "str", inplace: "bool" = False, **kwargs): 2 3 6 3 4 4 4 5 2 - - Use ``inplace=True`` to modify the original DataFrame. - - >>> df.eval('C = A + B', inplace=True) - >>> df - A B C - 0 1 10 11 - 1 2 8 10 - 2 3 6 9 - 3 4 4 8 - 4 5 2 7 - - Multiple columns can be assigned to using multi-line expressions: - - >>> df.eval( - ... ''' - ... C = A + B - ... D = A - B - ... ''' - ... ) - A B C D - 0 1 10 11 -9 - 1 2 8 10 -6 - 2 3 6 9 -3 - 3 4 4 8 0 - 4 5 2 7 3 """ return Series._from_data( libcudf.ast.make_and_evaluate_expression(self, expr) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index a967fd953b9..65dd0564ef6 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -9293,8 +9293,8 @@ def df_eval(): ("a > b", int), ("a > b > c", int), ("a > b < c", int), - ("a and b", int), - # ("a and b or c", int), + ("a & b", int), + # ("a & b | c", int), ("sin(a)", float), ], ) From efe03cfd8f2d338a9e069624f0aed3b37f5c563b Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 11 Apr 2022 13:32:25 -0700 Subject: [PATCH 45/69] Fix recursive cases and add more tests. --- python/cudf/cudf/_lib/ast.pyx | 181 ++++++++++------------- python/cudf/cudf/tests/test_dataframe.py | 7 +- 2 files changed, 87 insertions(+), 101 deletions(-) diff --git a/python/cudf/cudf/_lib/ast.pyx b/python/cudf/cudf/_lib/ast.pyx index 78f5cdfbd51..5bcb998408a 100644 --- a/python/cudf/cudf/_lib/ast.pyx +++ b/python/cudf/cudf/_lib/ast.pyx @@ -223,110 +223,91 @@ cdef ast_traverse(root, tuple col_names, list stack, list nodes): # but compatible dtypes (e.g. adding int to float). # Base cases: Name if isinstance(root, ast.Name): - stack.append(ColumnReference(col_names.index(root.id) + 1)) + try: + col_id = col_names.index(root.id) + 1 + except ValueError: + raise ValueError(f"Unknown column name {root.id}") + stack.append(ColumnReference(col_id)) # Note: in Python > 3.7 ast.Num is a subclass of ast.Constant. We may need # to generalize this code eventually if that inheritance is removed. - elif isinstance(root, ast.Constant): + elif isinstance(root, ast.Num): + stack.append(Literal(root.n)) + elif isinstance(root, ast.UnaryOp): + # Faster to directly parse the operand and skip the op. + ast_traverse(root.operand, col_names, stack, nodes) + op = python_cudf_operator_map[type(root.op)] + nodes.append(stack.pop()) + stack.append(Operation(op, nodes[-1])) + elif isinstance(root, ast.BinOp): + op = python_cudf_operator_map[type(root.op)] + + ast_traverse(root.left, col_names, stack, nodes) + ast_traverse(root.right, col_names, stack, nodes) + + nodes.append(stack.pop()) + nodes.append(stack.pop()) + stack.append(Operation(op, nodes[-1], nodes[-2])) + + # TODO: Whether And/Or and BitAnd/BitOr actually correspond to + # logical or bitwise operators depends on the data types that they + # are applied to. We'll need to add logic to map to that. + elif isinstance(root, (ast.BoolOp, ast.Compare)): + if isinstance(root, ast.BoolOp): + operators = repeat(root.op) + operands = zip(root.values[:-1], root.values[1:]) + multiple_ops = len(root.values) > 2 + else: + operators = root.ops + operands = (root.left, *root.comparators) + multiple_ops = len(operands) > 2 + operands = zip(operands[:-1], operands[1:]) + + inner_ops = [] + for op, (left, right) in zip(operators, operands): + # Note that this will lead to duplicate nodes, e.g. if + # the comparison is `a < b < c` that will be encoded as + # `a < b and b < c`. + ast_traverse(left, col_names, stack, nodes) + ast_traverse(right, col_names, stack, nodes) + + nodes.append(stack.pop()) + nodes.append(stack.pop()) + + op = python_cudf_operator_map[type(op)] + inner_ops.append(Operation(op, nodes[-1], nodes[-2])) + nodes.append(inner_ops[-1]) + + # If we have more than one comparator, we need to link them + # together with a bunch of LOGICAL_AND operators. + if multiple_ops: + op = ASTOperator.LOGICAL_AND + + def _combine_compare_ops(left, right): + nodes.append(Operation(op, left, right)) + return nodes[-1] + + functools.reduce(_combine_compare_ops, inner_ops) + + stack.append(nodes[-1]) + elif isinstance(root, ast.Call): try: - stack.append(Literal(root.n)) - except TypeError: + op = python_cudf_function_map[root.func.id] + except KeyError: + raise ValueError(f"Unsupported function {root.func}.") + # Assuming only unary functions are supported, which is checked above. + if len(root.args) != 1 or root.keywords: raise ValueError( - f"Only numeric constants are supported, received constant " - f"{repr(root.value)} of type {type(root.value)}." + f"Function {root.func} only accepts one positional " + "argument." ) - else: - # Iterating over `_fields` is _much_ faster than calling - # `iter_child_nodes`. That may not matter for practical data sizes - # though, so we'll need to benchmark. - # for value in ast.iter_child_nodes(root): - for field in root._fields: - # Note that since the fields of `value` are ordered, in some cases - # (e.g. `ast.BinOp`) a single call to `ast_traverse(value, ...)` - # would have the same effect as explicitly invoking it on the - # fields (e.g. `left` and `right` for `ast.BinOp`). However, this - # relies on the fields and their ordering not changing in future - # Python versions, and that we won't change the parsing logic for - # e.g. the operators, so it's best to be explicit in all the - # branches below. - value = getattr(root, field) - if isinstance(value, ast.UnaryOp): - # Faster to directly parse the operand and skip the op. - ast_traverse(value.operand, col_names, stack, nodes) - op = python_cudf_operator_map[type(value.op)] - nodes.append(stack.pop()) - stack.append(Operation(op, nodes[-1])) - elif isinstance(value, ast.BinOp): - op = python_cudf_operator_map[type(value.op)] - - ast_traverse(value.left, col_names, stack, nodes) - ast_traverse(value.right, col_names, stack, nodes) - - nodes.append(stack.pop()) - nodes.append(stack.pop()) - stack.append(Operation(op, nodes[-1], nodes[-2])) - - # TODO: Whether And/Or and BitAnd/BitOr actually correspond to - # logical or bitwise operators depends on the data types that they - # are applied to. We'll need to add logic to map to that. - elif isinstance(value, (ast.BoolOp, ast.Compare)): - if isinstance(value, ast.BoolOp): - operators = repeat(value.op) - operands = zip(value.values[:-1], value.values[1:]) - multiple_ops = len(value.values) > 2 - else: - operators = value.ops - operands = (value.left, *value.comparators) - multiple_ops = len(operands) > 2 - operands = zip(operands[:-1], operands[1:]) - - inner_ops = [] - for op, (left, right) in zip(operators, operands): - # Note that this will lead to duplicate nodes, e.g. if - # the comparison is `a < b < c` that will be encoded as - # `a < b and b < c`. - ast_traverse(left, col_names, stack, nodes) - ast_traverse(right, col_names, stack, nodes) - - nodes.append(stack.pop()) - nodes.append(stack.pop()) - - op = python_cudf_operator_map[type(op)] - inner_ops.append(Operation(op, nodes[-1], nodes[-2])) - nodes.append(inner_ops[-1]) - - # If we have more than one comparator, we need to link them - # together with a bunch of LOGICAL_AND operators. - if multiple_ops: - op = ASTOperator.LOGICAL_AND - - def _combine_compare_ops(left, right): - nodes.append(Operation(op, left, right)) - return nodes[-1] - - functools.reduce(_combine_compare_ops, inner_ops) - - stack.append(nodes[-1]) - elif isinstance(value, ast.Call): - if len(value.args) != 1 or value.keywords: - raise ValueError( - f"Function {value.func} only accepts one " - "positional argument." - ) - try: - op = python_cudf_function_map[value.func.id] - except KeyError: - raise ValueError(f"Unsupported function {value.func}.") - # Assuming only unary functions are supported (checked above). - ast_traverse(value.args[0], col_names, stack, nodes) - - nodes.append(stack.pop()) - stack.append(Operation(op, nodes[-1])) - elif isinstance(value, list): - for item in value: - if isinstance(item, ast.AST): - ast_traverse(item, col_names, stack, nodes) - elif isinstance(value, ast.AST): - ast_traverse(value, col_names, stack, nodes) + ast_traverse(root.args[0], col_names, stack, nodes) + + nodes.append(stack.pop()) + stack.append(Operation(op, nodes[-1])) + elif isinstance(root, list): + for item in root: + if isinstance(item, ast.AST): + ast_traverse(item, col_names, stack, nodes) def evaluate_expression(object df, Expression expr): @@ -349,6 +330,6 @@ def make_and_evaluate_expression(df, expr): # nodes created (the owning ColumnReferences and Literals) remain in scope. stack = [] nodes = [] - ast_traverse(ast.parse(expr), df._column_names, stack, nodes) + ast_traverse(ast.parse(expr).body[0].value, df._column_names, stack, nodes) # At the end, all the stack contains is the expression to evaluate. return evaluate_expression(df, stack[-1]) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 65dd0564ef6..7b69a11f56a 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -9285,17 +9285,22 @@ def df_eval(): ) +# Note that for now expressions do not automatically handle casting, so inputs +# need to be casted appropriately @pytest.mark.parametrize( "expr, dtype", [ ("a + b", int), + ("a / b", float), ("a * b", int), ("a > b", int), ("a > b > c", int), ("a > b < c", int), ("a & b", int), - # ("a & b | c", int), + ("a & b | c", int), ("sin(a)", float), + ("exp(sin(abs(a)))", float), + ("(a + b) - (c * d)", int), ], ) def test_dataframe_eval(df_eval, expr, dtype): From c0316f222c8f8b15ec3c0dc1d2bdf7478da901ad Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 11 Apr 2022 13:43:08 -0700 Subject: [PATCH 46/69] Minor cleanup. --- python/cudf/cudf/_lib/ast.pyx | 13 +++++-------- python/cudf/cudf/_lib/cpp/ast.pxd | 6 +++--- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/python/cudf/cudf/_lib/ast.pyx b/python/cudf/cudf/_lib/ast.pyx index 5bcb998408a..1cefc1727ba 100644 --- a/python/cudf/cudf/_lib/ast.pyx +++ b/python/cudf/cudf/_lib/ast.pyx @@ -30,7 +30,7 @@ class ASTOperator(Enum): PYMOD = libcudf_ast.ast_operator.PYMOD POW = libcudf_ast.ast_operator.POW EQUAL = libcudf_ast.ast_operator.EQUAL - # NULL_EQUAL = libcudf_ast.ast_operator.NULL_EQUAL + NULL_EQUAL = libcudf_ast.ast_operator.NULL_EQUAL NOT_EQUAL = libcudf_ast.ast_operator.NOT_EQUAL LESS = libcudf_ast.ast_operator.LESS GREATER = libcudf_ast.ast_operator.GREATER @@ -40,9 +40,9 @@ class ASTOperator(Enum): BITWISE_OR = libcudf_ast.ast_operator.BITWISE_OR BITWISE_XOR = libcudf_ast.ast_operator.BITWISE_XOR LOGICAL_AND = libcudf_ast.ast_operator.LOGICAL_AND - # NULL_LOGICAL_AND = libcudf_ast.ast_operator.NULL_LOGICAL_AND + NULL_LOGICAL_AND = libcudf_ast.ast_operator.NULL_LOGICAL_AND LOGICAL_OR = libcudf_ast.ast_operator.LOGICAL_OR - # NULL_LOGICAL_OR = libcudf_ast.ast_operator.NULL_LOGICAL_OR + NULL_LOGICAL_OR = libcudf_ast.ast_operator.NULL_LOGICAL_OR # Unary operators IDENTITY = libcudf_ast.ast_operator.IDENTITY SIN = libcudf_ast.ast_operator.SIN @@ -113,16 +113,13 @@ cdef class Operation(Expression): # This dictionary encodes the mapping from Python AST operators to their cudf # counterparts. python_cudf_operator_map = { - # TODO: Mapping TBD for commented out operators. # Binary operators ast.Add: ASTOperator.ADD, ast.Sub: ASTOperator.SUB, ast.Mult: ASTOperator.MUL, ast.Div: ASTOperator.DIV, - # ast.True: ASTOperator.TRUE_DIV, ast.FloorDiv: ASTOperator.FLOOR_DIV, ast.Mod: ASTOperator.PYMOD, - # ast.Pymod: ASTOperator.PYMOD, ast.Pow: ASTOperator.POW, ast.Eq: ASTOperator.EQUAL, ast.NotEq: ASTOperator.NOT_EQUAL, @@ -175,7 +172,7 @@ python_cudf_function_map = { 'sqrt': ASTOperator.SQRT, 'abs': ASTOperator.ABS, - # TODO: Operators supported by libcudf with no Python analog. + # TODO: Operators supported by libcudf with no Python function analog. # ast.rint: ASTOperator.RINT, # ast.cbrt: ASTOperator.CBRT, # ast.ceil: ASTOperator.CEIL, @@ -278,7 +275,7 @@ cdef ast_traverse(root, tuple col_names, list stack, list nodes): nodes.append(inner_ops[-1]) # If we have more than one comparator, we need to link them - # together with a bunch of LOGICAL_AND operators. + # together with LOGICAL_AND operators. if multiple_ops: op = ASTOperator.LOGICAL_AND diff --git a/python/cudf/cudf/_lib/cpp/ast.pxd b/python/cudf/cudf/_lib/cpp/ast.pxd index 426bbba6be4..70472c5dca2 100644 --- a/python/cudf/cudf/_lib/cpp/ast.pxd +++ b/python/cudf/cudf/_lib/cpp/ast.pxd @@ -25,7 +25,7 @@ cdef extern from "cudf/ast/expressions.hpp" namespace "cudf::ast" nogil: PYMOD "cudf::ast::ast_operator::PYMOD" POW "cudf::ast::ast_operator::POW" EQUAL "cudf::ast::ast_operator::EQUAL" - # NULL_EQUAL "cudf::ast::ast_operator::NULL_EQUAL" + NULL_EQUAL "cudf::ast::ast_operator::NULL_EQUAL" NOT_EQUAL "cudf::ast::ast_operator::NOT_EQUAL" LESS "cudf::ast::ast_operator::LESS" GREATER "cudf::ast::ast_operator::GREATER" @@ -34,9 +34,9 @@ cdef extern from "cudf/ast/expressions.hpp" namespace "cudf::ast" nogil: BITWISE_AND "cudf::ast::ast_operator::BITWISE_AND" BITWISE_OR "cudf::ast::ast_operator::BITWISE_OR" BITWISE_XOR "cudf::ast::ast_operator::BITWISE_XOR" - # NULL_LOGICAL_AND "cudf::ast::ast_operator::NULL_LOGICAL_AND" + NULL_LOGICAL_AND "cudf::ast::ast_operator::NULL_LOGICAL_AND" LOGICAL_AND "cudf::ast::ast_operator::LOGICAL_AND" - # NULL_LOGICAL_OR "cudf::ast::ast_operator::NULL_LOGICAL_OR" + NULL_LOGICAL_OR "cudf::ast::ast_operator::NULL_LOGICAL_OR" LOGICAL_OR "cudf::ast::ast_operator::LOGICAL_OR" # Unary operators IDENTITY "cudf::ast::ast_operator::IDENTITY" From 8c2da11b230fed610f2ef6399084584e5b461fc8 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 11 Apr 2022 14:42:32 -0700 Subject: [PATCH 47/69] Improve some code style and formatting. --- python/cudf/cudf/_lib/ast.pyx | 36 ++++++++++++++----------- python/cudf/cudf/_lib/cpp/ast.pxd | 6 ----- python/cudf/cudf/_lib/cpp/transform.pxd | 8 +++++- 3 files changed, 27 insertions(+), 23 deletions(-) diff --git a/python/cudf/cudf/_lib/ast.pyx b/python/cudf/cudf/_lib/ast.pyx index 1cefc1727ba..444013adfc7 100644 --- a/python/cudf/cudf/_lib/ast.pyx +++ b/python/cudf/cudf/_lib/ast.pyx @@ -12,12 +12,15 @@ from libcpp.utility cimport move from cudf._lib.ast cimport underlying_type_ast_operator from cudf._lib.column cimport Column -from cudf._lib.cpp cimport ast as libcudf_ast +from cudf._lib.cpp cimport ast as libcudf_ast, transform as libcudf_transform from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport size_type from cudf._lib.utils cimport table_view_from_table +# Aliases for simplicity +ctypedef unique_ptr[libcudf_ast.expression] expression_ptr + class ASTOperator(Enum): ADD = libcudf_ast.ast_operator.ADD @@ -81,15 +84,16 @@ cdef class Literal(Expression): def __cinit__(self, value): cdef int val = value self.c_scalar = make_unique[numeric_scalar[int64_t]](val, True) - self.c_obj = make_unique[ - libcudf_ast.literal]( - dereference(self.c_scalar)) + self.c_obj = make_unique[libcudf_ast.literal]( + dereference(self.c_scalar) + ) cdef class ColumnReference(Expression): def __cinit__(self, size_type index): - self.c_obj = make_unique[ - libcudf_ast.column_reference](index) + self.c_obj = make_unique[libcudf_ast.column_reference]( + index + ) cdef class Operation(Expression): @@ -97,16 +101,17 @@ cdef class Operation(Expression): # This awkward double casting is the only way to get Cython to generate # valid C++ that doesn't try to apply the shift operator directly to # values of the enum (which is invalid). - cdef libcudf_ast.ast_operator op_value = ( - op.value) + cdef libcudf_ast.ast_operator op_value = ( + op.value + ) if right is None: - self.c_obj = make_unique[ - libcudf_ast.operation](op_value, dereference(left.c_obj)) + self.c_obj = make_unique[libcudf_ast.operation]( + op_value, dereference(left.c_obj) + ) else: - self.c_obj = make_unique[ - libcudf_ast.operation]( - op_value, dereference(left.c_obj), dereference(right.c_obj) + self.c_obj = make_unique[libcudf_ast.operation]( + op_value, dereference(left.c_obj), dereference(right.c_obj) ) @@ -303,8 +308,7 @@ cdef ast_traverse(root, tuple col_names, list stack, list nodes): stack.append(Operation(op, nodes[-1])) elif isinstance(root, list): for item in root: - if isinstance(item, ast.AST): - ast_traverse(item, col_names, stack, nodes) + ast_traverse(item, col_names, stack, nodes) def evaluate_expression(object df, Expression expr): @@ -313,7 +317,7 @@ def evaluate_expression(object df, Expression expr): cdef table_view tbl = table_view_from_table(df) with nogil: col = move( - libcudf_ast.compute_column( + libcudf_transform.compute_column( tbl, dereference(expr.c_obj.get()) ) diff --git a/python/cudf/cudf/_lib/cpp/ast.pxd b/python/cudf/cudf/_lib/cpp/ast.pxd index 70472c5dca2..a7700551754 100644 --- a/python/cudf/cudf/_lib/cpp/ast.pxd +++ b/python/cudf/cudf/_lib/cpp/ast.pxd @@ -86,9 +86,3 @@ cdef extern from "cudf/ast/expressions.hpp" namespace "cudf::ast" nogil: cdef cppclass operation(expression): operation(ast_operator, const expression &) operation(ast_operator, const expression&, const expression&) - -cdef extern from "cudf/transform.hpp" namespace "cudf" nogil: - cdef unique_ptr[column] compute_column( - const table_view table, - const expression &expr - ) except + diff --git a/python/cudf/cudf/_lib/cpp/transform.pxd b/python/cudf/cudf/_lib/cpp/transform.pxd index 590a371ff52..a37ae346dbf 100644 --- a/python/cudf/cudf/_lib/cpp/transform.pxd +++ b/python/cudf/cudf/_lib/cpp/transform.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from libcpp cimport bool from libcpp.memory cimport unique_ptr @@ -7,6 +7,7 @@ from libcpp.string cimport string from rmm._lib.device_buffer cimport device_buffer +from cudf._lib.cpp.ast cimport expression from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.table.table cimport table @@ -42,3 +43,8 @@ cdef extern from "cudf/transform.hpp" namespace "cudf" nogil: column_view input_column, column_view categories ) + + cdef unique_ptr[column] compute_column( + const table_view table, + const expression &expr + ) except + From 8fcca8df4b9e7b6d4c088d3d12aa0748ba52452e Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 11 Apr 2022 16:05:02 -0700 Subject: [PATCH 48/69] Enable unary ops and do some cleanup. --- python/cudf/cudf/_lib/ast.pyx | 59 +++++++++++------------- python/cudf/cudf/core/dataframe.py | 4 +- python/cudf/cudf/tests/test_dataframe.py | 4 ++ 3 files changed, 33 insertions(+), 34 deletions(-) diff --git a/python/cudf/cudf/_lib/ast.pyx b/python/cudf/cudf/_lib/ast.pyx index 444013adfc7..5f2b60762be 100644 --- a/python/cudf/cudf/_lib/ast.pyx +++ b/python/cudf/cudf/_lib/ast.pyx @@ -3,7 +3,6 @@ import ast import functools from enum import Enum -from itertools import repeat from cython.operator cimport dereference from libc.stdint cimport int64_t @@ -147,9 +146,8 @@ python_cudf_operator_map = { ast.Or: ASTOperator.LOGICAL_OR, # Unary operators - # ast.Identity: ASTOperator.IDENTITY, - # ast.Bit: ASTOperator.BIT_INVERT, - # ast.Not: ASTOperator.NOT, + ast.Invert: ASTOperator.BIT_INVERT, + ast.Not: ASTOperator.NOT, } @@ -187,7 +185,7 @@ python_cudf_function_map = { # TODO: Consider wrapping this in another function that caches the result along # with the nodes so that we can avoid multiple parsings. -cdef ast_traverse(root, tuple col_names, list stack, list nodes): +cdef parse_expression(root, tuple col_names, list stack, list nodes): """Construct an evaluable libcudf expression by traversing Python AST. This function performs a recursive traversal of the provided root @@ -210,15 +208,15 @@ cdef ast_traverse(root, tuple col_names, list stack, list nodes): created. When parsing the current root requires creating an Operation node, a suitable number of elements (corresponding to the arity of the operator) are popped from the stack as the operands for the operation. - When the recursive traversal is complete, the stack will have length - exactly one and contain the expression to evaluate. + When the recursive traversal is complete, the stack will contain + exactly one element, the expression to evaluate. nodes : list The set of all nodes created while parsing the expression. This argument is necessary because all C++ node types are non-owning objects, so if the Python Expressions corresponding to nodes in the expression go out of scope and are garbage-collected the final expression will contain references to invalid data and seg fault upon - evaluation. This list must remain in scope until the expression has + evaluation. This list must remain in scope until the expression has been evaluated. """ # TODO: We'll eventually need to find a way to support operations on mixed @@ -236,15 +234,15 @@ cdef ast_traverse(root, tuple col_names, list stack, list nodes): stack.append(Literal(root.n)) elif isinstance(root, ast.UnaryOp): # Faster to directly parse the operand and skip the op. - ast_traverse(root.operand, col_names, stack, nodes) + parse_expression(root.operand, col_names, stack, nodes) op = python_cudf_operator_map[type(root.op)] nodes.append(stack.pop()) stack.append(Operation(op, nodes[-1])) elif isinstance(root, ast.BinOp): op = python_cudf_operator_map[type(root.op)] - ast_traverse(root.left, col_names, stack, nodes) - ast_traverse(root.right, col_names, stack, nodes) + parse_expression(root.left, col_names, stack, nodes) + parse_expression(root.right, col_names, stack, nodes) nodes.append(stack.pop()) nodes.append(stack.pop()) @@ -255,7 +253,7 @@ cdef ast_traverse(root, tuple col_names, list stack, list nodes): # are applied to. We'll need to add logic to map to that. elif isinstance(root, (ast.BoolOp, ast.Compare)): if isinstance(root, ast.BoolOp): - operators = repeat(root.op) + operators = [root.op] * (len(root.values) - 1) operands = zip(root.values[:-1], root.values[1:]) multiple_ops = len(root.values) > 2 else: @@ -269,8 +267,8 @@ cdef ast_traverse(root, tuple col_names, list stack, list nodes): # Note that this will lead to duplicate nodes, e.g. if # the comparison is `a < b < c` that will be encoded as # `a < b and b < c`. - ast_traverse(left, col_names, stack, nodes) - ast_traverse(right, col_names, stack, nodes) + parse_expression(left, col_names, stack, nodes) + parse_expression(right, col_names, stack, nodes) nodes.append(stack.pop()) nodes.append(stack.pop()) @@ -302,35 +300,34 @@ cdef ast_traverse(root, tuple col_names, list stack, list nodes): f"Function {root.func} only accepts one positional " "argument." ) - ast_traverse(root.args[0], col_names, stack, nodes) + parse_expression(root.args[0], col_names, stack, nodes) nodes.append(stack.pop()) stack.append(Operation(op, nodes[-1])) elif isinstance(root, list): for item in root: - ast_traverse(item, col_names, stack, nodes) + parse_expression(item, col_names, stack, nodes) -def evaluate_expression(object df, Expression expr): - """Evaluate an Expression on a DataFrame.""" - cdef unique_ptr[column] col +def evaluate_expression(df, expr): + """Create a cudf evaluable expression from a string and evaluate it.""" + # Important: both make and evaluate must be coupled to guarantee that the + # nodes created (the owning ColumnReferences and Literals) remain in scope. + stack = [] + nodes = [] + parse_expression( + ast.parse(expr).body[0].value, df._column_names, stack, nodes + ) + + # At the end, all the stack contains is the expression to evaluate. + cdef Expression cudf_expr = stack[-1] cdef table_view tbl = table_view_from_table(df) + cdef unique_ptr[column] col with nogil: col = move( libcudf_transform.compute_column( tbl, - dereference(expr.c_obj.get()) + dereference(cudf_expr.c_obj.get()) ) ) return {None: Column.from_unique_ptr(move(col))} - - -def make_and_evaluate_expression(df, expr): - """Create a cudf evaluable expression from a string and evaluate it.""" - # Important: both make and evaluate must be coupled to guarantee that the - # nodes created (the owning ColumnReferences and Literals) remain in scope. - stack = [] - nodes = [] - ast_traverse(ast.parse(expr).body[0].value, df._column_names, stack, nodes) - # At the end, all the stack contains is the expression to evaluate. - return evaluate_expression(df, stack[-1]) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 3fd3ebf5463..be4c0a3367e 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6311,9 +6311,7 @@ def eval(self, expr: "str", inplace: "bool" = False, **kwargs): 3 4 4 4 5 2 """ - return Series._from_data( - libcudf.ast.make_and_evaluate_expression(self, expr) - ) + return Series._from_data(libcudf.ast.evaluate_expression(self, expr)) def from_dataframe(df, allow_copy=False): diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 7b69a11f56a..37987bd83b4 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -9301,6 +9301,10 @@ def df_eval(): ("sin(a)", float), ("exp(sin(abs(a)))", float), ("(a + b) - (c * d)", int), + ("~a", int), + ("(a > b) and (c > d)", int), + ("(a > b) or (c > d)", int), + ("not (a > b)", int), ], ) def test_dataframe_eval(df_eval, expr, dtype): From 76505b611388037f5dd1b6792ce78c3aa713af51 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 11 Apr 2022 16:31:37 -0700 Subject: [PATCH 49/69] Some attempts to streamline the function flow. --- python/cudf/cudf/_lib/ast.pyx | 21 +++++++++++++-------- python/cudf/cudf/core/dataframe.py | 3 +++ 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/python/cudf/cudf/_lib/ast.pyx b/python/cudf/cudf/_lib/ast.pyx index 5f2b60762be..760a9adcf54 100644 --- a/python/cudf/cudf/_lib/ast.pyx +++ b/python/cudf/cudf/_lib/ast.pyx @@ -230,22 +230,26 @@ cdef parse_expression(root, tuple col_names, list stack, list nodes): stack.append(ColumnReference(col_id)) # Note: in Python > 3.7 ast.Num is a subclass of ast.Constant. We may need # to generalize this code eventually if that inheritance is removed. - elif isinstance(root, ast.Num): - stack.append(Literal(root.n)) + if isinstance(root, ast.Constant): + if isinstance(root, ast.Num): + stack.append(Literal(root.n)) + else: + raise ValueError( + f"Unsupported literal {repr(root.value)} of type " + "{type(root.value)}" + ) elif isinstance(root, ast.UnaryOp): - # Faster to directly parse the operand and skip the op. parse_expression(root.operand, col_names, stack, nodes) - op = python_cudf_operator_map[type(root.op)] nodes.append(stack.pop()) + op = python_cudf_operator_map[type(root.op)] stack.append(Operation(op, nodes[-1])) elif isinstance(root, ast.BinOp): - op = python_cudf_operator_map[type(root.op)] - parse_expression(root.left, col_names, stack, nodes) parse_expression(root.right, col_names, stack, nodes) - nodes.append(stack.pop()) nodes.append(stack.pop()) + + op = python_cudf_operator_map[type(root.op)] stack.append(Operation(op, nodes[-1], nodes[-2])) # TODO: Whether And/Or and BitAnd/BitOr actually correspond to @@ -275,7 +279,8 @@ cdef parse_expression(root, tuple col_names, list stack, list nodes): op = python_cudf_operator_map[type(op)] inner_ops.append(Operation(op, nodes[-1], nodes[-2])) - nodes.append(inner_ops[-1]) + + nodes.extend(inner_ops) # If we have more than one comparator, we need to link them # together with LOGICAL_AND operators. diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index be4c0a3367e..145a55da9b2 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6274,6 +6274,9 @@ def eval(self, expr: "str", inplace: "bool" = False, **kwargs): booleans. * String columns are not yet supported. * Only numerical literals are supported. + * Operators generally will not cast automatically. Users are + responsible for casting columns to suitable types before + evaluating a function. Examples -------- From 63711497413ee9c0898d33382307b6f7cd74a80a Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 11 Apr 2022 17:06:21 -0700 Subject: [PATCH 50/69] Enable caching properly. --- python/cudf/cudf/_lib/ast.pyx | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/python/cudf/cudf/_lib/ast.pyx b/python/cudf/cudf/_lib/ast.pyx index 760a9adcf54..2f75a2cc99d 100644 --- a/python/cudf/cudf/_lib/ast.pyx +++ b/python/cudf/cudf/_lib/ast.pyx @@ -3,6 +3,7 @@ import ast import functools from enum import Enum +from typing import List from cython.operator cimport dereference from libc.stdint cimport int64_t @@ -183,8 +184,6 @@ python_cudf_function_map = { } -# TODO: Consider wrapping this in another function that caches the result along -# with the nodes so that we can avoid multiple parsings. cdef parse_expression(root, tuple col_names, list stack, list nodes): """Construct an evaluable libcudf expression by traversing Python AST. @@ -314,18 +313,35 @@ cdef parse_expression(root, tuple col_names, list stack, list nodes): parse_expression(item, col_names, stack, nodes) -def evaluate_expression(df, expr): - """Create a cudf evaluable expression from a string and evaluate it.""" - # Important: both make and evaluate must be coupled to guarantee that the - # nodes created (the owning ColumnReferences and Literals) remain in scope. +# TODO: It would be nice to use a dataclass for this, but Cython won't support +# it until we upgrade to 3.0. +class _OwningExpression: + """A container for an Expression that owns the Expression's subnodes.""" + def __init__(self, expression: Expression, nodes: List[Expression]): + self.expression = expression + self.nodes = nodes + + +@functools.lru_cache(256) +def parse_expression_cached(str expr, tuple col_names): + """A caching wrapper for parse_expression. + + The signature is chosen so as to appropriately determine the cache key. + """ stack = [] nodes = [] parse_expression( - ast.parse(expr).body[0].value, df._column_names, stack, nodes + ast.parse(expr).body[0].value, col_names, stack, nodes ) + return _OwningExpression(stack[-1], nodes) + + +def evaluate_expression(df: "cudf.DataFrame", expr: str): + """Create a cudf evaluable expression from a string and evaluate it.""" + expr_container = parse_expression_cached(expr, df._column_names) # At the end, all the stack contains is the expression to evaluate. - cdef Expression cudf_expr = stack[-1] + cdef Expression cudf_expr = expr_container.expression cdef table_view tbl = table_view_from_table(df) cdef unique_ptr[column] col with nogil: From 4c27f7039116709deb1de1d4c9130546810192bc Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 12 Apr 2022 14:10:31 -0700 Subject: [PATCH 51/69] Fix docstring. --- python/cudf/cudf/core/dataframe.py | 22 +++------------------- 1 file changed, 3 insertions(+), 19 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 145a55da9b2..5f8002824e0 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6277,10 +6277,12 @@ def eval(self, expr: "str", inplace: "bool" = False, **kwargs): * Operators generally will not cast automatically. Users are responsible for casting columns to suitable types before evaluating a function. + * We do not support multiple statements in the same expression. + * We do not support assignment Examples -------- - >>> df = pd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)}) + >>> df = cudf.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)}) >>> df A B 0 1 10 @@ -6295,24 +6297,6 @@ def eval(self, expr: "str", inplace: "bool" = False, **kwargs): 3 8 4 7 dtype: int64 - - Assignment is allowed though by default the original DataFrame is not - modified. - - >>> df.eval('C = A + B') - A B C - 0 1 10 11 - 1 2 8 10 - 2 3 6 9 - 3 4 4 8 - 4 5 2 7 - >>> df - A B - 0 1 10 - 1 2 8 - 2 3 6 - 3 4 4 - 4 5 2 """ return Series._from_data(libcudf.ast.evaluate_expression(self, expr)) From 954b99859fc28687a58db50c5fef0375b18f1610 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 12 Apr 2022 15:32:29 -0700 Subject: [PATCH 52/69] Test out approach using a NodeVisitor instead of a recursive function. --- python/cudf/cudf/_lib/ast.pyx | 125 +++++++++++++++++++++++++++++++++- 1 file changed, 124 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/ast.pyx b/python/cudf/cudf/_lib/ast.pyx index 2f75a2cc99d..3cbd171d814 100644 --- a/python/cudf/cudf/_lib/ast.pyx +++ b/python/cudf/cudf/_lib/ast.pyx @@ -82,6 +82,7 @@ class TableReference(Enum): # restrictive at the moment. cdef class Literal(Expression): def __cinit__(self, value): + # TODO: Enable floating point scalars. cdef int val = value self.c_scalar = make_unique[numeric_scalar[int64_t]](val, True) self.c_obj = make_unique[libcudf_ast.literal]( @@ -149,6 +150,7 @@ python_cudf_operator_map = { # Unary operators ast.Invert: ASTOperator.BIT_INVERT, ast.Not: ASTOperator.NOT, + # TODO: Missing USub, possibility other unary ops? } @@ -313,6 +315,100 @@ cdef parse_expression(root, tuple col_names, list stack, list nodes): parse_expression(item, col_names, stack, nodes) +class libcudfASTVisitor(ast.NodeVisitor): + def __init__(self, col_names): + self.stack = [] + self.nodes = [] + self.col_names = col_names + + def visit_Name(self, node): + try: + col_id = self.col_names.index(node.id) + 1 + except ValueError: + raise ValueError(f"Unknown column name {node.id}") + self.stack.append(ColumnReference(col_id)) + + def visit_Constant(self, node): + if not isinstance(node, ast.Num): + raise ValueError( + f"Unsupported literal {repr(node.value)} of type " + "{type(node.value)}" + ) + self.stack.append(Literal(node.value)) + + def visit_UnaryOp(self, node): + self.visit(node.operand) + self.nodes.append(self.stack.pop()) + op = python_cudf_operator_map[type(node.op)] + self.stack.append(Operation(op, self.nodes[-1])) + + def visit_BinOp(self, node): + self.visit(node.left) + self.visit(node.right) + self.nodes.append(self.stack.pop()) + self.nodes.append(self.stack.pop()) + + op = python_cudf_operator_map[type(node.op)] + self.stack.append(Operation(op, self.nodes[-1], self.nodes[-2])) + + def _visit_BoolOp_Compare(self, operators, operands, has_multiple_ops): + inner_ops = [] + for op, (left, right) in zip(operators, operands): + # Note that this will lead to duplicate nodes, e.g. if + # the comparison is `a < b < c` that will be encoded as + # `a < b and b < c`. + self.visit(left) + self.visit(right) + + self.nodes.append(self.stack.pop()) + self.nodes.append(self.stack.pop()) + + op = python_cudf_operator_map[type(op)] + inner_ops.append(Operation(op, self.nodes[-1], self.nodes[-2])) + + self.nodes.extend(inner_ops) + + # If we have more than one comparator, we need to link them + # together with LOGICAL_AND operators. + if has_multiple_ops: + op = ASTOperator.LOGICAL_AND + + def _combine_compare_ops(left, right): + self.nodes.append(Operation(op, left, right)) + return self.nodes[-1] + + functools.reduce(_combine_compare_ops, inner_ops) + + self.stack.append(self.nodes[-1]) + + def visit_BoolOp(self, node): + operators = [node.op] * (len(node.values) - 1) + operands = zip(node.values[:-1], node.values[1:]) + self._visit_BoolOp_Compare(operators, operands, len(node.values) > 2) + + def visit_Compare(self, node): + operands = (node.left, *node.comparators) + has_multiple_ops = len(operands) > 2 + operands = zip(operands[:-1], operands[1:]) + self._visit_BoolOp_Compare(node.ops, operands, has_multiple_ops) + + def visit_Call(self, node): + try: + op = python_cudf_function_map[node.func.id] + except KeyError: + raise ValueError(f"Unsupported function {node.func}.") + # Assuming only unary functions are supported, which is checked above. + if len(node.args) != 1 or node.keywords: + raise ValueError( + f"Function {node.func} only accepts one positional " + "argument." + ) + self.visit(node.args[0]) + + self.nodes.append(self.stack.pop()) + self.stack.append(Operation(op, self.nodes[-1])) + + # TODO: It would be nice to use a dataclass for this, but Cython won't support # it until we upgrade to 3.0. class _OwningExpression: @@ -322,7 +418,6 @@ class _OwningExpression: self.nodes = nodes -@functools.lru_cache(256) def parse_expression_cached(str expr, tuple col_names): """A caching wrapper for parse_expression. @@ -336,6 +431,16 @@ def parse_expression_cached(str expr, tuple col_names): return _OwningExpression(stack[-1], nodes) +def parse_expression_new(str expr, tuple col_names): + visitor = libcudfASTVisitor(col_names) + visitor.visit(ast.parse(expr)) + # Note that we don't really need this wrapper if we go with the visitor + # approach since the visitor itself owns the stack/nodes, but we'll need + # some sort of wrapper to enable caching so I've just mimicked the other + # pattern for now to simplify testing/benchmarking. + return _OwningExpression(visitor.stack[-1], visitor.nodes) + + def evaluate_expression(df: "cudf.DataFrame", expr: str): """Create a cudf evaluable expression from a string and evaluate it.""" expr_container = parse_expression_cached(expr, df._column_names) @@ -352,3 +457,21 @@ def evaluate_expression(df: "cudf.DataFrame", expr: str): ) ) return {None: Column.from_unique_ptr(move(col))} + + +def evaluate_expression_new(df: "cudf.DataFrame", expr: str): + """Create a cudf evaluable expression from a string and evaluate it.""" + expr_container = parse_expression_new(expr, df._column_names) + + # At the end, all the stack contains is the expression to evaluate. + cdef Expression cudf_expr = expr_container.expression + cdef table_view tbl = table_view_from_table(df) + cdef unique_ptr[column] col + with nogil: + col = move( + libcudf_transform.compute_column( + tbl, + dereference(cudf_expr.c_obj.get()) + ) + ) + return {None: Column.from_unique_ptr(move(col))} From 5198aac81a74e5663c68e1b5eacd418933c55d73 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 13 Apr 2022 16:09:55 -0700 Subject: [PATCH 53/69] Remove recursive function in favor of visitor. --- python/cudf/cudf/_lib/ast.pyx | 209 ++++++---------------------------- 1 file changed, 33 insertions(+), 176 deletions(-) diff --git a/python/cudf/cudf/_lib/ast.pyx b/python/cudf/cudf/_lib/ast.pyx index 3cbd171d814..2933b42b131 100644 --- a/python/cudf/cudf/_lib/ast.pyx +++ b/python/cudf/cudf/_lib/ast.pyx @@ -3,7 +3,7 @@ import ast import functools from enum import Enum -from typing import List +from typing import Tuple from cython.operator cimport dereference from libc.stdint cimport int64_t @@ -186,141 +186,32 @@ python_cudf_function_map = { } -cdef parse_expression(root, tuple col_names, list stack, list nodes): - """Construct an evaluable libcudf expression by traversing Python AST. +class libcudfASTVisitor(ast.NodeVisitor): + """A NodeVisitor specialized for constructing a libcudf expression tree. - This function performs a recursive traversal of the provided root - node, constructing column references from names and literal values - from constants, then building up expressions. The final result is - the expression contained in the ``stack`` list once the function has - terminated: this list will always have length one at the end of parsing - a valid expression. + This visitor is designed to handle AST nodes that have libcudf equivalents. + It constructs column references from names and literals from constants, + then builds up operations. The final result can be accessed using the + `expression` property. The visitor must be kept in scope for as long as the + expression is needed because all of the underlying libcudf expressions will + be destroyed when the libcudfASTVisitor is. Parameters ---------- - root : ast.AST - An ast node generated by :py:func:`ast.parse`. - col_names : tuple - The column names in the data frame, which are used to generate indices - for column references to named columns in the expression. - stack : list - The current set of nodes to process. This list is empty on the initial - call to this function. New elements are added whenever new nodes are - created. When parsing the current root requires creating an Operation - node, a suitable number of elements (corresponding to the arity of the - operator) are popped from the stack as the operands for the operation. - When the recursive traversal is complete, the stack will contain - exactly one element, the expression to evaluate. - nodes : list - The set of all nodes created while parsing the expression. This - argument is necessary because all C++ node types are non-owning - objects, so if the Python Expressions corresponding to nodes in the - expression go out of scope and are garbage-collected the final - expression will contain references to invalid data and seg fault upon - evaluation. This list must remain in scope until the expression has - been evaluated. + col_names : Tuple[str] + The column names used to map the names in an expression. """ - # TODO: We'll eventually need to find a way to support operations on mixed - # but compatible dtypes (e.g. adding int to float). - # Base cases: Name - if isinstance(root, ast.Name): - try: - col_id = col_names.index(root.id) + 1 - except ValueError: - raise ValueError(f"Unknown column name {root.id}") - stack.append(ColumnReference(col_id)) - # Note: in Python > 3.7 ast.Num is a subclass of ast.Constant. We may need - # to generalize this code eventually if that inheritance is removed. - if isinstance(root, ast.Constant): - if isinstance(root, ast.Num): - stack.append(Literal(root.n)) - else: - raise ValueError( - f"Unsupported literal {repr(root.value)} of type " - "{type(root.value)}" - ) - elif isinstance(root, ast.UnaryOp): - parse_expression(root.operand, col_names, stack, nodes) - nodes.append(stack.pop()) - op = python_cudf_operator_map[type(root.op)] - stack.append(Operation(op, nodes[-1])) - elif isinstance(root, ast.BinOp): - parse_expression(root.left, col_names, stack, nodes) - parse_expression(root.right, col_names, stack, nodes) - nodes.append(stack.pop()) - nodes.append(stack.pop()) - - op = python_cudf_operator_map[type(root.op)] - stack.append(Operation(op, nodes[-1], nodes[-2])) - - # TODO: Whether And/Or and BitAnd/BitOr actually correspond to - # logical or bitwise operators depends on the data types that they - # are applied to. We'll need to add logic to map to that. - elif isinstance(root, (ast.BoolOp, ast.Compare)): - if isinstance(root, ast.BoolOp): - operators = [root.op] * (len(root.values) - 1) - operands = zip(root.values[:-1], root.values[1:]) - multiple_ops = len(root.values) > 2 - else: - operators = root.ops - operands = (root.left, *root.comparators) - multiple_ops = len(operands) > 2 - operands = zip(operands[:-1], operands[1:]) - - inner_ops = [] - for op, (left, right) in zip(operators, operands): - # Note that this will lead to duplicate nodes, e.g. if - # the comparison is `a < b < c` that will be encoded as - # `a < b and b < c`. - parse_expression(left, col_names, stack, nodes) - parse_expression(right, col_names, stack, nodes) - - nodes.append(stack.pop()) - nodes.append(stack.pop()) - - op = python_cudf_operator_map[type(op)] - inner_ops.append(Operation(op, nodes[-1], nodes[-2])) - - nodes.extend(inner_ops) - - # If we have more than one comparator, we need to link them - # together with LOGICAL_AND operators. - if multiple_ops: - op = ASTOperator.LOGICAL_AND - - def _combine_compare_ops(left, right): - nodes.append(Operation(op, left, right)) - return nodes[-1] - - functools.reduce(_combine_compare_ops, inner_ops) - - stack.append(nodes[-1]) - elif isinstance(root, ast.Call): - try: - op = python_cudf_function_map[root.func.id] - except KeyError: - raise ValueError(f"Unsupported function {root.func}.") - # Assuming only unary functions are supported, which is checked above. - if len(root.args) != 1 or root.keywords: - raise ValueError( - f"Function {root.func} only accepts one positional " - "argument." - ) - parse_expression(root.args[0], col_names, stack, nodes) - - nodes.append(stack.pop()) - stack.append(Operation(op, nodes[-1])) - elif isinstance(root, list): - for item in root: - parse_expression(item, col_names, stack, nodes) - - -class libcudfASTVisitor(ast.NodeVisitor): - def __init__(self, col_names): + def __init__(self, col_names: Tuple[str]): self.stack = [] self.nodes = [] self.col_names = col_names + @property + def expression(self): + """Expression: The result of parsing an AST.""" + assert len(self.stack) == 1 + return self.stack[-1] + def visit_Name(self, node): try: col_id = self.col_names.index(node.id) + 1 @@ -352,11 +243,20 @@ class libcudfASTVisitor(ast.NodeVisitor): self.stack.append(Operation(op, self.nodes[-1], self.nodes[-2])) def _visit_BoolOp_Compare(self, operators, operands, has_multiple_ops): + # Helper function handling the common components of parsing BoolOp and + # Compare AST nodes. These two types of nodes both support chaining + # (e.g. `a > b > c` is equivalent to `a > b and b > c`, so this + # function helps standardize that. + + # TODO: Whether And/Or and BitAnd/BitOr actually correspond to + # logical or bitwise operators depends on the data types that they + # are applied to. We'll need to add logic to map to that. inner_ops = [] for op, (left, right) in zip(operators, operands): # Note that this will lead to duplicate nodes, e.g. if # the comparison is `a < b < c` that will be encoded as - # `a < b and b < c`. + # `a < b and b < c`. We could potentially optimize by caching + # expressions by name so that we only construct them once. self.visit(left) self.visit(right) @@ -409,62 +309,19 @@ class libcudfASTVisitor(ast.NodeVisitor): self.stack.append(Operation(op, self.nodes[-1])) -# TODO: It would be nice to use a dataclass for this, but Cython won't support -# it until we upgrade to 3.0. -class _OwningExpression: - """A container for an Expression that owns the Expression's subnodes.""" - def __init__(self, expression: Expression, nodes: List[Expression]): - self.expression = expression - self.nodes = nodes - - -def parse_expression_cached(str expr, tuple col_names): - """A caching wrapper for parse_expression. - - The signature is chosen so as to appropriately determine the cache key. - """ - stack = [] - nodes = [] - parse_expression( - ast.parse(expr).body[0].value, col_names, stack, nodes - ) - return _OwningExpression(stack[-1], nodes) - - -def parse_expression_new(str expr, tuple col_names): +@functools.lru_cache(256) +def parse_expression(str expr, tuple col_names): visitor = libcudfASTVisitor(col_names) visitor.visit(ast.parse(expr)) - # Note that we don't really need this wrapper if we go with the visitor - # approach since the visitor itself owns the stack/nodes, but we'll need - # some sort of wrapper to enable caching so I've just mimicked the other - # pattern for now to simplify testing/benchmarking. - return _OwningExpression(visitor.stack[-1], visitor.nodes) + return visitor def evaluate_expression(df: "cudf.DataFrame", expr: str): """Create a cudf evaluable expression from a string and evaluate it.""" - expr_container = parse_expression_cached(expr, df._column_names) - - # At the end, all the stack contains is the expression to evaluate. - cdef Expression cudf_expr = expr_container.expression - cdef table_view tbl = table_view_from_table(df) - cdef unique_ptr[column] col - with nogil: - col = move( - libcudf_transform.compute_column( - tbl, - dereference(cudf_expr.c_obj.get()) - ) - ) - return {None: Column.from_unique_ptr(move(col))} - - -def evaluate_expression_new(df: "cudf.DataFrame", expr: str): - """Create a cudf evaluable expression from a string and evaluate it.""" - expr_container = parse_expression_new(expr, df._column_names) + visitor = parse_expression(expr, df._column_names) # At the end, all the stack contains is the expression to evaluate. - cdef Expression cudf_expr = expr_container.expression + cdef Expression cudf_expr = visitor.expression cdef table_view tbl = table_view_from_table(df) cdef unique_ptr[column] col with nogil: From 3f3885f21e7e986bf212b7bf992ff9e13ce99fe4 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 13 Apr 2022 17:30:12 -0700 Subject: [PATCH 54/69] Employ a union to differentiate between different scalar types for literals. --- python/cudf/cudf/_lib/ast.pxd | 13 ++++++++++- python/cudf/cudf/_lib/ast.pyx | 28 +++++++++++++++++++----- python/cudf/cudf/tests/test_dataframe.py | 2 ++ 3 files changed, 36 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/_lib/ast.pxd b/python/cudf/cudf/_lib/ast.pxd index e71245d957d..1ce1ee05cc6 100644 --- a/python/cudf/cudf/_lib/ast.pxd +++ b/python/cudf/cudf/_lib/ast.pxd @@ -11,12 +11,23 @@ from cudf._lib.cpp.scalar.scalar cimport numeric_scalar ctypedef int32_t underlying_type_ast_operator +ctypedef enum scalar_type_t: + INT + DOUBLE + + +ctypedef union int_or_double_scalar_ptr: + unique_ptr[numeric_scalar[int64_t]] int_ptr + unique_ptr[numeric_scalar[double]] double_ptr + + cdef class Expression: cdef unique_ptr[expression] c_obj cdef class Literal(Expression): - cdef unique_ptr[numeric_scalar[int64_t]] c_scalar + cdef scalar_type_t c_scalar_type + cdef int_or_double_scalar_ptr c_scalar cdef class ColumnReference(Expression): diff --git a/python/cudf/cudf/_lib/ast.pyx b/python/cudf/cudf/_lib/ast.pyx index 2933b42b131..e9dd5542fb0 100644 --- a/python/cudf/cudf/_lib/ast.pyx +++ b/python/cudf/cudf/_lib/ast.pyx @@ -82,12 +82,28 @@ class TableReference(Enum): # restrictive at the moment. cdef class Literal(Expression): def __cinit__(self, value): - # TODO: Enable floating point scalars. - cdef int val = value - self.c_scalar = make_unique[numeric_scalar[int64_t]](val, True) - self.c_obj = make_unique[libcudf_ast.literal]( - dereference(self.c_scalar) - ) + # TODO: Would love to find a better solution than unions for literals. + cdef int intval + cdef float floatval + + if isinstance(value, int): + self.c_scalar_type = scalar_type_t.INT + intval = value + self.c_scalar.int_ptr = make_unique[numeric_scalar[int64_t]]( + intval, True + ) + self.c_obj = make_unique[libcudf_ast.literal]( + dereference(self.c_scalar.int_ptr) + ) + elif isinstance(value, float): + self.c_scalar_type = scalar_type_t.DOUBLE + floatval = value + self.c_scalar.double_ptr = make_unique[numeric_scalar[double]]( + floatval, True + ) + self.c_obj = make_unique[libcudf_ast.literal]( + dereference(self.c_scalar.double_ptr) + ) cdef class ColumnReference(Expression): diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 37987bd83b4..2964bc7d8a6 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -9305,6 +9305,8 @@ def df_eval(): ("(a > b) and (c > d)", int), ("(a > b) or (c > d)", int), ("not (a > b)", int), + ("a + 1", int), + ("a + 1.0", float), ], ) def test_dataframe_eval(df_eval, expr, dtype): From ff08921384cea168e5af971d3f047acc1ad3979f Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 13 Apr 2022 18:04:02 -0700 Subject: [PATCH 55/69] Partially enable USub and enable UAdd. --- python/cudf/cudf/_lib/ast.pyx | 15 +++++++++++++-- python/cudf/cudf/tests/test_dataframe.py | 4 ++++ 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/_lib/ast.pyx b/python/cudf/cudf/_lib/ast.pyx index e9dd5542fb0..5000fb10491 100644 --- a/python/cudf/cudf/_lib/ast.pyx +++ b/python/cudf/cudf/_lib/ast.pyx @@ -246,8 +246,19 @@ class libcudfASTVisitor(ast.NodeVisitor): def visit_UnaryOp(self, node): self.visit(node.operand) self.nodes.append(self.stack.pop()) - op = python_cudf_operator_map[type(node.op)] - self.stack.append(Operation(op, self.nodes[-1])) + if isinstance(node.op, ast.USub): + # TODO: Except for leaf nodes, we won't know the type of the + # operand, so there's no way to know whether this should be a float + # or an int. We should maybe see what Spark does, and this will + # probably require casting. + self.nodes.append(Literal(-1)) + op = ASTOperator.MUL + self.stack.append(Operation(op, self.nodes[-1], self.nodes[-2])) + elif isinstance(node.op, ast.UAdd): + self.stack.append(self.nodes[-1]) + else: + op = python_cudf_operator_map[type(node.op)] + self.stack.append(Operation(op, self.nodes[-1])) def visit_BinOp(self, node): self.visit(node.left) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 2964bc7d8a6..502950b5425 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -9290,6 +9290,8 @@ def df_eval(): @pytest.mark.parametrize( "expr, dtype", [ + ("a", int), + ("+a", int), ("a + b", int), ("a / b", float), ("a * b", int), @@ -9307,6 +9309,8 @@ def df_eval(): ("not (a > b)", int), ("a + 1", int), ("a + 1.0", float), + ("-a + 1", int), + ("+a + 1", int), ], ) def test_dataframe_eval(df_eval, expr, dtype): From 279c916e0b57a8fb346a28e4a9c59f6f93d0bb05 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 14 Apr 2022 14:26:49 -0700 Subject: [PATCH 56/69] Enable assignment and inplace operations. --- python/cudf/cudf/_lib/ast.pyx | 13 +++- python/cudf/cudf/core/dataframe.py | 79 ++++++++++++++++++++++-- python/cudf/cudf/tests/test_dataframe.py | 15 +++++ 3 files changed, 99 insertions(+), 8 deletions(-) diff --git a/python/cudf/cudf/_lib/ast.pyx b/python/cudf/cudf/_lib/ast.pyx index 5000fb10491..e836c4d0b36 100644 --- a/python/cudf/cudf/_lib/ast.pyx +++ b/python/cudf/cudf/_lib/ast.pyx @@ -344,7 +344,16 @@ def parse_expression(str expr, tuple col_names): def evaluate_expression(df: "cudf.DataFrame", expr: str): - """Create a cudf evaluable expression from a string and evaluate it.""" + """Create a cudf evaluable expression from a string and evaluate it. + + Parameters + ---------- + df : cudf.DataFrame + The DataFrame that we are applying functions to. + expr : str + The expression to evaluate. Must be a single expression (not a + statement, i.e. no assignment). + """ visitor = parse_expression(expr, df._column_names) # At the end, all the stack contains is the expression to evaluate. @@ -358,4 +367,4 @@ def evaluate_expression(df: "cudf.DataFrame", expr: str): dereference(cudf_expr.c_obj.get()) ) ) - return {None: Column.from_unique_ptr(move(col))} + return Column.from_unique_ptr(move(col)) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 5f8002824e0..63bdab5d0cd 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6267,18 +6267,15 @@ def eval(self, expr: "str", inplace: "bool" = False, **kwargs): Notes ----- Difference from pandas: - * The inplace argument or additional kwargs are not supported. + * Additional kwargs are not supported. * Bitwise and logical operators are not dtype-dependent. Specifically, `&` must be used for bitwise operators on integers, not `and`, which is specifically for the logical and between booleans. - * String columns are not yet supported. - * Only numerical literals are supported. + * Only numerical types are currently supported. * Operators generally will not cast automatically. Users are responsible for casting columns to suitable types before evaluating a function. - * We do not support multiple statements in the same expression. - * We do not support assignment Examples -------- @@ -6297,8 +6294,78 @@ def eval(self, expr: "str", inplace: "bool" = False, **kwargs): 3 8 4 7 dtype: int64 + + Assignment is allowed though by default the original DataFrame is not + modified. + >>> df.eval('C = A + B') + A B C + 0 1 10 11 + 1 2 8 10 + 2 3 6 9 + 3 4 4 8 + 4 5 2 7 + >>> df + A B + 0 1 10 + 1 2 8 + 2 3 6 + 3 4 4 + 4 5 2 + + Use ``inplace=True`` to modify the original DataFrame. + + >>> df.eval('C = A + B', inplace=True) + >>> df + A B C + 0 1 10 11 + 1 2 8 10 + 2 3 6 9 + 3 4 4 8 + 4 5 2 7 + + Multiple columns can be assigned to using multi-line expressions: + + >>> df.eval( + ... ''' + ... C = A + B + ... D = A - B + ... ''' + ... ) + A B C D + 0 1 10 11 -9 + 1 2 8 10 -6 + 2 3 6 9 -3 + 3 4 4 8 0 + 4 5 2 7 3 """ - return Series._from_data(libcudf.ast.evaluate_expression(self, expr)) + includes_assignment = "=" in expr + # Check if there were multiple statements. Filter out empty lines. + exprs = tuple(filter(None, expr.strip().split("\n"))) + if len(exprs) > 1 and not all("=" in e for e in exprs): + raise ValueError( + "Multi-line expressions are only valid if all expressions " + "contain an assignment." + ) + + if includes_assignment: + targets, exprs = zip( + *((s.strip() for s in e.split("=")) for e in exprs) + ) + + outputs = (libcudf.ast.evaluate_expression(self, e) for e in exprs) + ret = self if inplace else self.copy(deep=False) + for name, col in zip(targets, outputs): + ret._data[name] = col + if not inplace: + return ret + else: + if inplace: + raise ValueError( + "Cannot operate inplace if there is no assignment" + ) + return Series._from_data( + {None: libcudf.ast.evaluate_expression(self, exprs[0])} + ) def from_dataframe(df, allow_copy=False): diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 502950b5425..3cf1a5db9dc 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -9311,6 +9311,14 @@ def df_eval(): ("a + 1.0", float), ("-a + 1", int), ("+a + 1", int), + ("e = a + 1", int), + ( + """ + e = log(cos(a)) + 1.0 + f = abs(c) - exp(d) + """, + float, + ), ], ) def test_dataframe_eval(df_eval, expr, dtype): @@ -9321,3 +9329,10 @@ def test_dataframe_eval(df_eval, expr, dtype): # of a single column with no nesting, pandas will retain the name. This # level of compatibility is out of scope for now. assert_eq(expect, got, check_names=False) + + # Test inplace + if "=" in expr: + pdf_eval = df_eval.to_pandas() + pdf_eval.eval(expr, inplace=True) + df_eval.eval(expr, inplace=True) + assert_eq(pdf_eval, df_eval) From 237c54fe0b3123d8a007c334bb271a2068ffbe0a Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 14 Apr 2022 14:36:39 -0700 Subject: [PATCH 57/69] Move ast.evaluate_expression to transform.compute_column to match C++ layout. --- python/cudf/cudf/_lib/ast.pyx | 27 ------------------------- python/cudf/cudf/_lib/transform.pyx | 31 +++++++++++++++++++++++++++++ python/cudf/cudf/core/dataframe.py | 6 +++--- 3 files changed, 34 insertions(+), 30 deletions(-) diff --git a/python/cudf/cudf/_lib/ast.pyx b/python/cudf/cudf/_lib/ast.pyx index e836c4d0b36..418b00fa7a2 100644 --- a/python/cudf/cudf/_lib/ast.pyx +++ b/python/cudf/cudf/_lib/ast.pyx @@ -341,30 +341,3 @@ def parse_expression(str expr, tuple col_names): visitor = libcudfASTVisitor(col_names) visitor.visit(ast.parse(expr)) return visitor - - -def evaluate_expression(df: "cudf.DataFrame", expr: str): - """Create a cudf evaluable expression from a string and evaluate it. - - Parameters - ---------- - df : cudf.DataFrame - The DataFrame that we are applying functions to. - expr : str - The expression to evaluate. Must be a single expression (not a - statement, i.e. no assignment). - """ - visitor = parse_expression(expr, df._column_names) - - # At the end, all the stack contains is the expression to evaluate. - cdef Expression cudf_expr = visitor.expression - cdef table_view tbl = table_view_from_table(df) - cdef unique_ptr[column] col - with nogil: - col = move( - libcudf_transform.compute_column( - tbl, - dereference(cudf_expr.c_obj.get()) - ) - ) - return Column.from_unique_ptr(move(col)) diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx index 175150b6865..9a811b1e7af 100644 --- a/python/cudf/cudf/_lib/transform.pyx +++ b/python/cudf/cudf/_lib/transform.pyx @@ -4,10 +4,12 @@ import numpy as np from numba.np import numpy_support import cudf +from cudf._lib.ast import parse_expression from cudf._lib.types import SUPPORTED_NUMPY_TO_LIBCUDF_TYPES from cudf.core.buffer import Buffer from cudf.utils import cudautils +from cython.operator cimport dereference from libc.stdint cimport uintptr_t from libcpp.memory cimport unique_ptr from libcpp.pair cimport pair @@ -17,7 +19,9 @@ from libcpp.utility cimport move from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer cimport cudf._lib.cpp.transform as libcudf_transform +from cudf._lib.ast cimport Expression from cudf._lib.column cimport Column +from cudf._lib.cpp.ast cimport expression from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.table.table cimport table @@ -156,3 +160,30 @@ def one_hot_encode(Column input_column, Column categories): ) return encodings + + +def compute_column(df: "cudf.DataFrame", expr: str): + """Compute a new column by evaluating an expression on a set of columns. + + Parameters + ---------- + df : cudf.DataFrame + The DataFrame that we are applying functions to. + expr : str + The expression to evaluate. Must be a single expression (not a + statement, i.e. no assignment). + """ + visitor = parse_expression(expr, df._column_names) + + # At the end, all the stack contains is the expression to evaluate. + cdef Expression cudf_expr = visitor.expression + cdef table_view tbl = table_view_from_table(df) + cdef unique_ptr[column] col + with nogil: + col = move( + libcudf_transform.compute_column( + tbl, + dereference(cudf_expr.c_obj.get()) + ) + ) + return Column.from_unique_ptr(move(col)) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 63bdab5d0cd..d9cc9f61553 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6352,9 +6352,9 @@ def eval(self, expr: "str", inplace: "bool" = False, **kwargs): *((s.strip() for s in e.split("=")) for e in exprs) ) - outputs = (libcudf.ast.evaluate_expression(self, e) for e in exprs) + cols = (libcudf.transform.compute_column(self, e) for e in exprs) ret = self if inplace else self.copy(deep=False) - for name, col in zip(targets, outputs): + for name, col in zip(targets, cols): ret._data[name] = col if not inplace: return ret @@ -6364,7 +6364,7 @@ def eval(self, expr: "str", inplace: "bool" = False, **kwargs): "Cannot operate inplace if there is no assignment" ) return Series._from_data( - {None: libcudf.ast.evaluate_expression(self, exprs[0])} + {None: libcudf.transform.compute_column(self, exprs[0])} ) From 9f64040fa3d64d55921b2f04101e390dfb6a1cdb Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 14 Apr 2022 14:58:57 -0700 Subject: [PATCH 58/69] Switch to using a list of columns for the Cython API. --- python/cudf/cudf/_lib/ast.pyx | 2 +- python/cudf/cudf/_lib/transform.pyx | 8 +++++--- python/cudf/cudf/core/dataframe.py | 13 +++++++++++-- 3 files changed, 17 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/_lib/ast.pyx b/python/cudf/cudf/_lib/ast.pyx index 418b00fa7a2..721a1787f52 100644 --- a/python/cudf/cudf/_lib/ast.pyx +++ b/python/cudf/cudf/_lib/ast.pyx @@ -230,7 +230,7 @@ class libcudfASTVisitor(ast.NodeVisitor): def visit_Name(self, node): try: - col_id = self.col_names.index(node.id) + 1 + col_id = self.col_names.index(node.id) except ValueError: raise ValueError(f"Unknown column name {node.id}") self.stack.append(ColumnReference(col_id)) diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx index 9a811b1e7af..20880fb4b4b 100644 --- a/python/cudf/cudf/_lib/transform.pyx +++ b/python/cudf/cudf/_lib/transform.pyx @@ -31,7 +31,9 @@ from cudf._lib.types cimport underlying_type_t_type_id from cudf._lib.utils cimport ( columns_from_unique_ptr, data_from_table_view, + data_from_unique_ptr, table_view_from_columns, + table_view_from_table, ) @@ -162,7 +164,7 @@ def one_hot_encode(Column input_column, Column categories): return encodings -def compute_column(df: "cudf.DataFrame", expr: str): +def compute_column(list columns, tuple column_names, expr: str): """Compute a new column by evaluating an expression on a set of columns. Parameters @@ -173,11 +175,11 @@ def compute_column(df: "cudf.DataFrame", expr: str): The expression to evaluate. Must be a single expression (not a statement, i.e. no assignment). """ - visitor = parse_expression(expr, df._column_names) + visitor = parse_expression(expr, column_names) # At the end, all the stack contains is the expression to evaluate. cdef Expression cudf_expr = visitor.expression - cdef table_view tbl = table_view_from_table(df) + cdef table_view tbl = table_view_from_columns(columns) cdef unique_ptr[column] col with nogil: col = move( diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index d9cc9f61553..10afea03319 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6352,7 +6352,12 @@ def eval(self, expr: "str", inplace: "bool" = False, **kwargs): *((s.strip() for s in e.split("=")) for e in exprs) ) - cols = (libcudf.transform.compute_column(self, e) for e in exprs) + cols = ( + libcudf.transform.compute_column( + [*self._columns], self._column_names, e + ) + for e in exprs + ) ret = self if inplace else self.copy(deep=False) for name, col in zip(targets, cols): ret._data[name] = col @@ -6364,7 +6369,11 @@ def eval(self, expr: "str", inplace: "bool" = False, **kwargs): "Cannot operate inplace if there is no assignment" ) return Series._from_data( - {None: libcudf.transform.compute_column(self, exprs[0])} + { + None: libcudf.transform.compute_column( + [*self._columns], self._column_names, exprs[0] + ) + } ) From 7a228d24d4000cbba4ff1e408c5451d4096ccf55 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 14 Apr 2022 15:57:30 -0700 Subject: [PATCH 59/69] Some minor streamlining of the logic. --- python/cudf/cudf/core/dataframe.py | 35 +++++++++++++++--------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 10afea03319..90275d4c6f2 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6339,6 +6339,7 @@ def eval(self, expr: "str", inplace: "bool" = False, **kwargs): 4 5 2 7 3 """ includes_assignment = "=" in expr + # Check if there were multiple statements. Filter out empty lines. exprs = tuple(filter(None, expr.strip().split("\n"))) if len(exprs) > 1 and not all("=" in e for e in exprs): @@ -6347,23 +6348,7 @@ def eval(self, expr: "str", inplace: "bool" = False, **kwargs): "contain an assignment." ) - if includes_assignment: - targets, exprs = zip( - *((s.strip() for s in e.split("=")) for e in exprs) - ) - - cols = ( - libcudf.transform.compute_column( - [*self._columns], self._column_names, e - ) - for e in exprs - ) - ret = self if inplace else self.copy(deep=False) - for name, col in zip(targets, cols): - ret._data[name] = col - if not inplace: - return ret - else: + if not includes_assignment: if inplace: raise ValueError( "Cannot operate inplace if there is no assignment" @@ -6376,6 +6361,22 @@ def eval(self, expr: "str", inplace: "bool" = False, **kwargs): } ) + targets, exprs = zip( + *((s.strip() for s in e.split("=")) for e in exprs) + ) + + cols = ( + libcudf.transform.compute_column( + [*self._columns], self._column_names, e + ) + for e in exprs + ) + ret = self if inplace else self.copy(deep=False) + for name, col in zip(targets, cols): + ret._data[name] = col + if not inplace: + return ret + def from_dataframe(df, allow_copy=False): return df_protocol.from_dataframe(df, allow_copy=allow_copy) From aedd3f19fa7602de9f53765db4b5adcd86f79e99 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 14 Apr 2022 16:00:05 -0700 Subject: [PATCH 60/69] Fix comment. --- python/cudf/cudf/_lib/ast.pxd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/ast.pxd b/python/cudf/cudf/_lib/ast.pxd index 1ce1ee05cc6..5ade82f24f1 100644 --- a/python/cudf/cudf/_lib/ast.pxd +++ b/python/cudf/cudf/_lib/ast.pxd @@ -7,7 +7,7 @@ from cudf._lib.cpp.ast cimport column_reference, expression, literal, operation from cudf._lib.cpp.scalar.scalar cimport numeric_scalar # Since Cython <3 doesn't support scoped enumerations but attempts to work with -# the underlying value of an enum, typedefing this to cast seems unavoidable. +# the underlying value of an enum, we will need this ctypedef for casting. ctypedef int32_t underlying_type_ast_operator From 8ba26bc8bbafc4a7a37823f37438aa36914284dd Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 14 Apr 2022 16:07:34 -0700 Subject: [PATCH 61/69] Simplify casting and improve explanation. --- python/cudf/cudf/_lib/ast.pxd | 5 ----- python/cudf/cudf/_lib/ast.pyx | 11 ++++++++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/python/cudf/cudf/_lib/ast.pxd b/python/cudf/cudf/_lib/ast.pxd index 5ade82f24f1..e17d5dec19d 100644 --- a/python/cudf/cudf/_lib/ast.pxd +++ b/python/cudf/cudf/_lib/ast.pxd @@ -6,11 +6,6 @@ from libcpp.memory cimport unique_ptr from cudf._lib.cpp.ast cimport column_reference, expression, literal, operation from cudf._lib.cpp.scalar.scalar cimport numeric_scalar -# Since Cython <3 doesn't support scoped enumerations but attempts to work with -# the underlying value of an enum, we will need this ctypedef for casting. -ctypedef int32_t underlying_type_ast_operator - - ctypedef enum scalar_type_t: INT DOUBLE diff --git a/python/cudf/cudf/_lib/ast.pyx b/python/cudf/cudf/_lib/ast.pyx index 721a1787f52..4f91101e3be 100644 --- a/python/cudf/cudf/_lib/ast.pyx +++ b/python/cudf/cudf/_lib/ast.pyx @@ -10,7 +10,6 @@ from libc.stdint cimport int64_t from libcpp.memory cimport make_unique, unique_ptr from libcpp.utility cimport move -from cudf._lib.ast cimport underlying_type_ast_operator from cudf._lib.column cimport Column from cudf._lib.cpp cimport ast as libcudf_ast, transform as libcudf_transform from cudf._lib.cpp.column.column cimport column @@ -18,6 +17,10 @@ from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport size_type from cudf._lib.utils cimport table_view_from_table +# Necessary for proper casting, see below. +ctypedef int32_t underlying_type_ast_operator + + # Aliases for simplicity ctypedef unique_ptr[libcudf_ast.expression] expression_ptr @@ -116,8 +119,10 @@ cdef class ColumnReference(Expression): cdef class Operation(Expression): def __cinit__(self, op, Expression left, Expression right=None): # This awkward double casting is the only way to get Cython to generate - # valid C++ that doesn't try to apply the shift operator directly to - # values of the enum (which is invalid). + # valid C++. Cython doesn't support scoped enumerations, so it assumes + # that enums correspond to their underlying value types and will thus + # attempt operations that are invalid without first explicitly casting + # to the underlying before casting to the desired type. cdef libcudf_ast.ast_operator op_value = ( op.value ) From 6c1543e3ecadf0eb6d6b10784a5b9bbf05e383ba Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 19 Apr 2022 14:25:48 -0700 Subject: [PATCH 62/69] Fix missing parenthesis. --- python/cudf/cudf/core/dataframe.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 90275d4c6f2..0159ce33503 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6237,6 +6237,7 @@ def interleave_columns(self): return self._constructor_sliced._from_data( {None: libcudf.reshape.interleave_columns([*self._columns])} + ) @_cudf_nvtx_annotate def eval(self, expr: "str", inplace: "bool" = False, **kwargs): From 31241bee526949da891c71c3e072c3f4ef6b0752 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 21 Apr 2022 11:31:36 -0700 Subject: [PATCH 63/69] Switch from a simple 'in' check to using a regex to avoid treating == as assignment. --- python/cudf/cudf/core/dataframe.py | 4 +++- python/cudf/cudf/tests/test_dataframe.py | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 0159ce33503..9b3ae888cf0 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -7,6 +7,7 @@ import itertools import numbers import pickle +import re import sys import warnings from collections import abc, defaultdict @@ -6339,7 +6340,8 @@ def eval(self, expr: "str", inplace: "bool" = False, **kwargs): 3 4 4 8 0 4 5 2 7 3 """ - includes_assignment = "=" in expr + # Have to use a regex match to avoid capturing "==" + includes_assignment = re.search("[^=]=[^=]", expr) is not None # Check if there were multiple statements. Filter out empty lines. exprs = tuple(filter(None, expr.strip().split("\n"))) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 3cf1a5db9dc..0358ef918fa 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -9293,6 +9293,7 @@ def df_eval(): ("a", int), ("+a", int), ("a + b", int), + ("a == b", int), ("a / b", float), ("a * b", int), ("a > b", int), @@ -9331,7 +9332,7 @@ def test_dataframe_eval(df_eval, expr, dtype): assert_eq(expect, got, check_names=False) # Test inplace - if "=" in expr: + if re.search("[^=]=[^=]", expr) is not None: pdf_eval = df_eval.to_pandas() pdf_eval.eval(expr, inplace=True) df_eval.eval(expr, inplace=True) From 2623d0a96e2747af18207d94ea6169fb66a1a490 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 26 Apr 2022 07:13:29 -0700 Subject: [PATCH 64/69] Move all 'ast' files to 'expressions' to reflect C++ namespaces accurately. --- .../_lib/cpp/{ast.pxd => expressions.pxd} | 0 python/cudf/cudf/_lib/cpp/transform.pxd | 2 +- .../cudf/_lib/{ast.pxd => expressions.pxd} | 7 +- .../cudf/_lib/{ast.pyx => expressions.pyx} | 117 +++++++++--------- python/cudf/cudf/_lib/transform.pyx | 6 +- 5 files changed, 70 insertions(+), 62 deletions(-) rename python/cudf/cudf/_lib/cpp/{ast.pxd => expressions.pxd} (100%) rename python/cudf/cudf/_lib/{ast.pxd => expressions.pxd} (85%) rename python/cudf/cudf/_lib/{ast.pyx => expressions.pyx} (79%) diff --git a/python/cudf/cudf/_lib/cpp/ast.pxd b/python/cudf/cudf/_lib/cpp/expressions.pxd similarity index 100% rename from python/cudf/cudf/_lib/cpp/ast.pxd rename to python/cudf/cudf/_lib/cpp/expressions.pxd diff --git a/python/cudf/cudf/_lib/cpp/transform.pxd b/python/cudf/cudf/_lib/cpp/transform.pxd index a37ae346dbf..975595d7020 100644 --- a/python/cudf/cudf/_lib/cpp/transform.pxd +++ b/python/cudf/cudf/_lib/cpp/transform.pxd @@ -7,9 +7,9 @@ from libcpp.string cimport string from rmm._lib.device_buffer cimport device_buffer -from cudf._lib.cpp.ast cimport expression from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view +from cudf._lib.cpp.expressions cimport expression from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport bitmask_type, data_type, size_type diff --git a/python/cudf/cudf/_lib/ast.pxd b/python/cudf/cudf/_lib/expressions.pxd similarity index 85% rename from python/cudf/cudf/_lib/ast.pxd rename to python/cudf/cudf/_lib/expressions.pxd index e17d5dec19d..85665822174 100644 --- a/python/cudf/cudf/_lib/ast.pxd +++ b/python/cudf/cudf/_lib/expressions.pxd @@ -3,7 +3,12 @@ from libc.stdint cimport int32_t, int64_t from libcpp.memory cimport unique_ptr -from cudf._lib.cpp.ast cimport column_reference, expression, literal, operation +from cudf._lib.cpp.expressions cimport ( + column_reference, + expression, + literal, + operation, +) from cudf._lib.cpp.scalar.scalar cimport numeric_scalar ctypedef enum scalar_type_t: diff --git a/python/cudf/cudf/_lib/ast.pyx b/python/cudf/cudf/_lib/expressions.pyx similarity index 79% rename from python/cudf/cudf/_lib/ast.pyx rename to python/cudf/cudf/_lib/expressions.pyx index 4f91101e3be..06c932612c8 100644 --- a/python/cudf/cudf/_lib/ast.pyx +++ b/python/cudf/cudf/_lib/expressions.pyx @@ -11,7 +11,10 @@ from libcpp.memory cimport make_unique, unique_ptr from libcpp.utility cimport move from cudf._lib.column cimport Column -from cudf._lib.cpp cimport ast as libcudf_ast, transform as libcudf_transform +from cudf._lib.cpp cimport ( + expressions as libcudf_exp, + transform as libcudf_transform, +) from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport size_type @@ -22,62 +25,62 @@ ctypedef int32_t underlying_type_ast_operator # Aliases for simplicity -ctypedef unique_ptr[libcudf_ast.expression] expression_ptr +ctypedef unique_ptr[libcudf_exp.expression] expression_ptr class ASTOperator(Enum): - ADD = libcudf_ast.ast_operator.ADD - SUB = libcudf_ast.ast_operator.SUB - MUL = libcudf_ast.ast_operator.MUL - DIV = libcudf_ast.ast_operator.DIV - TRUE_DIV = libcudf_ast.ast_operator.TRUE_DIV - FLOOR_DIV = libcudf_ast.ast_operator.FLOOR_DIV - MOD = libcudf_ast.ast_operator.MOD - PYMOD = libcudf_ast.ast_operator.PYMOD - POW = libcudf_ast.ast_operator.POW - EQUAL = libcudf_ast.ast_operator.EQUAL - NULL_EQUAL = libcudf_ast.ast_operator.NULL_EQUAL - NOT_EQUAL = libcudf_ast.ast_operator.NOT_EQUAL - LESS = libcudf_ast.ast_operator.LESS - GREATER = libcudf_ast.ast_operator.GREATER - LESS_EQUAL = libcudf_ast.ast_operator.LESS_EQUAL - GREATER_EQUAL = libcudf_ast.ast_operator.GREATER_EQUAL - BITWISE_AND = libcudf_ast.ast_operator.BITWISE_AND - BITWISE_OR = libcudf_ast.ast_operator.BITWISE_OR - BITWISE_XOR = libcudf_ast.ast_operator.BITWISE_XOR - LOGICAL_AND = libcudf_ast.ast_operator.LOGICAL_AND - NULL_LOGICAL_AND = libcudf_ast.ast_operator.NULL_LOGICAL_AND - LOGICAL_OR = libcudf_ast.ast_operator.LOGICAL_OR - NULL_LOGICAL_OR = libcudf_ast.ast_operator.NULL_LOGICAL_OR + ADD = libcudf_exp.ast_operator.ADD + SUB = libcudf_exp.ast_operator.SUB + MUL = libcudf_exp.ast_operator.MUL + DIV = libcudf_exp.ast_operator.DIV + TRUE_DIV = libcudf_exp.ast_operator.TRUE_DIV + FLOOR_DIV = libcudf_exp.ast_operator.FLOOR_DIV + MOD = libcudf_exp.ast_operator.MOD + PYMOD = libcudf_exp.ast_operator.PYMOD + POW = libcudf_exp.ast_operator.POW + EQUAL = libcudf_exp.ast_operator.EQUAL + NULL_EQUAL = libcudf_exp.ast_operator.NULL_EQUAL + NOT_EQUAL = libcudf_exp.ast_operator.NOT_EQUAL + LESS = libcudf_exp.ast_operator.LESS + GREATER = libcudf_exp.ast_operator.GREATER + LESS_EQUAL = libcudf_exp.ast_operator.LESS_EQUAL + GREATER_EQUAL = libcudf_exp.ast_operator.GREATER_EQUAL + BITWISE_AND = libcudf_exp.ast_operator.BITWISE_AND + BITWISE_OR = libcudf_exp.ast_operator.BITWISE_OR + BITWISE_XOR = libcudf_exp.ast_operator.BITWISE_XOR + LOGICAL_AND = libcudf_exp.ast_operator.LOGICAL_AND + NULL_LOGICAL_AND = libcudf_exp.ast_operator.NULL_LOGICAL_AND + LOGICAL_OR = libcudf_exp.ast_operator.LOGICAL_OR + NULL_LOGICAL_OR = libcudf_exp.ast_operator.NULL_LOGICAL_OR # Unary operators - IDENTITY = libcudf_ast.ast_operator.IDENTITY - SIN = libcudf_ast.ast_operator.SIN - COS = libcudf_ast.ast_operator.COS - TAN = libcudf_ast.ast_operator.TAN - ARCSIN = libcudf_ast.ast_operator.ARCSIN - ARCCOS = libcudf_ast.ast_operator.ARCCOS - ARCTAN = libcudf_ast.ast_operator.ARCTAN - SINH = libcudf_ast.ast_operator.SINH - COSH = libcudf_ast.ast_operator.COSH - TANH = libcudf_ast.ast_operator.TANH - ARCSINH = libcudf_ast.ast_operator.ARCSINH - ARCCOSH = libcudf_ast.ast_operator.ARCCOSH - ARCTANH = libcudf_ast.ast_operator.ARCTANH - EXP = libcudf_ast.ast_operator.EXP - LOG = libcudf_ast.ast_operator.LOG - SQRT = libcudf_ast.ast_operator.SQRT - CBRT = libcudf_ast.ast_operator.CBRT - CEIL = libcudf_ast.ast_operator.CEIL - FLOOR = libcudf_ast.ast_operator.FLOOR - ABS = libcudf_ast.ast_operator.ABS - RINT = libcudf_ast.ast_operator.RINT - BIT_INVERT = libcudf_ast.ast_operator.BIT_INVERT - NOT = libcudf_ast.ast_operator.NOT + IDENTITY = libcudf_exp.ast_operator.IDENTITY + SIN = libcudf_exp.ast_operator.SIN + COS = libcudf_exp.ast_operator.COS + TAN = libcudf_exp.ast_operator.TAN + ARCSIN = libcudf_exp.ast_operator.ARCSIN + ARCCOS = libcudf_exp.ast_operator.ARCCOS + ARCTAN = libcudf_exp.ast_operator.ARCTAN + SINH = libcudf_exp.ast_operator.SINH + COSH = libcudf_exp.ast_operator.COSH + TANH = libcudf_exp.ast_operator.TANH + ARCSINH = libcudf_exp.ast_operator.ARCSINH + ARCCOSH = libcudf_exp.ast_operator.ARCCOSH + ARCTANH = libcudf_exp.ast_operator.ARCTANH + EXP = libcudf_exp.ast_operator.EXP + LOG = libcudf_exp.ast_operator.LOG + SQRT = libcudf_exp.ast_operator.SQRT + CBRT = libcudf_exp.ast_operator.CBRT + CEIL = libcudf_exp.ast_operator.CEIL + FLOOR = libcudf_exp.ast_operator.FLOOR + ABS = libcudf_exp.ast_operator.ABS + RINT = libcudf_exp.ast_operator.RINT + BIT_INVERT = libcudf_exp.ast_operator.BIT_INVERT + NOT = libcudf_exp.ast_operator.NOT class TableReference(Enum): - LEFT = libcudf_ast.table_reference.LEFT - RIGHT = libcudf_ast.table_reference.RIGHT + LEFT = libcudf_exp.table_reference.LEFT + RIGHT = libcudf_exp.table_reference.RIGHT # Note that this function only currently supports numeric literals. libcudf @@ -95,7 +98,7 @@ cdef class Literal(Expression): self.c_scalar.int_ptr = make_unique[numeric_scalar[int64_t]]( intval, True ) - self.c_obj = make_unique[libcudf_ast.literal]( + self.c_obj = make_unique[libcudf_exp.literal]( dereference(self.c_scalar.int_ptr) ) elif isinstance(value, float): @@ -104,14 +107,14 @@ cdef class Literal(Expression): self.c_scalar.double_ptr = make_unique[numeric_scalar[double]]( floatval, True ) - self.c_obj = make_unique[libcudf_ast.literal]( + self.c_obj = make_unique[libcudf_exp.literal]( dereference(self.c_scalar.double_ptr) ) cdef class ColumnReference(Expression): def __cinit__(self, size_type index): - self.c_obj = make_unique[libcudf_ast.column_reference]( + self.c_obj = make_unique[libcudf_exp.column_reference]( index ) @@ -123,16 +126,16 @@ cdef class Operation(Expression): # that enums correspond to their underlying value types and will thus # attempt operations that are invalid without first explicitly casting # to the underlying before casting to the desired type. - cdef libcudf_ast.ast_operator op_value = ( + cdef libcudf_exp.ast_operator op_value = ( op.value ) if right is None: - self.c_obj = make_unique[libcudf_ast.operation]( + self.c_obj = make_unique[libcudf_exp.operation]( op_value, dereference(left.c_obj) ) else: - self.c_obj = make_unique[libcudf_ast.operation]( + self.c_obj = make_unique[libcudf_exp.operation]( op_value, dereference(left.c_obj), dereference(right.c_obj) ) @@ -179,7 +182,7 @@ python_cudf_operator_map = { # corresponding libcudf C++ AST operators. python_cudf_function_map = { # TODO: Operators listed on - # https://pandas.pydata.org/pandas-docs/stable/user_guide/enhancingperf.html#expression-evaluation-via-eval + # https://pandas.pydata.org/pandas-docs/stable/user_guide/enhancingperf.html#expression-evaluation-via-eval # noqa: E501 # that we don't support yet: # expm1, log1p, arctan2 and log10. 'sin': ASTOperator.SIN, diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx index 20880fb4b4b..ef500a8e0ef 100644 --- a/python/cudf/cudf/_lib/transform.pyx +++ b/python/cudf/cudf/_lib/transform.pyx @@ -4,7 +4,7 @@ import numpy as np from numba.np import numpy_support import cudf -from cudf._lib.ast import parse_expression +from cudf._lib.expressions import parse_expression from cudf._lib.types import SUPPORTED_NUMPY_TO_LIBCUDF_TYPES from cudf.core.buffer import Buffer from cudf.utils import cudautils @@ -19,14 +19,14 @@ from libcpp.utility cimport move from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer cimport cudf._lib.cpp.transform as libcudf_transform -from cudf._lib.ast cimport Expression from cudf._lib.column cimport Column -from cudf._lib.cpp.ast cimport expression from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view +from cudf._lib.cpp.expressions cimport expression from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport bitmask_type, data_type, size_type, type_id +from cudf._lib.expressions cimport Expression from cudf._lib.types cimport underlying_type_t_type_id from cudf._lib.utils cimport ( columns_from_unique_ptr, From 9b0d4e4b808f1bdf1f3ef9aba1cee83c4e81f430 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 26 Apr 2022 07:35:26 -0700 Subject: [PATCH 65/69] Address some PR comments. --- python/cudf/cudf/_lib/expressions.pyx | 6 ++--- python/cudf/cudf/_lib/transform.pyx | 12 ++++++---- python/cudf/cudf/core/dataframe.py | 28 ++++++++++++++---------- python/cudf/cudf/tests/test_dataframe.py | 9 ++++++++ 4 files changed, 37 insertions(+), 18 deletions(-) diff --git a/python/cudf/cudf/_lib/expressions.pyx b/python/cudf/cudf/_lib/expressions.pyx index 06c932612c8..aedd5877571 100644 --- a/python/cudf/cudf/_lib/expressions.pyx +++ b/python/cudf/cudf/_lib/expressions.pyx @@ -90,7 +90,7 @@ cdef class Literal(Expression): def __cinit__(self, value): # TODO: Would love to find a better solution than unions for literals. cdef int intval - cdef float floatval + cdef double doubleval if isinstance(value, int): self.c_scalar_type = scalar_type_t.INT @@ -103,9 +103,9 @@ cdef class Literal(Expression): ) elif isinstance(value, float): self.c_scalar_type = scalar_type_t.DOUBLE - floatval = value + doubleval = value self.c_scalar.double_ptr = make_unique[numeric_scalar[double]]( - floatval, True + doubleval, True ) self.c_obj = make_unique[libcudf_exp.literal]( dereference(self.c_scalar.double_ptr) diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx index ef500a8e0ef..feaf0425997 100644 --- a/python/cudf/cudf/_lib/transform.pyx +++ b/python/cudf/cudf/_lib/transform.pyx @@ -169,11 +169,15 @@ def compute_column(list columns, tuple column_names, expr: str): Parameters ---------- - df : cudf.DataFrame - The DataFrame that we are applying functions to. + columns : list + The set of columns forming the table to evaluate the expression on. + column_names : str + The names associated with each column. These names are necessary to map + column names in the expression to indices in the provided list of + columns, which are what will be used by libcudf to evaluate the + expression on the table. expr : str - The expression to evaluate. Must be a single expression (not a - statement, i.e. no assignment). + The expression to evaluate. """ visitor = parse_expression(expr, column_names) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 9b3ae888cf0..3afb7a65a4a 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6241,12 +6241,10 @@ def interleave_columns(self): ) @_cudf_nvtx_annotate - def eval(self, expr: "str", inplace: "bool" = False, **kwargs): + def eval(self, expr: str, inplace: bool = False, **kwargs): """Evaluate a string describing operations on DataFrame columns. - Operates on columns only, not specific rows or elements. This allows - `eval` to run arbitrary code, which can make you vulnerable to code - injection if you pass user input to this function. + Operates on columns only, not specific rows or elements. Parameters ---------- @@ -6257,14 +6255,14 @@ def eval(self, expr: "str", inplace: "bool" = False, **kwargs): operation inplace and mutate the existing DataFrame. Otherwise, a new DataFrame is returned. **kwargs - See the documentation for :func:`eval` for complete details - on the keyword arguments accepted by - :meth:`~pandas.DataFrame.query`. + Not supported. Returns ------- - ndarray, scalar, pandas object, or None - The result of the evaluation or None if ``inplace=True``. + DataFrame, Series, or None + Series if a single column is returned (the typical use case), + DataFrame if multiple assignment statements are included in + ``expr``, or None if ``inplace=True``. Notes ----- @@ -6283,7 +6281,7 @@ def eval(self, expr: "str", inplace: "bool" = False, **kwargs): -------- >>> df = cudf.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)}) >>> df - A B + A B 0 1 10 1 2 8 2 3 6 @@ -6299,6 +6297,7 @@ def eval(self, expr: "str", inplace: "bool" = False, **kwargs): Assignment is allowed though by default the original DataFrame is not modified. + >>> df.eval('C = A + B') A B C 0 1 10 11 @@ -6340,12 +6339,19 @@ def eval(self, expr: "str", inplace: "bool" = False, **kwargs): 3 4 4 8 0 4 5 2 7 3 """ + if kwargs: + raise ValueError( + "Keyword arguments other than `inplace` are not supported" + ) + # Have to use a regex match to avoid capturing "==" includes_assignment = re.search("[^=]=[^=]", expr) is not None # Check if there were multiple statements. Filter out empty lines. exprs = tuple(filter(None, expr.strip().split("\n"))) - if len(exprs) > 1 and not all("=" in e for e in exprs): + if len(exprs) > 1 and not all( + re.search("[^=]=[^=]", e) is not None for e in exprs + ): raise ValueError( "Multi-line expressions are only valid if all expressions " "contain an assignment." diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 0358ef918fa..1d757efd077 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -9337,3 +9337,12 @@ def test_dataframe_eval(df_eval, expr, dtype): pdf_eval.eval(expr, inplace=True) df_eval.eval(expr, inplace=True) assert_eq(pdf_eval, df_eval) + + +def test_dataframe_eval_errors(df_eval): + expr = """ + e = a + b + a == b + """ + with pytest.raises(ValueError): + df_eval.eval(expr) From 4d6898eab1b27c250da6953c55aa7edbde71e079 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 26 Apr 2022 08:31:17 -0700 Subject: [PATCH 66/69] Address remaining PR comments aside from splitting out the visitor into a pure Python module. --- python/cudf/cudf/_lib/expressions.pyx | 4 ++-- python/cudf/cudf/core/dataframe.py | 24 +++++++++++++++++------- python/cudf/cudf/tests/test_dataframe.py | 19 ++++++++++++++----- 3 files changed, 33 insertions(+), 14 deletions(-) diff --git a/python/cudf/cudf/_lib/expressions.pyx b/python/cudf/cudf/_lib/expressions.pyx index aedd5877571..127bc2e4ae8 100644 --- a/python/cudf/cudf/_lib/expressions.pyx +++ b/python/cudf/cudf/_lib/expressions.pyx @@ -201,12 +201,12 @@ python_cudf_function_map = { 'log': ASTOperator.LOG, 'sqrt': ASTOperator.SQRT, 'abs': ASTOperator.ABS, + 'ceil': ASTOperator.CEIL, + 'floor': ASTOperator.FLOOR, # TODO: Operators supported by libcudf with no Python function analog. # ast.rint: ASTOperator.RINT, # ast.cbrt: ASTOperator.CBRT, - # ast.ceil: ASTOperator.CEIL, - # ast.floor: ASTOperator.FLOOR, } diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 3afb7a65a4a..ff6a7adec9e 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6348,9 +6348,9 @@ def eval(self, expr: str, inplace: bool = False, **kwargs): includes_assignment = re.search("[^=]=[^=]", expr) is not None # Check if there were multiple statements. Filter out empty lines. - exprs = tuple(filter(None, expr.strip().split("\n"))) - if len(exprs) > 1 and not all( - re.search("[^=]=[^=]", e) is not None for e in exprs + statements = tuple(filter(None, expr.strip().split("\n"))) + if len(statements) > 1 and not all( + re.search("[^=]=[^=]", e) is not None for e in statements ): raise ValueError( "Multi-line expressions are only valid if all expressions " @@ -6365,14 +6365,24 @@ def eval(self, expr: str, inplace: bool = False, **kwargs): return Series._from_data( { None: libcudf.transform.compute_column( - [*self._columns], self._column_names, exprs[0] + [*self._columns], self._column_names, statements[0] ) } ) - targets, exprs = zip( - *((s.strip() for s in e.split("=")) for e in exprs) - ) + targets = [] + exprs = [] + for st in statements: + try: + t, e = re.split("[^=]=[^=]", st) + except ValueError as err: + if "too many values" in str(err): + raise ValueError( + f"Statement {st} contains too many assignments ('=')" + ) + raise + targets.append(t.strip()) + exprs.append(e.strip()) cols = ( libcudf.transform.compute_column( diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 1d757efd077..38e3f28b1e2 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -9303,6 +9303,8 @@ def df_eval(): ("a & b | c", int), ("sin(a)", float), ("exp(sin(abs(a)))", float), + ("floor(sin(a))", float), + ("ceil(sin(a))", float), ("(a + b) - (c * d)", int), ("~a", int), ("(a > b) and (c > d)", int), @@ -9320,6 +9322,7 @@ def df_eval(): """, float, ), + ("a_b_are_equal = (a == b)", int), ], ) def test_dataframe_eval(df_eval, expr, dtype): @@ -9339,10 +9342,16 @@ def test_dataframe_eval(df_eval, expr, dtype): assert_eq(pdf_eval, df_eval) -def test_dataframe_eval_errors(df_eval): - expr = """ - e = a + b - a == b - """ +@pytest.mark.parametrize( + "expr", + [ + """ + e = a + b + a == b + """, + "a_b_are_equal = (a == b) = c", + ], +) +def test_dataframe_eval_errors(df_eval, expr): with pytest.raises(ValueError): df_eval.eval(expr) From d8f0ebcf28ff3fe9ee47e0b33fb0cd3547af2b81 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 26 Apr 2022 08:46:12 -0700 Subject: [PATCH 67/69] Move expression parsing logic into a separate pure Python module. --- python/cudf/cudf/_lib/expressions.pyx | 223 +----------------- python/cudf/cudf/_lib/transform.pyx | 2 +- .../cudf/cudf/core/_internals/expressions.py | 222 +++++++++++++++++ 3 files changed, 224 insertions(+), 223 deletions(-) create mode 100644 python/cudf/cudf/core/_internals/expressions.py diff --git a/python/cudf/cudf/_lib/expressions.pyx b/python/cudf/cudf/_lib/expressions.pyx index 127bc2e4ae8..f069bcdbe73 100644 --- a/python/cudf/cudf/_lib/expressions.pyx +++ b/python/cudf/cudf/_lib/expressions.pyx @@ -1,24 +1,14 @@ # Copyright (c) 2022, NVIDIA CORPORATION. -import ast -import functools from enum import Enum -from typing import Tuple from cython.operator cimport dereference from libc.stdint cimport int64_t from libcpp.memory cimport make_unique, unique_ptr from libcpp.utility cimport move -from cudf._lib.column cimport Column -from cudf._lib.cpp cimport ( - expressions as libcudf_exp, - transform as libcudf_transform, -) -from cudf._lib.cpp.column.column cimport column -from cudf._lib.cpp.table.table_view cimport table_view +from cudf._lib.cpp cimport expressions as libcudf_exp from cudf._lib.cpp.types cimport size_type -from cudf._lib.utils cimport table_view_from_table # Necessary for proper casting, see below. ctypedef int32_t underlying_type_ast_operator @@ -138,214 +128,3 @@ cdef class Operation(Expression): self.c_obj = make_unique[libcudf_exp.operation]( op_value, dereference(left.c_obj), dereference(right.c_obj) ) - - -# This dictionary encodes the mapping from Python AST operators to their cudf -# counterparts. -python_cudf_operator_map = { - # Binary operators - ast.Add: ASTOperator.ADD, - ast.Sub: ASTOperator.SUB, - ast.Mult: ASTOperator.MUL, - ast.Div: ASTOperator.DIV, - ast.FloorDiv: ASTOperator.FLOOR_DIV, - ast.Mod: ASTOperator.PYMOD, - ast.Pow: ASTOperator.POW, - ast.Eq: ASTOperator.EQUAL, - ast.NotEq: ASTOperator.NOT_EQUAL, - ast.Lt: ASTOperator.LESS, - ast.Gt: ASTOperator.GREATER, - ast.LtE: ASTOperator.LESS_EQUAL, - ast.GtE: ASTOperator.GREATER_EQUAL, - ast.BitXor: ASTOperator.BITWISE_XOR, - # TODO: The mapping of logical/bitwise operators here is inconsistent with - # pandas. In pandas, Both `BitAnd` and `And` map to - # `ASTOperator.LOGICAL_AND` for booleans, while they map to - # `ASTOperator.BITWISE_AND` for integers. However, there is no good way to - # encode this at present because expressions can be arbitrarily nested so - # we won't know the dtype of the input without inserting a much more - # complex traversal of the expression tree to determine the output types at - # each node. For now, we'll rely on users to use the appropriate operator. - ast.BitAnd: ASTOperator.BITWISE_AND, - ast.BitOr: ASTOperator.BITWISE_OR, - ast.And: ASTOperator.LOGICAL_AND, - ast.Or: ASTOperator.LOGICAL_OR, - - # Unary operators - ast.Invert: ASTOperator.BIT_INVERT, - ast.Not: ASTOperator.NOT, - # TODO: Missing USub, possibility other unary ops? -} - - -# Mapping between Python function names encode in an ast.Call node and the -# corresponding libcudf C++ AST operators. -python_cudf_function_map = { - # TODO: Operators listed on - # https://pandas.pydata.org/pandas-docs/stable/user_guide/enhancingperf.html#expression-evaluation-via-eval # noqa: E501 - # that we don't support yet: - # expm1, log1p, arctan2 and log10. - 'sin': ASTOperator.SIN, - 'cos': ASTOperator.COS, - 'tan': ASTOperator.TAN, - 'arcsin': ASTOperator.ARCSIN, - 'arccos': ASTOperator.ARCCOS, - 'arctan': ASTOperator.ARCTAN, - 'sinh': ASTOperator.SINH, - 'cosh': ASTOperator.COSH, - 'tanh': ASTOperator.TANH, - 'arcsinh': ASTOperator.ARCSINH, - 'arccosh': ASTOperator.ARCCOSH, - 'arctanh': ASTOperator.ARCTANH, - 'exp': ASTOperator.EXP, - 'log': ASTOperator.LOG, - 'sqrt': ASTOperator.SQRT, - 'abs': ASTOperator.ABS, - 'ceil': ASTOperator.CEIL, - 'floor': ASTOperator.FLOOR, - - # TODO: Operators supported by libcudf with no Python function analog. - # ast.rint: ASTOperator.RINT, - # ast.cbrt: ASTOperator.CBRT, -} - - -class libcudfASTVisitor(ast.NodeVisitor): - """A NodeVisitor specialized for constructing a libcudf expression tree. - - This visitor is designed to handle AST nodes that have libcudf equivalents. - It constructs column references from names and literals from constants, - then builds up operations. The final result can be accessed using the - `expression` property. The visitor must be kept in scope for as long as the - expression is needed because all of the underlying libcudf expressions will - be destroyed when the libcudfASTVisitor is. - - Parameters - ---------- - col_names : Tuple[str] - The column names used to map the names in an expression. - """ - def __init__(self, col_names: Tuple[str]): - self.stack = [] - self.nodes = [] - self.col_names = col_names - - @property - def expression(self): - """Expression: The result of parsing an AST.""" - assert len(self.stack) == 1 - return self.stack[-1] - - def visit_Name(self, node): - try: - col_id = self.col_names.index(node.id) - except ValueError: - raise ValueError(f"Unknown column name {node.id}") - self.stack.append(ColumnReference(col_id)) - - def visit_Constant(self, node): - if not isinstance(node, ast.Num): - raise ValueError( - f"Unsupported literal {repr(node.value)} of type " - "{type(node.value)}" - ) - self.stack.append(Literal(node.value)) - - def visit_UnaryOp(self, node): - self.visit(node.operand) - self.nodes.append(self.stack.pop()) - if isinstance(node.op, ast.USub): - # TODO: Except for leaf nodes, we won't know the type of the - # operand, so there's no way to know whether this should be a float - # or an int. We should maybe see what Spark does, and this will - # probably require casting. - self.nodes.append(Literal(-1)) - op = ASTOperator.MUL - self.stack.append(Operation(op, self.nodes[-1], self.nodes[-2])) - elif isinstance(node.op, ast.UAdd): - self.stack.append(self.nodes[-1]) - else: - op = python_cudf_operator_map[type(node.op)] - self.stack.append(Operation(op, self.nodes[-1])) - - def visit_BinOp(self, node): - self.visit(node.left) - self.visit(node.right) - self.nodes.append(self.stack.pop()) - self.nodes.append(self.stack.pop()) - - op = python_cudf_operator_map[type(node.op)] - self.stack.append(Operation(op, self.nodes[-1], self.nodes[-2])) - - def _visit_BoolOp_Compare(self, operators, operands, has_multiple_ops): - # Helper function handling the common components of parsing BoolOp and - # Compare AST nodes. These two types of nodes both support chaining - # (e.g. `a > b > c` is equivalent to `a > b and b > c`, so this - # function helps standardize that. - - # TODO: Whether And/Or and BitAnd/BitOr actually correspond to - # logical or bitwise operators depends on the data types that they - # are applied to. We'll need to add logic to map to that. - inner_ops = [] - for op, (left, right) in zip(operators, operands): - # Note that this will lead to duplicate nodes, e.g. if - # the comparison is `a < b < c` that will be encoded as - # `a < b and b < c`. We could potentially optimize by caching - # expressions by name so that we only construct them once. - self.visit(left) - self.visit(right) - - self.nodes.append(self.stack.pop()) - self.nodes.append(self.stack.pop()) - - op = python_cudf_operator_map[type(op)] - inner_ops.append(Operation(op, self.nodes[-1], self.nodes[-2])) - - self.nodes.extend(inner_ops) - - # If we have more than one comparator, we need to link them - # together with LOGICAL_AND operators. - if has_multiple_ops: - op = ASTOperator.LOGICAL_AND - - def _combine_compare_ops(left, right): - self.nodes.append(Operation(op, left, right)) - return self.nodes[-1] - - functools.reduce(_combine_compare_ops, inner_ops) - - self.stack.append(self.nodes[-1]) - - def visit_BoolOp(self, node): - operators = [node.op] * (len(node.values) - 1) - operands = zip(node.values[:-1], node.values[1:]) - self._visit_BoolOp_Compare(operators, operands, len(node.values) > 2) - - def visit_Compare(self, node): - operands = (node.left, *node.comparators) - has_multiple_ops = len(operands) > 2 - operands = zip(operands[:-1], operands[1:]) - self._visit_BoolOp_Compare(node.ops, operands, has_multiple_ops) - - def visit_Call(self, node): - try: - op = python_cudf_function_map[node.func.id] - except KeyError: - raise ValueError(f"Unsupported function {node.func}.") - # Assuming only unary functions are supported, which is checked above. - if len(node.args) != 1 or node.keywords: - raise ValueError( - f"Function {node.func} only accepts one positional " - "argument." - ) - self.visit(node.args[0]) - - self.nodes.append(self.stack.pop()) - self.stack.append(Operation(op, self.nodes[-1])) - - -@functools.lru_cache(256) -def parse_expression(str expr, tuple col_names): - visitor = libcudfASTVisitor(col_names) - visitor.visit(ast.parse(expr)) - return visitor diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx index feaf0425997..668db6069eb 100644 --- a/python/cudf/cudf/_lib/transform.pyx +++ b/python/cudf/cudf/_lib/transform.pyx @@ -4,8 +4,8 @@ import numpy as np from numba.np import numpy_support import cudf -from cudf._lib.expressions import parse_expression from cudf._lib.types import SUPPORTED_NUMPY_TO_LIBCUDF_TYPES +from cudf.core._internals.expressions import parse_expression from cudf.core.buffer import Buffer from cudf.utils import cudautils diff --git a/python/cudf/cudf/core/_internals/expressions.py b/python/cudf/cudf/core/_internals/expressions.py new file mode 100644 index 00000000000..935f4291955 --- /dev/null +++ b/python/cudf/cudf/core/_internals/expressions.py @@ -0,0 +1,222 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. + +import ast +import functools +from typing import List, Tuple + +from cudf._lib.expressions import ( + ASTOperator, + ColumnReference, + Expression, + Literal, + Operation, +) + +# This dictionary encodes the mapping from Python AST operators to their cudf +# counterparts. +python_cudf_operator_map = { + # Binary operators + ast.Add: ASTOperator.ADD, + ast.Sub: ASTOperator.SUB, + ast.Mult: ASTOperator.MUL, + ast.Div: ASTOperator.DIV, + ast.FloorDiv: ASTOperator.FLOOR_DIV, + ast.Mod: ASTOperator.PYMOD, + ast.Pow: ASTOperator.POW, + ast.Eq: ASTOperator.EQUAL, + ast.NotEq: ASTOperator.NOT_EQUAL, + ast.Lt: ASTOperator.LESS, + ast.Gt: ASTOperator.GREATER, + ast.LtE: ASTOperator.LESS_EQUAL, + ast.GtE: ASTOperator.GREATER_EQUAL, + ast.BitXor: ASTOperator.BITWISE_XOR, + # TODO: The mapping of logical/bitwise operators here is inconsistent with + # pandas. In pandas, Both `BitAnd` and `And` map to + # `ASTOperator.LOGICAL_AND` for booleans, while they map to + # `ASTOperator.BITWISE_AND` for integers. However, there is no good way to + # encode this at present because expressions can be arbitrarily nested so + # we won't know the dtype of the input without inserting a much more + # complex traversal of the expression tree to determine the output types at + # each node. For now, we'll rely on users to use the appropriate operator. + ast.BitAnd: ASTOperator.BITWISE_AND, + ast.BitOr: ASTOperator.BITWISE_OR, + ast.And: ASTOperator.LOGICAL_AND, + ast.Or: ASTOperator.LOGICAL_OR, + # Unary operators + ast.Invert: ASTOperator.BIT_INVERT, + ast.Not: ASTOperator.NOT, + # TODO: Missing USub, possibility other unary ops? +} + + +# Mapping between Python function names encode in an ast.Call node and the +# corresponding libcudf C++ AST operators. +python_cudf_function_map = { + # TODO: Operators listed on + # https://pandas.pydata.org/pandas-docs/stable/user_guide/enhancingperf.html#expression-evaluation-via-eval # noqa: E501 + # that we don't support yet: + # expm1, log1p, arctan2 and log10. + "sin": ASTOperator.SIN, + "cos": ASTOperator.COS, + "tan": ASTOperator.TAN, + "arcsin": ASTOperator.ARCSIN, + "arccos": ASTOperator.ARCCOS, + "arctan": ASTOperator.ARCTAN, + "sinh": ASTOperator.SINH, + "cosh": ASTOperator.COSH, + "tanh": ASTOperator.TANH, + "arcsinh": ASTOperator.ARCSINH, + "arccosh": ASTOperator.ARCCOSH, + "arctanh": ASTOperator.ARCTANH, + "exp": ASTOperator.EXP, + "log": ASTOperator.LOG, + "sqrt": ASTOperator.SQRT, + "abs": ASTOperator.ABS, + "ceil": ASTOperator.CEIL, + "floor": ASTOperator.FLOOR, + # TODO: Operators supported by libcudf with no Python function analog. + # ast.rint: ASTOperator.RINT, + # ast.cbrt: ASTOperator.CBRT, +} + + +class libcudfASTVisitor(ast.NodeVisitor): + """A NodeVisitor specialized for constructing a libcudf expression tree. + + This visitor is designed to handle AST nodes that have libcudf equivalents. + It constructs column references from names and literals from constants, + then builds up operations. The final result can be accessed using the + `expression` property. The visitor must be kept in scope for as long as the + expression is needed because all of the underlying libcudf expressions will + be destroyed when the libcudfASTVisitor is. + + Parameters + ---------- + col_names : Tuple[str] + The column names used to map the names in an expression. + """ + + def __init__(self, col_names: Tuple[str]): + self.stack: List[Expression] = [] + self.nodes: List[Expression] = [] + self.col_names = col_names + + @property + def expression(self): + """Expression: The result of parsing an AST.""" + assert len(self.stack) == 1 + return self.stack[-1] + + def visit_Name(self, node): + try: + col_id = self.col_names.index(node.id) + except ValueError: + raise ValueError(f"Unknown column name {node.id}") + self.stack.append(ColumnReference(col_id)) + + def visit_Constant(self, node): + if not isinstance(node, ast.Num): + raise ValueError( + f"Unsupported literal {repr(node.value)} of type " + "{type(node.value)}" + ) + self.stack.append(Literal(node.value)) + + def visit_UnaryOp(self, node): + self.visit(node.operand) + self.nodes.append(self.stack.pop()) + if isinstance(node.op, ast.USub): + # TODO: Except for leaf nodes, we won't know the type of the + # operand, so there's no way to know whether this should be a float + # or an int. We should maybe see what Spark does, and this will + # probably require casting. + self.nodes.append(Literal(-1)) + op = ASTOperator.MUL + self.stack.append(Operation(op, self.nodes[-1], self.nodes[-2])) + elif isinstance(node.op, ast.UAdd): + self.stack.append(self.nodes[-1]) + else: + op = python_cudf_operator_map[type(node.op)] + self.stack.append(Operation(op, self.nodes[-1])) + + def visit_BinOp(self, node): + self.visit(node.left) + self.visit(node.right) + self.nodes.append(self.stack.pop()) + self.nodes.append(self.stack.pop()) + + op = python_cudf_operator_map[type(node.op)] + self.stack.append(Operation(op, self.nodes[-1], self.nodes[-2])) + + def _visit_BoolOp_Compare(self, operators, operands, has_multiple_ops): + # Helper function handling the common components of parsing BoolOp and + # Compare AST nodes. These two types of nodes both support chaining + # (e.g. `a > b > c` is equivalent to `a > b and b > c`, so this + # function helps standardize that. + + # TODO: Whether And/Or and BitAnd/BitOr actually correspond to + # logical or bitwise operators depends on the data types that they + # are applied to. We'll need to add logic to map to that. + inner_ops = [] + for op, (left, right) in zip(operators, operands): + # Note that this will lead to duplicate nodes, e.g. if + # the comparison is `a < b < c` that will be encoded as + # `a < b and b < c`. We could potentially optimize by caching + # expressions by name so that we only construct them once. + self.visit(left) + self.visit(right) + + self.nodes.append(self.stack.pop()) + self.nodes.append(self.stack.pop()) + + op = python_cudf_operator_map[type(op)] + inner_ops.append(Operation(op, self.nodes[-1], self.nodes[-2])) + + self.nodes.extend(inner_ops) + + # If we have more than one comparator, we need to link them + # together with LOGICAL_AND operators. + if has_multiple_ops: + op = ASTOperator.LOGICAL_AND + + def _combine_compare_ops(left, right): + self.nodes.append(Operation(op, left, right)) + return self.nodes[-1] + + functools.reduce(_combine_compare_ops, inner_ops) + + self.stack.append(self.nodes[-1]) + + def visit_BoolOp(self, node): + operators = [node.op] * (len(node.values) - 1) + operands = zip(node.values[:-1], node.values[1:]) + self._visit_BoolOp_Compare(operators, operands, len(node.values) > 2) + + def visit_Compare(self, node): + operands = (node.left, *node.comparators) + has_multiple_ops = len(operands) > 2 + operands = zip(operands[:-1], operands[1:]) + self._visit_BoolOp_Compare(node.ops, operands, has_multiple_ops) + + def visit_Call(self, node): + try: + op = python_cudf_function_map[node.func.id] + except KeyError: + raise ValueError(f"Unsupported function {node.func}.") + # Assuming only unary functions are supported, which is checked above. + if len(node.args) != 1 or node.keywords: + raise ValueError( + f"Function {node.func} only accepts one positional " + "argument." + ) + self.visit(node.args[0]) + + self.nodes.append(self.stack.pop()) + self.stack.append(Operation(op, self.nodes[-1])) + + +@functools.lru_cache(256) +def parse_expression(expr: str, col_names: Tuple[str]): + visitor = libcudfASTVisitor(col_names) + visitor.visit(ast.parse(expr)) + return visitor From 2c58b60675a2a7fc5b1ed98dba86f79f4775a48c Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 27 Apr 2022 15:18:28 -0700 Subject: [PATCH 68/69] Address final PR comments. --- python/cudf/cudf/_lib/cpp/expressions.pxd | 2 +- python/cudf/cudf/_lib/cpp/transform.pxd | 2 +- python/cudf/cudf/_lib/transform.pyx | 2 +- python/cudf/cudf/core/_internals/expressions.py | 2 +- python/cudf/cudf/core/dataframe.py | 9 ++++++--- python/cudf/cudf/tests/test_dataframe.py | 4 ++-- 6 files changed, 12 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/_lib/cpp/expressions.pxd b/python/cudf/cudf/_lib/cpp/expressions.pxd index a7700551754..1721f8aa734 100644 --- a/python/cudf/cudf/_lib/cpp/expressions.pxd +++ b/python/cudf/cudf/_lib/cpp/expressions.pxd @@ -85,4 +85,4 @@ cdef extern from "cudf/ast/expressions.hpp" namespace "cudf::ast" nogil: cdef cppclass operation(expression): operation(ast_operator, const expression &) - operation(ast_operator, const expression&, const expression&) + operation(ast_operator, const expression &, const expression&) diff --git a/python/cudf/cudf/_lib/cpp/transform.pxd b/python/cudf/cudf/_lib/cpp/transform.pxd index 975595d7020..d9de04b676e 100644 --- a/python/cudf/cudf/_lib/cpp/transform.pxd +++ b/python/cudf/cudf/_lib/cpp/transform.pxd @@ -46,5 +46,5 @@ cdef extern from "cudf/transform.hpp" namespace "cudf" nogil: cdef unique_ptr[column] compute_column( const table_view table, - const expression &expr + const expression& expr ) except + diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx index 668db6069eb..2d94ef2cedf 100644 --- a/python/cudf/cudf/_lib/transform.pyx +++ b/python/cudf/cudf/_lib/transform.pyx @@ -171,7 +171,7 @@ def compute_column(list columns, tuple column_names, expr: str): ---------- columns : list The set of columns forming the table to evaluate the expression on. - column_names : str + column_names : tuple[str] The names associated with each column. These names are necessary to map column names in the expression to indices in the provided list of columns, which are what will be used by libcudf to evaluate the diff --git a/python/cudf/cudf/core/_internals/expressions.py b/python/cudf/cudf/core/_internals/expressions.py index 935f4291955..bc587d4e1e2 100644 --- a/python/cudf/cudf/core/_internals/expressions.py +++ b/python/cudf/cudf/core/_internals/expressions.py @@ -118,7 +118,7 @@ def visit_Constant(self, node): if not isinstance(node, ast.Num): raise ValueError( f"Unsupported literal {repr(node.value)} of type " - "{type(node.value)}" + "{type(node.value).__name__}" ) self.stack.append(Literal(node.value)) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index ff6a7adec9e..c02740dc4d6 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6261,7 +6261,7 @@ def eval(self, expr: str, inplace: bool = False, **kwargs): ------- DataFrame, Series, or None Series if a single column is returned (the typical use case), - DataFrame if multiple assignment statements are included in + DataFrame if any assignment statements are included in ``expr``, or None if ``inplace=True``. Notes @@ -6276,6 +6276,9 @@ def eval(self, expr: str, inplace: bool = False, **kwargs): * Operators generally will not cast automatically. Users are responsible for casting columns to suitable types before evaluating a function. + * Multiple assignments to the same name (i.e. a sequence of + assignment statements where later statements are conditioned upon + the output of earlier statements) is not supported. Examples -------- @@ -6349,8 +6352,8 @@ def eval(self, expr: str, inplace: bool = False, **kwargs): # Check if there were multiple statements. Filter out empty lines. statements = tuple(filter(None, expr.strip().split("\n"))) - if len(statements) > 1 and not all( - re.search("[^=]=[^=]", e) is not None for e in statements + if len(statements) > 1 and any( + re.search("[^=]=[^=]", st) is None for st in statements ): raise ValueError( "Multi-line expressions are only valid if all expressions " diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 38e3f28b1e2..bbb3cfabc98 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -9303,8 +9303,8 @@ def df_eval(): ("a & b | c", int), ("sin(a)", float), ("exp(sin(abs(a)))", float), - ("floor(sin(a))", float), - ("ceil(sin(a))", float), + ("sqrt(floor(a))", float), + ("ceil(arctanh(a))", float), ("(a + b) - (c * d)", int), ("~a", int), ("(a > b) and (c > d)", int), From f009e5bd4eb51c34e536c4fa521c1d384f8997a6 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 28 Apr 2022 06:40:01 -0700 Subject: [PATCH 69/69] Fix import issue hidden by locally persistent files. --- python/cudf/cudf/_lib/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index 37f515a8f33..542262b7908 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -2,13 +2,13 @@ import numpy as np from . import ( - ast, avro, binaryop, concat, copying, csv, datetime, + expressions, filling, gpuarrow, groupby,