From 477155d0717e87c1fee7ebb4c61b2cb1413d2d89 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 12 Dec 2022 11:30:29 -0800 Subject: [PATCH 01/14] initially working up through count, work to do --- .../strings_udf/cpp/src/strings/udf/shim.cu | 11 ++++++ python/strings_udf/strings_udf/_typing.py | 16 ++++++++- python/strings_udf/strings_udf/lowering.py | 35 +++++++++++++------ .../strings_udf/tests/test_string_udfs.py | 33 +++++++++++++---- python/strings_udf/strings_udf/tests/utils.py | 26 ++++++++++++++ 5 files changed, 104 insertions(+), 17 deletions(-) create mode 100644 python/strings_udf/strings_udf/tests/utils.py diff --git a/python/strings_udf/cpp/src/strings/udf/shim.cu b/python/strings_udf/cpp/src/strings/udf/shim.cu index d10cc635209..13815f1d9c9 100644 --- a/python/strings_udf/cpp/src/strings/udf/shim.cu +++ b/python/strings_udf/cpp/src/strings/udf/shim.cu @@ -231,6 +231,17 @@ extern "C" __device__ int udf_string_from_string_view(int* nb_retbal, return 0; } +extern "C" __device__ int string_view_from_udf_string(int* nb_retval, + void const* udf_str, + void* str) +{ + auto udf_str_ptr = reinterpret_cast(udf_str); + auto sv_ptr = new (str) cudf::string_view; + *sv_ptr = cudf::string_view(*udf_str_ptr); + + return 0; +} + extern "C" __device__ int strip(int* nb_retval, void* udf_str, void* const* to_strip, diff --git a/python/strings_udf/strings_udf/_typing.py b/python/strings_udf/strings_udf/_typing.py index 99e4046b0b3..1679780fe18 100644 --- a/python/strings_udf/strings_udf/_typing.py +++ b/python/strings_udf/strings_udf/_typing.py @@ -39,7 +39,6 @@ def return_type(self): class StringView(types.Type): - np_dtype = np.dtype("object") def __init__(self): @@ -218,6 +217,21 @@ def resolve_replace(self, mod): return types.BoundFunction(StringViewReplace, string_view) +class UDFStringCount(AbstractTemplate): + key = "UDFString.count" + + def generic(self, args, kws): + return nb_signature(size_type, string_view, recvr=self.this) + + +@cuda_decl_registry.register_attr +class UDFStringAttrs(AttributeTemplate): + key = udf_string + + def resolve_count(self, mod): + return types.BoundFunction(UDFStringCount, string_view) + + # Build attributes for `MaskedType(string_view)` bool_binary_funcs = ["startswith", "endswith"] int_binary_funcs = ["find", "rfind"] diff --git a/python/strings_udf/strings_udf/lowering.py b/python/strings_udf/strings_udf/lowering.py index 7294d06c05b..de1e07764b0 100644 --- a/python/strings_udf/strings_udf/lowering.py +++ b/python/strings_udf/strings_udf/lowering.py @@ -141,19 +141,21 @@ def cast_string_literal_to_string_view(context, builder, fromty, toty, val): return sv._getvalue() -@cuda_lowering_registry.lower_cast(string_view, udf_string) +@cuda_lowering_registry.lower_cast(udf_string, string_view) def cast_string_view_to_udf_string(context, builder, fromty, toty, val): - sv_ptr = builder.alloca(default_manager[fromty].get_value_type()) - udf_str_ptr = builder.alloca(default_manager[toty].get_value_type()) - builder.store(val, sv_ptr) + udf_str_ptr = builder.alloca(default_manager[fromty].get_value_type()) + sv_ptr = builder.alloca(default_manager[toty].get_value_type()) + builder.store(val, udf_str_ptr) + _ = context.compile_internal( builder, - call_create_udf_string_from_string_view, - nb_signature(types.void, _STR_VIEW_PTR, types.CPointer(udf_string)), - (sv_ptr, udf_str_ptr), + call_create_string_view_from_udf_string, + nb_signature(types.void, _UDF_STRING_PTR, _STR_VIEW_PTR), + (udf_str_ptr, sv_ptr), ) - result = cgutils.create_struct_proxy(udf_string)( - context, builder, value=builder.load(udf_str_ptr) + + result = cgutils.create_struct_proxy(string_view)( + context, builder, value=builder.load(sv_ptr) ) return result._getvalue() @@ -164,12 +166,20 @@ def cast_string_view_to_udf_string(context, builder, fromty, toty, val): "udf_string_from_string_view", types.void(types.CPointer(string_view), types.CPointer(udf_string)), ) +# utilities +_create_string_view_from_udf_string = cuda.declare_device( + "string_view_from_udf_string", types.void(_UDF_STRING_PTR, _STR_VIEW_PTR) +) def call_create_udf_string_from_string_view(sv, udf_str): _create_udf_string_from_string_view(sv, udf_str) +def call_create_string_view_from_udf_string(udf_str, sv): + _create_string_view_from_udf_string(udf_str, sv) + + # String function implementations def call_len_string_view(st): return _string_view_len(st) @@ -361,7 +371,12 @@ def endswith_impl(sv, substr): @create_binary_string_func("StringView.count", size_type) -def count_impl(st, substr): +def string_view_count_impl(st, substr): + return _string_view_count(st, substr) + + +@create_binary_string_func("UDFString.count", size_type) +def udf_string_count_impl(st, substr): return _string_view_count(st, substr) diff --git a/python/strings_udf/strings_udf/tests/test_string_udfs.py b/python/strings_udf/strings_udf/tests/test_string_udfs.py index b8de821e101..7c3e0bd133b 100644 --- a/python/strings_udf/strings_udf/tests/test_string_udfs.py +++ b/python/strings_udf/strings_udf/tests/test_string_udfs.py @@ -7,6 +7,7 @@ from numba import cuda from numba.core.typing import signature as nb_signature from numba.types import CPointer, void +from utils import sv_to_udf_str import cudf import rmm @@ -20,7 +21,7 @@ from strings_udf._typing import str_view_arg_handler, string_view, udf_string -def get_kernel(func, dtype, size): +def get_kernels(func, dtype, size): """ Create a kernel for testing a single scalar string function Allocates an output vector with a dtype specified by the caller @@ -39,14 +40,27 @@ def get_kernel(func, dtype, size): @cuda.jit( sig, link=[strings_udf.ptxpath], extensions=[str_view_arg_handler] ) - def kernel(input_strings, output_col): + def string_view_kernel(input_strings, output_col): + # test the string function with a string_view as input id = cuda.grid(1) if id < size: st = input_strings[id] result = func(st) output_col[id] = result - return kernel + @cuda.jit( + sig, link=[strings_udf.ptxpath], extensions=[str_view_arg_handler] + ) + def udf_string_kernel(input_strings, output_col): + # test the string function with a udf_string as input + id = cuda.grid(1) + if id < size: + st = input_strings[id] + st = sv_to_udf_str(st) + result = func(st) + output_col[id] = result + + return string_view_kernel, udf_string_kernel def run_udf_test(data, func, dtype): @@ -66,15 +80,22 @@ def run_udf_test(data, func, dtype): cudf_column = cudf.core.column.as_column(data) str_views = column_to_string_view_array(cudf_column) + sv_kernel, udf_str_kernel = get_kernels(func, dtype, len(data)) - kernel = get_kernel(func, dtype, len(data)) - kernel.forall(len(data))(str_views, output) + expect = pd.Series(data).apply(func) + sv_kernel.forall(len(data))(str_views, output) + if dtype == "str": + output = column_from_udf_string_array(output) + + got = cudf.Series(output, dtype=dtype) + assert_eq(expect, got, check_dtype=False) + + udf_str_kernel.forall(len(data))(str_views, output) if dtype == "str": output = column_from_udf_string_array(output) got = cudf.Series(output, dtype=dtype) - expect = pd.Series(data).apply(func) assert_eq(expect, got, check_dtype=False) diff --git a/python/strings_udf/strings_udf/tests/utils.py b/python/strings_udf/strings_udf/tests/utils.py new file mode 100644 index 00000000000..d37b358c2b6 --- /dev/null +++ b/python/strings_udf/strings_udf/tests/utils.py @@ -0,0 +1,26 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +import numba +from numba.core.typing.templates import AbstractTemplate +from numba.cuda.cudadecl import registry as cuda_decl_registry +from numba.cuda.cudaimpl import lower as cuda_lower + +from strings_udf._typing import StringView, string_view, udf_string +from strings_udf.lowering import cast_string_view_to_udf_string + + +def sv_to_udf_str(sv): + pass + + +@cuda_decl_registry.register_global(sv_to_udf_str) +class StringViewToUDFStringDecl(AbstractTemplate): + def generic(self, args, kws): + if isinstance(args[0], StringView) and len(args) == 1: + return numba.core.typing.signature(udf_string, string_view) + + +@cuda_lower(sv_to_udf_str, string_view) +def sv_to_udf_str_testing_lowering(context, builder, sig, args): + return cast_string_view_to_udf_string( + context, builder, sig.args[0], sig.return_type, args[0] + ) From 08e190e406750787b61bcedca01cbc516fd2e7ac Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 13 Dec 2022 10:03:10 -0800 Subject: [PATCH 02/14] replace standalone function works in strings_udf --- python/strings_udf/strings_udf/_typing.py | 10 +++++++--- python/strings_udf/strings_udf/lowering.py | 1 + .../strings_udf/strings_udf/tests/test_string_udfs.py | 9 ++++----- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/python/strings_udf/strings_udf/_typing.py b/python/strings_udf/strings_udf/_typing.py index 1679780fe18..2065e0b368f 100644 --- a/python/strings_udf/strings_udf/_typing.py +++ b/python/strings_udf/strings_udf/_typing.py @@ -217,11 +217,12 @@ def resolve_replace(self, mod): return types.BoundFunction(StringViewReplace, string_view) -class UDFStringCount(AbstractTemplate): +class UDFStringCount(StringViewCount): key = "UDFString.count" - def generic(self, args, kws): - return nb_signature(size_type, string_view, recvr=self.this) + +class UDFStringReplace(StringViewReplace): + key = "UDFString.replace" @cuda_decl_registry.register_attr @@ -231,6 +232,9 @@ class UDFStringAttrs(AttributeTemplate): def resolve_count(self, mod): return types.BoundFunction(UDFStringCount, string_view) + def resolve_replace(self, mod): + return types.BoundFunction(UDFStringReplace, string_view) + # Build attributes for `MaskedType(string_view)` bool_binary_funcs = ["startswith", "endswith"] diff --git a/python/strings_udf/strings_udf/lowering.py b/python/strings_udf/strings_udf/lowering.py index de1e07764b0..1e3fc179d86 100644 --- a/python/strings_udf/strings_udf/lowering.py +++ b/python/strings_udf/strings_udf/lowering.py @@ -228,6 +228,7 @@ def call_string_view_replace(result, src, to_replace, replacement): return _string_view_replace(result, src, to_replace, replacement) +@cuda_lower("UDFString.replace", string_view, string_view, string_view) @cuda_lower("StringView.replace", string_view, string_view, string_view) def replace_impl(context, builder, sig, args): src_ptr = builder.alloca(args[0].type) diff --git a/python/strings_udf/strings_udf/tests/test_string_udfs.py b/python/strings_udf/strings_udf/tests/test_string_udfs.py index 7c3e0bd133b..0c4b8eedd54 100644 --- a/python/strings_udf/strings_udf/tests/test_string_udfs.py +++ b/python/strings_udf/strings_udf/tests/test_string_udfs.py @@ -86,16 +86,15 @@ def run_udf_test(data, func, dtype): sv_kernel.forall(len(data))(str_views, output) if dtype == "str": - output = column_from_udf_string_array(output) + result = column_from_udf_string_array(output) - got = cudf.Series(output, dtype=dtype) + got = cudf.Series(result, dtype=dtype) assert_eq(expect, got, check_dtype=False) - udf_str_kernel.forall(len(data))(str_views, output) if dtype == "str": - output = column_from_udf_string_array(output) + result = column_from_udf_string_array(output) - got = cudf.Series(output, dtype=dtype) + got = cudf.Series(result, dtype=dtype) assert_eq(expect, got, check_dtype=False) From c1bd41ca2eb8708f7c99187fcb6d8a57c4160bf6 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 15 Dec 2022 11:06:27 -0800 Subject: [PATCH 03/14] strings_udf tests pass --- python/strings_udf/strings_udf/_typing.py | 66 +++++++++------ python/strings_udf/strings_udf/lowering.py | 84 ++++++++++++------- .../strings_udf/tests/test_string_udfs.py | 4 + 3 files changed, 100 insertions(+), 54 deletions(-) diff --git a/python/strings_udf/strings_udf/_typing.py b/python/strings_udf/strings_udf/_typing.py index 2065e0b368f..345a6240e9e 100644 --- a/python/strings_udf/strings_udf/_typing.py +++ b/python/strings_udf/strings_udf/_typing.py @@ -152,7 +152,7 @@ def generic(self, args, kws): cuda_decl_registry.register_global(op)(StringViewBinaryOp) -def create_binary_attr(attrname, retty): +def create_binary_attrs(attrname, retty): """ Helper function wrapping numba's low level extension API. Provides the boilerplate needed to register a binary function of two string @@ -165,13 +165,19 @@ class StringViewBinaryAttr(AbstractTemplate): def generic(self, args, kws): return nb_signature(retty, string_view, recvr=self.this) - def attr(self, mod): + class UDFStringBinaryAttr(StringViewBinaryAttr): + key = f"UDFString.{attrname}" + + def string_view_attr(self, mod): return types.BoundFunction(StringViewBinaryAttr, string_view) - return attr + def udf_string_attr(self, mod): + return types.BoundFunction(UDFStringBinaryAttr, string_view) + + return string_view_attr, udf_string_attr -def create_identifier_attr(attrname, retty): +def create_identifier_attrs(attrname, retty): """ Helper function wrapping numba's low level extension API. Provides the boilerplate needed to register a unary function of a string @@ -184,10 +190,16 @@ class StringViewIdentifierAttr(AbstractTemplate): def generic(self, args, kws): return nb_signature(retty, recvr=self.this) - def attr(self, mod): + class UDFStringIdentifierAttr(StringViewIdentifierAttr): + key = f"UDFString.{attrname}" + + def string_view_attr(self, mod): return types.BoundFunction(StringViewIdentifierAttr, string_view) - return attr + def udf_string_attr(self, mod): + return types.BoundFunction(UDFStringIdentifierAttr, string_view) + + return string_view_attr, udf_string_attr class StringViewCount(AbstractTemplate): @@ -254,40 +266,42 @@ def resolve_replace(self, mod): string_return_attrs = ["strip", "lstrip", "rstrip"] for func in bool_binary_funcs: - setattr( - StringViewAttrs, - f"resolve_{func}", - create_binary_attr(func, types.boolean), + string_view_attr, udf_string_attr = create_binary_attrs( + func, types.boolean ) + setattr(StringViewAttrs, f"resolve_{func}", string_view_attr) + setattr(UDFStringAttrs, f"resolve_{func}", udf_string_attr) + for func in string_return_attrs: - setattr( - StringViewAttrs, - f"resolve_{func}", - create_binary_attr(func, udf_string), - ) + string_view_attr, udf_string_attr = create_binary_attrs(func, udf_string) + setattr(StringViewAttrs, f"resolve_{func}", string_view_attr) + setattr(UDFStringAttrs, f"resolve_{func}", udf_string_attr) for func in int_binary_funcs: - setattr( - StringViewAttrs, f"resolve_{func}", create_binary_attr(func, size_type) - ) + string_view_attr, udf_string_attr = create_binary_attrs(func, size_type) + setattr(StringViewAttrs, f"resolve_{func}", string_view_attr) + setattr(UDFStringAttrs, f"resolve_{func}", udf_string_attr) + for func in id_unary_funcs: - setattr( - StringViewAttrs, - f"resolve_{func}", - create_identifier_attr(func, types.boolean), + string_view_attr, udf_string_attr = create_identifier_attrs( + func, types.boolean ) + setattr(StringViewAttrs, f"resolve_{func}", string_view_attr) + setattr(UDFStringAttrs, f"resolve_{func}", udf_string_attr) for func in string_unary_funcs: - setattr( - StringViewAttrs, - f"resolve_{func}", - create_identifier_attr(func, udf_string), + string_view_attr, udf_string_attr = create_identifier_attrs( + func, udf_string ) + setattr(StringViewAttrs, f"resolve_{func}", string_view_attr) + setattr(UDFStringAttrs, f"resolve_{func}", udf_string_attr) + cuda_decl_registry.register_attr(StringViewAttrs) +cuda_decl_registry.register_attr(UDFStringAttrs) register_stringview_binaryop(operator.eq, types.boolean) register_stringview_binaryop(operator.ne, types.boolean) diff --git a/python/strings_udf/strings_udf/lowering.py b/python/strings_udf/strings_udf/lowering.py index 1e3fc179d86..909c6265069 100644 --- a/python/strings_udf/strings_udf/lowering.py +++ b/python/strings_udf/strings_udf/lowering.py @@ -142,7 +142,7 @@ def cast_string_literal_to_string_view(context, builder, fromty, toty, val): @cuda_lowering_registry.lower_cast(udf_string, string_view) -def cast_string_view_to_udf_string(context, builder, fromty, toty, val): +def cast_udf_string_to_string_view(context, builder, fromty, toty, val): udf_str_ptr = builder.alloca(default_manager[fromty].get_value_type()) sv_ptr = builder.alloca(default_manager[toty].get_value_type()) builder.store(val, udf_str_ptr) @@ -161,6 +161,24 @@ def cast_string_view_to_udf_string(context, builder, fromty, toty, val): return result._getvalue() +@cuda_lowering_registry.lower_cast(string_view, udf_string) +def cast_string_view_to_udf_string(context, builder, fromty, toty, val): + sv_ptr = builder.alloca(default_manager[fromty].get_value_type()) + udf_str_ptr = builder.alloca(default_manager[toty].get_value_type()) + builder.store(val, sv_ptr) + _ = context.compile_internal( + builder, + call_create_udf_string_from_string_view, + nb_signature(types.void, _STR_VIEW_PTR, types.CPointer(udf_string)), + (sv_ptr, udf_str_ptr), + ) + result = cgutils.create_struct_proxy(udf_string)( + context, builder, value=builder.load(udf_str_ptr) + ) + + return result._getvalue() + + # utilities _create_udf_string_from_string_view = cuda.declare_device( "udf_string_from_string_view", @@ -186,6 +204,7 @@ def call_len_string_view(st): @cuda_lower(len, string_view) +@cuda_lower(len, udf_string) def len_impl(context, builder, sig, args): sv_ptr = builder.alloca(args[0].type) builder.store(args[0], sv_ptr) @@ -264,7 +283,6 @@ def create_binary_string_func(binary_func, retty): """ def deco(cuda_func): - @cuda_lower(binary_func, string_view, string_view) def binary_func_impl(context, builder, sig, args): lhs_ptr = builder.alloca(args[0].type) rhs_ptr = builder.alloca(args[1].type) @@ -306,6 +324,19 @@ def binary_func_impl(context, builder, sig, args): ) return result._getvalue() + # binary_func can be attribute-like: str.binary_func + # or operator-like: binary_func(str, other) + if isinstance(binary_func, str): + binary_func_impl = cuda_lower( + f"StringView.{binary_func}", string_view, string_view + )(binary_func_impl) + binary_func_impl = cuda_lower( + f"UDFString.{binary_func}", string_view, string_view + )(binary_func_impl) + else: + binary_func_impl = cuda_lower( + binary_func, string_view, string_view + )(binary_func_impl) return binary_func_impl return deco @@ -346,47 +377,42 @@ def lt_impl(st, rhs): return _string_view_lt(st, rhs) -@create_binary_string_func("StringView.strip", udf_string) +@create_binary_string_func("strip", udf_string) def strip_impl(result, to_strip, strip_char): return _string_view_strip(result, to_strip, strip_char) -@create_binary_string_func("StringView.lstrip", udf_string) +@create_binary_string_func("lstrip", udf_string) def lstrip_impl(result, to_strip, strip_char): return _string_view_lstrip(result, to_strip, strip_char) -@create_binary_string_func("StringView.rstrip", udf_string) +@create_binary_string_func("rstrip", udf_string) def rstrip_impl(result, to_strip, strip_char): return _string_view_rstrip(result, to_strip, strip_char) -@create_binary_string_func("StringView.startswith", types.boolean) +@create_binary_string_func("startswith", types.boolean) def startswith_impl(sv, substr): return _string_view_startswith(sv, substr) -@create_binary_string_func("StringView.endswith", types.boolean) +@create_binary_string_func("endswith", types.boolean) def endswith_impl(sv, substr): return _string_view_endswith(sv, substr) -@create_binary_string_func("StringView.count", size_type) +@create_binary_string_func("count", size_type) def string_view_count_impl(st, substr): return _string_view_count(st, substr) -@create_binary_string_func("UDFString.count", size_type) -def udf_string_count_impl(st, substr): - return _string_view_count(st, substr) - - -@create_binary_string_func("StringView.find", size_type) +@create_binary_string_func("find", size_type) def find_impl(sv, substr): return _string_view_find(sv, substr) -@create_binary_string_func("StringView.rfind", size_type) +@create_binary_string_func("rfind", size_type) def rfind_impl(sv, substr): return _string_view_rfind(sv, substr) @@ -399,7 +425,8 @@ def create_unary_identifier_func(id_func): """ def deco(cuda_func): - @cuda_lower(id_func, string_view) + @cuda_lower(f"StringView.{id_func}", string_view) + @cuda_lower(f"UDFString.{id_func}", string_view) def id_func_impl(context, builder, sig, args): str_ptr = builder.alloca(args[0].type) builder.store(args[0], str_ptr) @@ -432,7 +459,8 @@ def create_upper_or_lower(id_func): """ def deco(cuda_func): - @cuda_lower(id_func, string_view) + @cuda_lower(f"StringView.{id_func}", string_view) + @cuda_lower(f"UDFString.{id_func}", string_view) def id_func_impl(context, builder, sig, args): str_ptr = builder.alloca(args[0].type) builder.store(args[0], str_ptr) @@ -482,56 +510,56 @@ def id_func_impl(context, builder, sig, args): return deco -@create_upper_or_lower("StringView.upper") +@create_upper_or_lower("upper") def upper_impl(result, st, flags, cases, special): return _string_view_upper(result, st, flags, cases, special) -@create_upper_or_lower("StringView.lower") +@create_upper_or_lower("lower") def lower_impl(result, st, flags, cases, special): return _string_view_lower(result, st, flags, cases, special) -@create_unary_identifier_func("StringView.isdigit") +@create_unary_identifier_func("isdigit") def isdigit_impl(st, tbl): return _string_view_isdigit(st, tbl) -@create_unary_identifier_func("StringView.isalnum") +@create_unary_identifier_func("isalnum") def isalnum_impl(st, tbl): return _string_view_isalnum(st, tbl) -@create_unary_identifier_func("StringView.isalpha") +@create_unary_identifier_func("isalpha") def isalpha_impl(st, tbl): return _string_view_isalpha(st, tbl) -@create_unary_identifier_func("StringView.isnumeric") +@create_unary_identifier_func("isnumeric") def isnumeric_impl(st, tbl): return _string_view_isnumeric(st, tbl) -@create_unary_identifier_func("StringView.isdecimal") +@create_unary_identifier_func("isdecimal") def isdecimal_impl(st, tbl): return _string_view_isdecimal(st, tbl) -@create_unary_identifier_func("StringView.isspace") +@create_unary_identifier_func("isspace") def isspace_impl(st, tbl): return _string_view_isspace(st, tbl) -@create_unary_identifier_func("StringView.isupper") +@create_unary_identifier_func("isupper") def isupper_impl(st, tbl): return _string_view_isupper(st, tbl) -@create_unary_identifier_func("StringView.islower") +@create_unary_identifier_func("islower") def islower_impl(st, tbl): return _string_view_islower(st, tbl) -@create_unary_identifier_func("StringView.istitle") +@create_unary_identifier_func("istitle") def istitle_impl(st, tbl): return _string_view_istitle(st, tbl) diff --git a/python/strings_udf/strings_udf/tests/test_string_udfs.py b/python/strings_udf/strings_udf/tests/test_string_udfs.py index 0c4b8eedd54..29e37e945ee 100644 --- a/python/strings_udf/strings_udf/tests/test_string_udfs.py +++ b/python/strings_udf/strings_udf/tests/test_string_udfs.py @@ -87,12 +87,16 @@ def run_udf_test(data, func, dtype): sv_kernel.forall(len(data))(str_views, output) if dtype == "str": result = column_from_udf_string_array(output) + else: + result = output got = cudf.Series(result, dtype=dtype) assert_eq(expect, got, check_dtype=False) udf_str_kernel.forall(len(data))(str_views, output) if dtype == "str": result = column_from_udf_string_array(output) + else: + result = output got = cudf.Series(result, dtype=dtype) assert_eq(expect, got, check_dtype=False) From 3f043d6814a9b762263ec49fe43197ed858e81cd Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Fri, 16 Dec 2022 10:01:43 -0800 Subject: [PATCH 04/14] all tests pass --- python/cudf/cudf/core/udf/strings_lowering.py | 1 + python/cudf/cudf/core/udf/strings_typing.py | 16 +- python/cudf/cudf/tests/test_udf_masked_ops.py | 177 +++++++++++------- python/strings_udf/strings_udf/_typing.py | 12 ++ python/strings_udf/strings_udf/lowering.py | 16 +- 5 files changed, 146 insertions(+), 76 deletions(-) diff --git a/python/cudf/cudf/core/udf/strings_lowering.py b/python/cudf/cudf/core/udf/strings_lowering.py index ec956cdd65d..f54f173d5a9 100644 --- a/python/cudf/cudf/core/udf/strings_lowering.py +++ b/python/cudf/cudf/core/udf/strings_lowering.py @@ -36,6 +36,7 @@ @cuda_lower(len, MaskedType(string_view)) +@cuda_lower(len, MaskedType(udf_string)) def masked_len_impl(context, builder, sig, args): ret = cgutils.create_struct_proxy(sig.return_type)(context, builder) masked_sv_ty = sig.args[0] diff --git a/python/cudf/cudf/core/udf/strings_typing.py b/python/cudf/cudf/core/udf/strings_typing.py index e373b8b018d..f8b76594e82 100644 --- a/python/cudf/cudf/core/udf/strings_typing.py +++ b/python/cudf/cudf/core/udf/strings_typing.py @@ -9,6 +9,7 @@ from strings_udf._typing import ( StringView, + UDFString, bool_binary_funcs, id_unary_funcs, int_binary_funcs, @@ -25,11 +26,13 @@ masked_typing.MASKED_INIT_MAP[types.pyobject] = string_view masked_typing.MASKED_INIT_MAP[string_view] = string_view +masked_typing.MASKED_INIT_MAP[udf_string] = udf_string def _is_valid_string_arg(ty): return ( - isinstance(ty, MaskedType) and isinstance(ty.value_type, StringView) + isinstance(ty, MaskedType) + and isinstance(ty.value_type, (StringView, UDFString)) ) or isinstance(ty, types.StringLiteral) @@ -53,7 +56,7 @@ class MaskedStringFunction(AbstractTemplate): @register_string_function(len) def len_typing(self, args, kws): if isinstance(args[0], MaskedType) and isinstance( - args[0].value_type, StringView + args[0].value_type, (StringView, UDFString) ): return nb_signature(MaskedType(size_type), args[0]) elif isinstance(args[0], types.StringLiteral) and len(args) == 1: @@ -223,4 +226,13 @@ def resolve_valid(self, mod): create_masked_unary_attr(f"MaskedType.{func}", udf_string), ) + +class MaskedUDFStringAttrs(MaskedStringViewAttrs): + key = MaskedType(udf_string) + + def resolve_value(self, mod): + return udf_string + + cuda_decl_registry.register_attr(MaskedStringViewAttrs) +cuda_decl_registry.register_attr(MaskedUDFStringAttrs) diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py index e3b7e62433e..30d42dfc545 100644 --- a/python/cudf/cudf/tests/test_udf_masked_ops.py +++ b/python/cudf/cudf/tests/test_udf_masked_ops.py @@ -15,6 +15,7 @@ comparison_ops, unary_ops, ) +from cudf.core.udf.api import Masked from cudf.core.udf.utils import precompiled from cudf.testing._utils import ( _decimal_series, @@ -80,6 +81,38 @@ def run_masked_udf_test(func, data, args=(), **kwargs): assert_eq(expect, obtain, **kwargs) +def run_masked_string_udf_test(func, data, args=(), **kwargs): + from strings_udf._typing import sv_to_udf_str + + gdf = data + pdf = data.to_pandas(nullable=True) + + def row_wrapper(row): + st = row["str_col"] + return func(st) + + expect = pdf.apply(row_wrapper, args=args, axis=1) + + func = cuda.jit(device=True)(func) + obtain = gdf.apply(row_wrapper, args=args, axis=1) + assert_eq(expect, obtain, **kwargs) + + # strings that come directly from input columns are backed by + # MaskedType(string_view) types. But new strings that are returned + # from functions or operators are backed by MaskedType(udf_string) + # types. We need to make sure all of our methods work on both kind + # of MaskedType. This function promotes the former to the latter + # prior to running the input function + def udf_string_wrapper(row): + masked_udf_str = Masked( + sv_to_udf_str(row["str_col"].value), row["str_col"].valid + ) + return func(masked_udf_str) + + obtain = gdf.apply(udf_string_wrapper, args=args, axis=1) + assert_eq(expect, obtain, **kwargs) + + def run_masked_udf_series(func, data, args=(), **kwargs): gsr = data psr = data.to_pandas(nullable=True) @@ -740,202 +773,202 @@ def f(x): @string_udf_test def test_string_udf_len(str_udf_data): - def func(row): - return len(row["str_col"]) + def func(st): + return len(st) - run_masked_udf_test(func, str_udf_data, check_dtype=False) + run_masked_string_udf_test(func, str_udf_data, check_dtype=False) @string_udf_test def test_string_udf_startswith(str_udf_data, substr): - def func(row): - return row["str_col"].startswith(substr) + def func(st): + return st.startswith(substr) - run_masked_udf_test(func, str_udf_data, check_dtype=False) + run_masked_string_udf_test(func, str_udf_data, check_dtype=False) @string_udf_test def test_string_udf_endswith(str_udf_data, substr): - def func(row): - return row["str_col"].endswith(substr) + def func(st): + return st.endswith(substr) - run_masked_udf_test(func, str_udf_data, check_dtype=False) + run_masked_string_udf_test(func, str_udf_data, check_dtype=False) @string_udf_test def test_string_udf_find(str_udf_data, substr): - def func(row): - return row["str_col"].find(substr) + def func(st): + return st.find(substr) - run_masked_udf_test(func, str_udf_data, check_dtype=False) + run_masked_string_udf_test(func, str_udf_data, check_dtype=False) @string_udf_test def test_string_udf_rfind(str_udf_data, substr): - def func(row): - return row["str_col"].rfind(substr) + def func(st): + return st.rfind(substr) - run_masked_udf_test(func, str_udf_data, check_dtype=False) + run_masked_string_udf_test(func, str_udf_data, check_dtype=False) @string_udf_test def test_string_udf_contains(str_udf_data, substr): - def func(row): - return substr in row["str_col"] + def func(st): + return substr in st - run_masked_udf_test(func, str_udf_data, check_dtype=False) + run_masked_string_udf_test(func, str_udf_data, check_dtype=False) @string_udf_test @pytest.mark.parametrize("other", ["cudf", "123", "", " "]) @pytest.mark.parametrize("cmpop", comparison_ops) def test_string_udf_cmpops(str_udf_data, other, cmpop): - def func(row): - return cmpop(row["str_col"], other) + def func(st): + return cmpop(st, other) - run_masked_udf_test(func, str_udf_data, check_dtype=False) + run_masked_string_udf_test(func, str_udf_data, check_dtype=False) @string_udf_test def test_string_udf_isalnum(str_udf_data): - def func(row): - return row["str_col"].isalnum() + def func(st): + return st.isalnum() - run_masked_udf_test(func, str_udf_data, check_dtype=False) + run_masked_string_udf_test(func, str_udf_data, check_dtype=False) @string_udf_test def test_string_udf_isalpha(str_udf_data): - def func(row): - return row["str_col"].isalpha() + def func(st): + return st.isalpha() - run_masked_udf_test(func, str_udf_data, check_dtype=False) + run_masked_string_udf_test(func, str_udf_data, check_dtype=False) @string_udf_test def test_string_udf_isdigit(str_udf_data): - def func(row): - return row["str_col"].isdigit() + def func(st): + return st.isdigit() - run_masked_udf_test(func, str_udf_data, check_dtype=False) + run_masked_string_udf_test(func, str_udf_data, check_dtype=False) @string_udf_test def test_string_udf_isdecimal(str_udf_data): - def func(row): - return row["str_col"].isdecimal() + def func(st): + return st.isdecimal() - run_masked_udf_test(func, str_udf_data, check_dtype=False) + run_masked_string_udf_test(func, str_udf_data, check_dtype=False) @string_udf_test def test_string_udf_isupper(str_udf_data): - def func(row): - return row["str_col"].isupper() + def func(st): + return st.isupper() - run_masked_udf_test(func, str_udf_data, check_dtype=False) + run_masked_string_udf_test(func, str_udf_data, check_dtype=False) @string_udf_test def test_string_udf_islower(str_udf_data): - def func(row): - return row["str_col"].islower() + def func(st): + return st.islower() - run_masked_udf_test(func, str_udf_data, check_dtype=False) + run_masked_string_udf_test(func, str_udf_data, check_dtype=False) @string_udf_test def test_string_udf_isspace(str_udf_data): - def func(row): - return row["str_col"].isspace() + def func(st): + return st.isspace() - run_masked_udf_test(func, str_udf_data, check_dtype=False) + run_masked_string_udf_test(func, str_udf_data, check_dtype=False) @string_udf_test def test_string_udf_istitle(str_udf_data): - def func(row): - return row["str_col"].istitle() + def func(st): + return st.istitle() - run_masked_udf_test(func, str_udf_data, check_dtype=False) + run_masked_string_udf_test(func, str_udf_data, check_dtype=False) @string_udf_test def test_string_udf_count(str_udf_data, substr): - def func(row): - return row["str_col"].count(substr) + def func(st): + return st.count(substr) - run_masked_udf_test(func, str_udf_data, check_dtype=False) + run_masked_string_udf_test(func, str_udf_data, check_dtype=False) @string_udf_test def test_string_udf_return_string(str_udf_data): - def func(row): - return row["str_col"] + def func(st): + return st - run_masked_udf_test(func, str_udf_data, check_dtype=False) + run_masked_string_udf_test(func, str_udf_data, check_dtype=False) @string_udf_test @pytest.mark.parametrize("strip_char", ["1", "a", "12", " ", "", ".", "@"]) def test_string_udf_strip(str_udf_data, strip_char): - def func(row): - return row["str_col"].strip(strip_char) + def func(st): + return st.strip(strip_char) - run_masked_udf_test(func, str_udf_data, check_dtype=False) + run_masked_string_udf_test(func, str_udf_data, check_dtype=False) @string_udf_test @pytest.mark.parametrize("strip_char", ["1", "a", "12", " ", "", ".", "@"]) def test_string_udf_lstrip(str_udf_data, strip_char): - def func(row): - return row["str_col"].lstrip(strip_char) + def func(st): + return st.lstrip(strip_char) - run_masked_udf_test(func, str_udf_data, check_dtype=False) + run_masked_string_udf_test(func, str_udf_data, check_dtype=False) @string_udf_test @pytest.mark.parametrize("strip_char", ["1", "a", "12", " ", "", ".", "@"]) def test_string_udf_rstrip(str_udf_data, strip_char): - def func(row): - return row["str_col"].rstrip(strip_char) + def func(st): + return st.rstrip(strip_char) - run_masked_udf_test(func, str_udf_data, check_dtype=False) + run_masked_string_udf_test(func, str_udf_data, check_dtype=False) @string_udf_test def test_string_udf_upper(str_udf_data): - def func(row): - return row["str_col"].upper() + def func(st): + return st.upper() - run_masked_udf_test(func, str_udf_data, check_dtype=False) + run_masked_string_udf_test(func, str_udf_data, check_dtype=False) @string_udf_test def test_string_udf_lower(str_udf_data): - def func(row): - return row["str_col"].lower() + def func(st): + return st.lower() - run_masked_udf_test(func, str_udf_data, check_dtype=False) + run_masked_string_udf_test(func, str_udf_data, check_dtype=False) @string_udf_test @pytest.mark.parametrize("concat_char", ["1", "a", "12", " ", "", ".", "@"]) def test_string_udf_concat(str_udf_data, concat_char): - def func(row): - return row["str_col"] + concat_char + def func(st): + return st + concat_char - run_masked_udf_test(func, str_udf_data, check_dtype=False) + run_masked_string_udf_test(func, str_udf_data, check_dtype=False) @string_udf_test @pytest.mark.parametrize("to_replace", ["a", "1", "", "@"]) @pytest.mark.parametrize("replacement", ["a", "1", "", "@"]) def test_string_udf_replace(str_udf_data, to_replace, replacement): - def func(row): - return row["str_col"].replace(to_replace, replacement) + def func(st): + return st.replace(to_replace, replacement) - run_masked_udf_test(func, str_udf_data, check_dtype=False) + run_masked_string_udf_test(func, str_udf_data, check_dtype=False) @pytest.mark.parametrize( diff --git a/python/strings_udf/strings_udf/_typing.py b/python/strings_udf/strings_udf/_typing.py index 345a6240e9e..0e3fffbca45 100644 --- a/python/strings_udf/strings_udf/_typing.py +++ b/python/strings_udf/strings_udf/_typing.py @@ -119,6 +119,18 @@ def prepare_args(self, ty, val, **kwargs): str_view_arg_handler = StrViewArgHandler() +# for use in testing only +def sv_to_udf_str(sv): + pass + + +@cuda_decl_registry.register_global(sv_to_udf_str) +class StringViewToUDFStringDecl(AbstractTemplate): + def generic(self, args, kws): + if isinstance(args[0], StringView) and len(args) == 1: + return nb_signature(udf_string, string_view) + + # String functions @cuda_decl_registry.register_global(len) class StringLength(AbstractTemplate): diff --git a/python/strings_udf/strings_udf/lowering.py b/python/strings_udf/strings_udf/lowering.py index 909c6265069..3635407285c 100644 --- a/python/strings_udf/strings_udf/lowering.py +++ b/python/strings_udf/strings_udf/lowering.py @@ -18,7 +18,12 @@ get_character_flags_table_ptr, get_special_case_mapping_table_ptr, ) -from strings_udf._typing import size_type, string_view, udf_string +from strings_udf._typing import ( + size_type, + string_view, + sv_to_udf_str, + udf_string, +) character_flags_table_ptr = get_character_flags_table_ptr() character_cases_table_ptr = get_character_cases_table_ptr() @@ -403,7 +408,7 @@ def endswith_impl(sv, substr): @create_binary_string_func("count", size_type) -def string_view_count_impl(st, substr): +def count_impl(st, substr): return _string_view_count(st, substr) @@ -563,3 +568,10 @@ def islower_impl(st, tbl): @create_unary_identifier_func("istitle") def istitle_impl(st, tbl): return _string_view_istitle(st, tbl) + + +@cuda_lower(sv_to_udf_str, string_view) +def sv_to_udf_str_testing_lowering(context, builder, sig, args): + return cast_string_view_to_udf_string( + context, builder, sig.args[0], sig.return_type, args[0] + ) From 8c8eb85ab8db18e78999dad0fae8a6a631f11e8a Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 5 Jan 2023 07:33:03 -0800 Subject: [PATCH 05/14] update copyright years --- python/cudf/cudf/core/udf/strings_lowering.py | 2 +- python/cudf/cudf/core/udf/strings_typing.py | 2 +- python/cudf/cudf/tests/test_udf_masked_ops.py | 2 +- python/strings_udf/cpp/src/strings/udf/shim.cu | 2 +- python/strings_udf/strings_udf/_typing.py | 2 +- python/strings_udf/strings_udf/lowering.py | 2 +- python/strings_udf/strings_udf/tests/test_string_udfs.py | 2 +- python/strings_udf/strings_udf/tests/utils.py | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/python/cudf/cudf/core/udf/strings_lowering.py b/python/cudf/cudf/core/udf/strings_lowering.py index f54f173d5a9..60e74678680 100644 --- a/python/cudf/cudf/core/udf/strings_lowering.py +++ b/python/cudf/cudf/core/udf/strings_lowering.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. import operator diff --git a/python/cudf/cudf/core/udf/strings_typing.py b/python/cudf/cudf/core/udf/strings_typing.py index f8b76594e82..a98359c29ce 100644 --- a/python/cudf/cudf/core/udf/strings_typing.py +++ b/python/cudf/cudf/core/udf/strings_typing.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. import operator diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py index d2e73e03083..89a93c822a5 100644 --- a/python/cudf/cudf/tests/test_udf_masked_ops.py +++ b/python/cudf/cudf/tests/test_udf_masked_ops.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# Copyright (c) 2021-2023, NVIDIA CORPORATION. import math import operator diff --git a/python/strings_udf/cpp/src/strings/udf/shim.cu b/python/strings_udf/cpp/src/strings/udf/shim.cu index 13815f1d9c9..5993b8fa3b6 100644 --- a/python/strings_udf/cpp/src/strings/udf/shim.cu +++ b/python/strings_udf/cpp/src/strings/udf/shim.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/python/strings_udf/strings_udf/_typing.py b/python/strings_udf/strings_udf/_typing.py index 0e3fffbca45..fd7c320d342 100644 --- a/python/strings_udf/strings_udf/_typing.py +++ b/python/strings_udf/strings_udf/_typing.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. import operator diff --git a/python/strings_udf/strings_udf/lowering.py b/python/strings_udf/strings_udf/lowering.py index 48d3de8a31e..2c915a4f758 100644 --- a/python/strings_udf/strings_udf/lowering.py +++ b/python/strings_udf/strings_udf/lowering.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. import operator from functools import partial diff --git a/python/strings_udf/strings_udf/tests/test_string_udfs.py b/python/strings_udf/strings_udf/tests/test_string_udfs.py index 29e37e945ee..10d2a44aaa9 100644 --- a/python/strings_udf/strings_udf/tests/test_string_udfs.py +++ b/python/strings_udf/strings_udf/tests/test_string_udfs.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. import numba import numpy as np diff --git a/python/strings_udf/strings_udf/tests/utils.py b/python/strings_udf/strings_udf/tests/utils.py index d37b358c2b6..cbf7c2b0d05 100644 --- a/python/strings_udf/strings_udf/tests/utils.py +++ b/python/strings_udf/strings_udf/tests/utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2023, NVIDIA CORPORATION. import numba from numba.core.typing.templates import AbstractTemplate from numba.cuda.cudadecl import registry as cuda_decl_registry From eb44719023092836eb524b72a6d985c95afa4ab2 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 9 Jan 2023 12:33:50 -0800 Subject: [PATCH 06/14] make things much easier --- python/strings_udf/strings_udf/_typing.py | 61 +++++------------------ 1 file changed, 12 insertions(+), 49 deletions(-) diff --git a/python/strings_udf/strings_udf/_typing.py b/python/strings_udf/strings_udf/_typing.py index fd7c320d342..132dffc3e55 100644 --- a/python/strings_udf/strings_udf/_typing.py +++ b/python/strings_udf/strings_udf/_typing.py @@ -177,16 +177,10 @@ class StringViewBinaryAttr(AbstractTemplate): def generic(self, args, kws): return nb_signature(retty, string_view, recvr=self.this) - class UDFStringBinaryAttr(StringViewBinaryAttr): - key = f"UDFString.{attrname}" - def string_view_attr(self, mod): return types.BoundFunction(StringViewBinaryAttr, string_view) - def udf_string_attr(self, mod): - return types.BoundFunction(UDFStringBinaryAttr, string_view) - - return string_view_attr, udf_string_attr + return string_view_attr def create_identifier_attrs(attrname, retty): @@ -202,16 +196,10 @@ class StringViewIdentifierAttr(AbstractTemplate): def generic(self, args, kws): return nb_signature(retty, recvr=self.this) - class UDFStringIdentifierAttr(StringViewIdentifierAttr): - key = f"UDFString.{attrname}" - def string_view_attr(self, mod): return types.BoundFunction(StringViewIdentifierAttr, string_view) - def udf_string_attr(self, mod): - return types.BoundFunction(UDFStringIdentifierAttr, string_view) - - return string_view_attr, udf_string_attr + return string_view_attr class StringViewCount(AbstractTemplate): @@ -241,25 +229,6 @@ def resolve_replace(self, mod): return types.BoundFunction(StringViewReplace, string_view) -class UDFStringCount(StringViewCount): - key = "UDFString.count" - - -class UDFStringReplace(StringViewReplace): - key = "UDFString.replace" - - -@cuda_decl_registry.register_attr -class UDFStringAttrs(AttributeTemplate): - key = udf_string - - def resolve_count(self, mod): - return types.BoundFunction(UDFStringCount, string_view) - - def resolve_replace(self, mod): - return types.BoundFunction(UDFStringReplace, string_view) - - # Build attributes for `MaskedType(string_view)` bool_binary_funcs = ["startswith", "endswith"] int_binary_funcs = ["find", "rfind"] @@ -278,38 +247,32 @@ def resolve_replace(self, mod): string_return_attrs = ["strip", "lstrip", "rstrip"] for func in bool_binary_funcs: - string_view_attr, udf_string_attr = create_binary_attrs( - func, types.boolean - ) + string_view_attr = create_binary_attrs(func, types.boolean) setattr(StringViewAttrs, f"resolve_{func}", string_view_attr) - setattr(UDFStringAttrs, f"resolve_{func}", udf_string_attr) for func in string_return_attrs: - string_view_attr, udf_string_attr = create_binary_attrs(func, udf_string) + string_view_attr = create_binary_attrs(func, udf_string) setattr(StringViewAttrs, f"resolve_{func}", string_view_attr) - setattr(UDFStringAttrs, f"resolve_{func}", udf_string_attr) for func in int_binary_funcs: - string_view_attr, udf_string_attr = create_binary_attrs(func, size_type) + string_view_attr = create_binary_attrs(func, size_type) setattr(StringViewAttrs, f"resolve_{func}", string_view_attr) - setattr(UDFStringAttrs, f"resolve_{func}", udf_string_attr) for func in id_unary_funcs: - string_view_attr, udf_string_attr = create_identifier_attrs( - func, types.boolean - ) + string_view_attr = create_identifier_attrs(func, types.boolean) setattr(StringViewAttrs, f"resolve_{func}", string_view_attr) - setattr(UDFStringAttrs, f"resolve_{func}", udf_string_attr) for func in string_unary_funcs: - string_view_attr, udf_string_attr = create_identifier_attrs( - func, udf_string - ) + string_view_attr = create_identifier_attrs(func, udf_string) setattr(StringViewAttrs, f"resolve_{func}", string_view_attr) - setattr(UDFStringAttrs, f"resolve_{func}", udf_string_attr) + + +@cuda_decl_registry.register_attr +class UDFStringAttrs(StringViewAttrs): + key = udf_string cuda_decl_registry.register_attr(StringViewAttrs) From 59b42eab9f2c803be43948c19a208718b33dc544 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 9 Jan 2023 12:36:08 -0800 Subject: [PATCH 07/14] refactor --- python/strings_udf/strings_udf/_typing.py | 35 ++++++++++++++++------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/python/strings_udf/strings_udf/_typing.py b/python/strings_udf/strings_udf/_typing.py index 132dffc3e55..8b113d890a3 100644 --- a/python/strings_udf/strings_udf/_typing.py +++ b/python/strings_udf/strings_udf/_typing.py @@ -247,27 +247,42 @@ def resolve_replace(self, mod): string_return_attrs = ["strip", "lstrip", "rstrip"] for func in bool_binary_funcs: - string_view_attr = create_binary_attrs(func, types.boolean) - setattr(StringViewAttrs, f"resolve_{func}", string_view_attr) + setattr( + StringViewAttrs, + f"resolve_{func}", + create_binary_attrs(func, types.boolean), + ) for func in string_return_attrs: - string_view_attr = create_binary_attrs(func, udf_string) - setattr(StringViewAttrs, f"resolve_{func}", string_view_attr) + setattr( + StringViewAttrs, + f"resolve_{func}", + create_binary_attrs(func, udf_string), + ) for func in int_binary_funcs: - string_view_attr = create_binary_attrs(func, size_type) - setattr(StringViewAttrs, f"resolve_{func}", string_view_attr) + setattr( + StringViewAttrs, + f"resolve_{func}", + create_binary_attrs(func, size_type), + ) for func in id_unary_funcs: - string_view_attr = create_identifier_attrs(func, types.boolean) - setattr(StringViewAttrs, f"resolve_{func}", string_view_attr) + setattr( + StringViewAttrs, + f"resolve_{func}", + create_identifier_attrs(func, types.boolean), + ) for func in string_unary_funcs: - string_view_attr = create_identifier_attrs(func, udf_string) - setattr(StringViewAttrs, f"resolve_{func}", string_view_attr) + setattr( + StringViewAttrs, + f"resolve_{func}", + create_identifier_attrs(func, udf_string), + ) @cuda_decl_registry.register_attr From 599033f0b139d730f355399334b6d3cf2a2c019d Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 9 Jan 2023 12:37:35 -0800 Subject: [PATCH 08/14] continue reverting changes --- python/strings_udf/strings_udf/_typing.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/python/strings_udf/strings_udf/_typing.py b/python/strings_udf/strings_udf/_typing.py index 8b113d890a3..f778dd9a2b4 100644 --- a/python/strings_udf/strings_udf/_typing.py +++ b/python/strings_udf/strings_udf/_typing.py @@ -164,7 +164,7 @@ def generic(self, args, kws): cuda_decl_registry.register_global(op)(StringViewBinaryOp) -def create_binary_attrs(attrname, retty): +def create_binary_attr(attrname, retty): """ Helper function wrapping numba's low level extension API. Provides the boilerplate needed to register a binary function of two string @@ -177,13 +177,13 @@ class StringViewBinaryAttr(AbstractTemplate): def generic(self, args, kws): return nb_signature(retty, string_view, recvr=self.this) - def string_view_attr(self, mod): + def attr(self, mod): return types.BoundFunction(StringViewBinaryAttr, string_view) - return string_view_attr + return attr -def create_identifier_attrs(attrname, retty): +def create_identifier_attr(attrname, retty): """ Helper function wrapping numba's low level extension API. Provides the boilerplate needed to register a unary function of a string @@ -196,10 +196,10 @@ class StringViewIdentifierAttr(AbstractTemplate): def generic(self, args, kws): return nb_signature(retty, recvr=self.this) - def string_view_attr(self, mod): + def attr(self, mod): return types.BoundFunction(StringViewIdentifierAttr, string_view) - return string_view_attr + return attr class StringViewCount(AbstractTemplate): @@ -250,7 +250,7 @@ def resolve_replace(self, mod): setattr( StringViewAttrs, f"resolve_{func}", - create_binary_attrs(func, types.boolean), + create_binary_attr(func, types.boolean), ) @@ -258,7 +258,7 @@ def resolve_replace(self, mod): setattr( StringViewAttrs, f"resolve_{func}", - create_binary_attrs(func, udf_string), + create_binary_attr(func, udf_string), ) @@ -266,7 +266,7 @@ def resolve_replace(self, mod): setattr( StringViewAttrs, f"resolve_{func}", - create_binary_attrs(func, size_type), + create_binary_attr(func, size_type), ) @@ -274,14 +274,14 @@ def resolve_replace(self, mod): setattr( StringViewAttrs, f"resolve_{func}", - create_identifier_attrs(func, types.boolean), + create_identifier_attr(func, types.boolean), ) for func in string_unary_funcs: setattr( StringViewAttrs, f"resolve_{func}", - create_identifier_attrs(func, udf_string), + create_identifier_attr(func, udf_string), ) From b163c1b6bdff0bd7f194234f12741933491a0c4a Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 30 Jan 2023 11:59:04 -0800 Subject: [PATCH 09/14] cast in len --- python/cudf/cudf/core/udf/strings_typing.py | 2 +- python/strings_udf/strings_udf/_typing.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/udf/strings_typing.py b/python/cudf/cudf/core/udf/strings_typing.py index a98359c29ce..c40ae9e66a2 100644 --- a/python/cudf/cudf/core/udf/strings_typing.py +++ b/python/cudf/cudf/core/udf/strings_typing.py @@ -58,7 +58,7 @@ def len_typing(self, args, kws): if isinstance(args[0], MaskedType) and isinstance( args[0].value_type, (StringView, UDFString) ): - return nb_signature(MaskedType(size_type), args[0]) + return nb_signature(MaskedType(size_type), string_view) elif isinstance(args[0], types.StringLiteral) and len(args) == 1: return nb_signature(size_type, args[0]) diff --git a/python/strings_udf/strings_udf/_typing.py b/python/strings_udf/strings_udf/_typing.py index fe3fc9d8ddf..9fff2e55806 100644 --- a/python/strings_udf/strings_udf/_typing.py +++ b/python/strings_udf/strings_udf/_typing.py @@ -137,7 +137,7 @@ def generic(self, args, kws): # string_view -> int32 # udf_string -> int32 # literal -> int32 - return nb_signature(size_type, args[0]) + return nb_signature(size_type, string_view) def register_stringview_binaryop(op, retty): From 21a2121d20aa1816102463b687ab6c68bfc5f9be Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 30 Jan 2023 12:14:34 -0800 Subject: [PATCH 10/14] small typing bug --- python/cudf/cudf/core/udf/strings_typing.py | 2 +- python/strings_udf/strings_udf/tests/utils.py | 26 ------------------- 2 files changed, 1 insertion(+), 27 deletions(-) delete mode 100644 python/strings_udf/strings_udf/tests/utils.py diff --git a/python/cudf/cudf/core/udf/strings_typing.py b/python/cudf/cudf/core/udf/strings_typing.py index c40ae9e66a2..1c9e582e7ad 100644 --- a/python/cudf/cudf/core/udf/strings_typing.py +++ b/python/cudf/cudf/core/udf/strings_typing.py @@ -58,7 +58,7 @@ def len_typing(self, args, kws): if isinstance(args[0], MaskedType) and isinstance( args[0].value_type, (StringView, UDFString) ): - return nb_signature(MaskedType(size_type), string_view) + return nb_signature(MaskedType(size_type), MaskedType(string_view)) elif isinstance(args[0], types.StringLiteral) and len(args) == 1: return nb_signature(size_type, args[0]) diff --git a/python/strings_udf/strings_udf/tests/utils.py b/python/strings_udf/strings_udf/tests/utils.py deleted file mode 100644 index cbf7c2b0d05..00000000000 --- a/python/strings_udf/strings_udf/tests/utils.py +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. -import numba -from numba.core.typing.templates import AbstractTemplate -from numba.cuda.cudadecl import registry as cuda_decl_registry -from numba.cuda.cudaimpl import lower as cuda_lower - -from strings_udf._typing import StringView, string_view, udf_string -from strings_udf.lowering import cast_string_view_to_udf_string - - -def sv_to_udf_str(sv): - pass - - -@cuda_decl_registry.register_global(sv_to_udf_str) -class StringViewToUDFStringDecl(AbstractTemplate): - def generic(self, args, kws): - if isinstance(args[0], StringView) and len(args) == 1: - return numba.core.typing.signature(udf_string, string_view) - - -@cuda_lower(sv_to_udf_str, string_view) -def sv_to_udf_str_testing_lowering(context, builder, sig, args): - return cast_string_view_to_udf_string( - context, builder, sig.args[0], sig.return_type, args[0] - ) From 8a55a26f663e2d0ef9dafc4acc6da35260fc5157 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 30 Jan 2023 12:15:15 -0800 Subject: [PATCH 11/14] remove duplicate code --- python/cudf/cudf/tests/test_udf_masked_ops.py | 2 +- python/strings_udf/strings_udf/_testing.py | 26 +++++++++++++++++++ python/strings_udf/strings_udf/_typing.py | 12 --------- python/strings_udf/strings_udf/lowering.py | 14 +--------- .../strings_udf/tests/test_string_udfs.py | 2 +- 5 files changed, 29 insertions(+), 27 deletions(-) create mode 100644 python/strings_udf/strings_udf/_testing.py diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py index 89a93c822a5..7bbe39d228d 100644 --- a/python/cudf/cudf/tests/test_udf_masked_ops.py +++ b/python/cudf/cudf/tests/test_udf_masked_ops.py @@ -74,7 +74,7 @@ def run_masked_udf_test(func, data, args=(), **kwargs): def run_masked_string_udf_test(func, data, args=(), **kwargs): - from strings_udf._typing import sv_to_udf_str + from strings_udf._testing import sv_to_udf_str gdf = data pdf = data.to_pandas(nullable=True) diff --git a/python/strings_udf/strings_udf/_testing.py b/python/strings_udf/strings_udf/_testing.py new file mode 100644 index 00000000000..7be398bae76 --- /dev/null +++ b/python/strings_udf/strings_udf/_testing.py @@ -0,0 +1,26 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +from numba.core.typing import signature as nb_signature +from numba.core.typing.templates import AbstractTemplate +from numba.cuda.cudadecl import registry as cuda_decl_registry +from numba.cuda.cudaimpl import lower as cuda_lower + +from strings_udf._typing import StringView, string_view, udf_string +from strings_udf.lowering import cast_string_view_to_udf_string + + +def sv_to_udf_str(sv): + pass + + +@cuda_decl_registry.register_global(sv_to_udf_str) +class StringViewToUDFStringDecl(AbstractTemplate): + def generic(self, args, kws): + if isinstance(args[0], StringView) and len(args) == 1: + return nb_signature(udf_string, string_view) + + +@cuda_lower(sv_to_udf_str, string_view) +def sv_to_udf_str_testing_lowering(context, builder, sig, args): + return cast_string_view_to_udf_string( + context, builder, sig.args[0], sig.return_type, args[0] + ) diff --git a/python/strings_udf/strings_udf/_typing.py b/python/strings_udf/strings_udf/_typing.py index 9fff2e55806..3e7aea19043 100644 --- a/python/strings_udf/strings_udf/_typing.py +++ b/python/strings_udf/strings_udf/_typing.py @@ -112,18 +112,6 @@ def prepare_args(self, ty, val, **kwargs): str_view_arg_handler = StrViewArgHandler() -# for use in testing only -def sv_to_udf_str(sv): - pass - - -@cuda_decl_registry.register_global(sv_to_udf_str) -class StringViewToUDFStringDecl(AbstractTemplate): - def generic(self, args, kws): - if isinstance(args[0], StringView) and len(args) == 1: - return nb_signature(udf_string, string_view) - - # String functions @cuda_decl_registry.register_global(len) class StringLength(AbstractTemplate): diff --git a/python/strings_udf/strings_udf/lowering.py b/python/strings_udf/strings_udf/lowering.py index 2c915a4f758..10500ff1ec0 100644 --- a/python/strings_udf/strings_udf/lowering.py +++ b/python/strings_udf/strings_udf/lowering.py @@ -18,12 +18,7 @@ get_character_flags_table_ptr, get_special_case_mapping_table_ptr, ) -from strings_udf._typing import ( - size_type, - string_view, - sv_to_udf_str, - udf_string, -) +from strings_udf._typing import size_type, string_view, udf_string _STR_VIEW_PTR = types.CPointer(string_view) _UDF_STRING_PTR = types.CPointer(udf_string) @@ -564,10 +559,3 @@ def islower_impl(st, tbl): @create_unary_identifier_func("istitle") def istitle_impl(st, tbl): return _string_view_istitle(st, tbl) - - -@cuda_lower(sv_to_udf_str, string_view) -def sv_to_udf_str_testing_lowering(context, builder, sig, args): - return cast_string_view_to_udf_string( - context, builder, sig.args[0], sig.return_type, args[0] - ) diff --git a/python/strings_udf/strings_udf/tests/test_string_udfs.py b/python/strings_udf/strings_udf/tests/test_string_udfs.py index 10d2a44aaa9..965799bbd3e 100644 --- a/python/strings_udf/strings_udf/tests/test_string_udfs.py +++ b/python/strings_udf/strings_udf/tests/test_string_udfs.py @@ -7,7 +7,6 @@ from numba import cuda from numba.core.typing import signature as nb_signature from numba.types import CPointer, void -from utils import sv_to_udf_str import cudf import rmm @@ -18,6 +17,7 @@ column_from_udf_string_array, column_to_string_view_array, ) +from strings_udf._testing import sv_to_udf_str from strings_udf._typing import str_view_arg_handler, string_view, udf_string From 26a6f887ce6077c4fca0826d157cd7ce0c314610 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 22 Feb 2023 10:00:34 -0800 Subject: [PATCH 12/14] remove old file --- python/strings_udf/strings_udf/_testing.py | 26 ---------------------- 1 file changed, 26 deletions(-) delete mode 100644 python/strings_udf/strings_udf/_testing.py diff --git a/python/strings_udf/strings_udf/_testing.py b/python/strings_udf/strings_udf/_testing.py deleted file mode 100644 index 7be398bae76..00000000000 --- a/python/strings_udf/strings_udf/_testing.py +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. -from numba.core.typing import signature as nb_signature -from numba.core.typing.templates import AbstractTemplate -from numba.cuda.cudadecl import registry as cuda_decl_registry -from numba.cuda.cudaimpl import lower as cuda_lower - -from strings_udf._typing import StringView, string_view, udf_string -from strings_udf.lowering import cast_string_view_to_udf_string - - -def sv_to_udf_str(sv): - pass - - -@cuda_decl_registry.register_global(sv_to_udf_str) -class StringViewToUDFStringDecl(AbstractTemplate): - def generic(self, args, kws): - if isinstance(args[0], StringView) and len(args) == 1: - return nb_signature(udf_string, string_view) - - -@cuda_lower(sv_to_udf_str, string_view) -def sv_to_udf_str_testing_lowering(context, builder, sig, args): - return cast_string_view_to_udf_string( - context, builder, sig.args[0], sig.return_type, args[0] - ) From 254a583a58a38ef574a3d2114a1586c7acb6cc3f Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 1 Mar 2023 07:12:18 -0800 Subject: [PATCH 13/14] address reviews --- python/cudf/cudf/core/udf/strings_lowering.py | 3 +- python/cudf/cudf/core/udf/strings_typing.py | 89 +++++++++++-------- python/cudf/cudf/tests/test_string_udfs.py | 6 +- 3 files changed, 55 insertions(+), 43 deletions(-) diff --git a/python/cudf/cudf/core/udf/strings_lowering.py b/python/cudf/cudf/core/udf/strings_lowering.py index 6086f3a2069..a53722f505d 100644 --- a/python/cudf/cudf/core/udf/strings_lowering.py +++ b/python/cudf/cudf/core/udf/strings_lowering.py @@ -162,7 +162,7 @@ def cast_udf_string_to_string_view(context, builder, fromty, toty, val): sv_ptr = builder.alloca(default_manager[toty].get_value_type()) builder.store(val, udf_str_ptr) - _ = context.compile_internal( + context.compile_internal( builder, call_create_string_view_from_udf_string, nb_signature(types.void, _UDF_STRING_PTR, _STR_VIEW_PTR), @@ -201,7 +201,6 @@ def call_len_string_view(st): @cuda_lower(len, string_view) -@cuda_lower(len, udf_string) def len_impl(context, builder, sig, args): sv_ptr = builder.alloca(args[0].type) builder.store(args[0], sv_ptr) diff --git a/python/cudf/cudf/core/udf/strings_typing.py b/python/cudf/cudf/core/udf/strings_typing.py index 8b239f4b806..50d34be40a0 100644 --- a/python/cudf/cudf/core/udf/strings_typing.py +++ b/python/cudf/cudf/core/udf/strings_typing.py @@ -15,23 +15,6 @@ size_type = types.int32 -bool_binary_funcs = ["startswith", "endswith"] -int_binary_funcs = ["find", "rfind"] -id_unary_funcs = [ - "isalpha", - "isalnum", - "isdecimal", - "isdigit", - "isupper", - "islower", - "isspace", - "isnumeric", - "istitle", -] -string_unary_funcs = ["upper", "lower"] -string_return_attrs = ["strip", "lstrip", "rstrip"] - - # String object definitions class UDFString(types.Type): @@ -217,34 +200,62 @@ def generic(self, args, kws): class StringViewAttrs(AttributeTemplate): key = string_view - resolve_startswith = create_binary_attr("startswith", types.boolean) - resolve_endswith = create_binary_attr("endswith", types.boolean) + def resolve_count(self, mod): + return types.BoundFunction(StringViewCount, string_view) + + def resolve_replace(self, mod): + return types.BoundFunction(StringViewReplace, string_view) + + +bool_binary_funcs = ["startswith", "endswith"] +int_binary_funcs = ["find", "rfind"] +id_unary_funcs = [ + "isalpha", + "isalnum", + "isdecimal", + "isdigit", + "isupper", + "islower", + "isspace", + "isnumeric", + "istitle", +] +string_unary_funcs = ["upper", "lower"] +string_return_attrs = ["strip", "lstrip", "rstrip"] - resolve_strip = create_binary_attr("strip", udf_string) - resolve_lstrip = create_binary_attr("lstrip", udf_string) - resolve_rstrip = create_binary_attr("rstrip", udf_string) +for func in bool_binary_funcs: + setattr( + StringViewAttrs, + f"resolve_{func}", + create_binary_attr(func, types.boolean), + ) - resolve_find = create_binary_attr("find", size_type) - resolve_rfind = create_binary_attr("rfind", size_type) +for func in string_return_attrs: + setattr( + StringViewAttrs, + f"resolve_{func}", + create_binary_attr(func, udf_string), + ) - resolve_isalpha = create_identifier_attr("isalpha", types.boolean) - resolve_isalnum = create_identifier_attr("isalnum", types.boolean) - resolve_isdecimal = create_identifier_attr("isdecimal", types.boolean) - resolve_isdigit = create_identifier_attr("isdigit", types.boolean) - resolve_isupper = create_identifier_attr("isupper", types.boolean) - resolve_islower = create_identifier_attr("islower", types.boolean) - resolve_isspace = create_identifier_attr("isspace", types.boolean) - resolve_isnumeric = create_identifier_attr("isnumeric", types.boolean) - resolve_istitle = create_identifier_attr("istitle", types.boolean) - resolve_upper = create_identifier_attr("upper", udf_string) - resolve_lower = create_identifier_attr("lower", udf_string) +for func in int_binary_funcs: + setattr( + StringViewAttrs, f"resolve_{func}", create_binary_attr(func, size_type) + ) - def resolve_count(self, mod): - return types.BoundFunction(StringViewCount, string_view) +for func in id_unary_funcs: + setattr( + StringViewAttrs, + f"resolve_{func}", + create_identifier_attr(func, types.boolean), + ) - def resolve_replace(self, mod): - return types.BoundFunction(StringViewReplace, string_view) +for func in string_unary_funcs: + setattr( + StringViewAttrs, + f"resolve_{func}", + create_identifier_attr(func, udf_string), + ) @cuda_decl_registry.register_attr diff --git a/python/cudf/cudf/tests/test_string_udfs.py b/python/cudf/cudf/tests/test_string_udfs.py index c03c52f4520..049dfdc8e30 100644 --- a/python/cudf/cudf/tests/test_string_udfs.py +++ b/python/cudf/cudf/tests/test_string_udfs.py @@ -26,9 +26,11 @@ def get_kernels(func, dtype, size): """ - Create a kernel for testing a single scalar string function + Create two kernels for testing a single scalar string function. + The first tests the function's action on a string_view object and + the second tests the same except using a udf_string object. Allocates an output vector with a dtype specified by the caller - The returned kernel executes the input function on each data + The returned kernels execute the input function on each data element of the input and returns the output into the output vector """ From 0840cdc118688599661480fdde2106d81e66e3cb Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 1 Mar 2023 07:24:19 -0800 Subject: [PATCH 14/14] add docs to sv_to_udf_str --- python/cudf/cudf/testing/_utils.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py index 6db201beb8a..24a0b69a48b 100644 --- a/python/cudf/cudf/testing/_utils.py +++ b/python/cudf/cudf/testing/_utils.py @@ -396,6 +396,21 @@ def expect_warning_if(condition, warning=FutureWarning, *args, **kwargs): def sv_to_udf_str(sv): + """ + Cast a string_view object to a udf_string object + + This placeholder function never runs in python + It exists only for numba to have something to replace + with the typing and lowering code below + + This is similar conceptually to needing a translation + engine to emit an expression in target language "B" when + there is no equivalent in the source language "A" to + translate from. This function effectively defines the + expression in language "A" and the associated typing + and lowering describe the translation process, despite + the expression having no meaning in language "A" + """ pass