From 663ca3a66f9ac40a95bc3d48fe5344d4d29cf22f Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 30 Sep 2024 17:03:31 -0700 Subject: [PATCH 1/9] Start converting --- python/cudf/cudf/_lib/string_casting.pyx | 82 +++-------------- .../strings/convert/convert_booleans.pxd | 4 +- .../strings/convert/convert_datetime.pxd | 6 +- .../pylibcudf/strings/convert/CMakeLists.txt | 2 +- .../pylibcudf/strings/convert/__init__.pxd | 2 +- .../pylibcudf/strings/convert/__init__.py | 2 +- .../strings/convert/convert_booleans.pxd | 9 ++ .../strings/convert/convert_booleans.pyx | 92 +++++++++++++++++++ .../strings/convert/convert_datetime.pxd | 5 + .../strings/convert/convert_datetime.pyx | 2 +- .../tests/test_string_convert_booleans.py | 26 ++++++ 11 files changed, 152 insertions(+), 80 deletions(-) create mode 100644 python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pxd create mode 100644 python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx create mode 100644 python/pylibcudf/pylibcudf/tests/test_string_convert_booleans.py diff --git a/python/cudf/cudf/_lib/string_casting.pyx b/python/cudf/cudf/_lib/string_casting.pyx index 60a6795a402..900444732ee 100644 --- a/python/cudf/cudf/_lib/string_casting.pyx +++ b/python/cudf/cudf/_lib/string_casting.pyx @@ -15,10 +15,6 @@ from libcpp.utility cimport move from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.convert.convert_booleans cimport ( - from_booleans as cpp_from_booleans, - to_booleans as cpp_to_booleans, -) from pylibcudf.libcudf.strings.convert.convert_datetime cimport ( is_timestamp as cpp_is_timestamp, ) @@ -427,77 +423,21 @@ def stoul(Column input_col): return string_to_integer(input_col, cudf.dtype("uint64")) -def _to_booleans(Column input_col, object string_true="True"): - """ - Converting/Casting input column of type string to boolean column - - Parameters - ---------- - input_col : input column of type string - string_true : string that represents True - - Returns - ------- - A Column with string values cast to boolean - """ - - cdef DeviceScalar str_true = as_device_scalar(string_true) - cdef column_view input_column_view = input_col.view() - cdef const string_scalar* string_scalar_true = ( - str_true.get_raw_ptr()) - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_to_booleans( - input_column_view, - string_scalar_true[0])) - - return Column.from_unique_ptr(move(c_result)) - - def to_booleans(Column input_col): - - return _to_booleans(input_col) - - -def _from_booleans( - Column input_col, - object string_true="True", - object string_false="False"): - """ - Converting/Casting input column of type boolean to string column - - Parameters - ---------- - input_col : input column of type boolean - string_true : string that represents True - string_false : string that represents False - - Returns - ------- - A Column with boolean values cast to string - """ - - cdef DeviceScalar str_true = as_device_scalar(string_true) - cdef DeviceScalar str_false = as_device_scalar(string_false) - cdef column_view input_column_view = input_col.view() - cdef const string_scalar* string_scalar_true = ( - str_true.get_raw_ptr()) - cdef const string_scalar* string_scalar_false = ( - str_false.get_raw_ptr()) - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_from_booleans( - input_column_view, - string_scalar_true[0], - string_scalar_false[0])) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_booleans.to_booleans( + input_col.to_pylibcudf(mode="read"), + as_device_scalar("True").device_value.c_value, + ) + return Column.from_pylibcudf(plc_column) def from_booleans(Column input_col): - return _from_booleans(input_col) + plc_column = plc.strings.convert.convert_booleans.from_booleans( + input_col.to_pylibcudf(mode="read"), + as_device_scalar("True").device_value.c_value, + as_device_scalar("False").device_value.c_value, + ) + return Column.from_pylibcudf(plc_column) def int2timestamp( diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_booleans.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_booleans.pxd index 83a9573baad..e6688cfff81 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_booleans.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_booleans.pxd @@ -8,10 +8,10 @@ from pylibcudf.libcudf.scalar.scalar cimport string_scalar cdef extern from "cudf/strings/convert/convert_booleans.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] to_booleans( - column_view input_col, + column_view input, string_scalar true_string) except + cdef unique_ptr[column] from_booleans( - column_view input_col, + column_view booleans, string_scalar true_string, string_scalar false_string) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_datetime.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_datetime.pxd index fa8975c4df9..fceddd58df0 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_datetime.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_datetime.pxd @@ -10,14 +10,14 @@ from pylibcudf.libcudf.types cimport data_type cdef extern from "cudf/strings/convert/convert_datetime.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] to_timestamps( - column_view input_col, + column_view input, data_type timestamp_type, string format) except + cdef unique_ptr[column] from_timestamps( - column_view input_col, + column_view timestamps, string format, - column_view input_strings_names) except + + column_view names) except + cdef unique_ptr[column] is_timestamp( column_view input_col, diff --git a/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt index 175c9b3738e..3febc78dfd2 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt @@ -12,7 +12,7 @@ # the License. # ============================================================================= -set(cython_sources convert_durations.pyx convert_datetime.pyx) +set(cython_sources convert_booleans.pyx convert_durations.pyx convert_datetime.pyx) set(linked_libraries cudf::cudf) rapids_cython_create_modules( diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd b/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd index 05324cb49df..5525bca46d6 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd +++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd @@ -1,2 +1,2 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . cimport convert_datetime, convert_durations +from . cimport convert_booleans, convert_datetime, convert_durations diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.py b/python/pylibcudf/pylibcudf/strings/convert/__init__.py index d803399d53c..2340ebe9a26 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.py @@ -1,2 +1,2 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . import convert_datetime, convert_durations +from . import convert_booleans, convert_datetime, convert_durations diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pxd new file mode 100644 index 00000000000..312ac3c0ca0 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pxd @@ -0,0 +1,9 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.scalar cimport Scalar + + +cpdef Column to_booleans(Column input, Scalar true_string) + +cpdef Column from_booleans(Column booleans, Scalar true_string, Scalar false_string) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx new file mode 100644 index 00000000000..995ae61c53a --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx @@ -0,0 +1,92 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.string cimport string +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.strings.convert cimport ( + convert_booleans as cpp_convert_booleans, +) +from pylibcudf.scalar cimport Scalar + +from cython.operator import dereference + + +cpdef Column to_booleans(Column input, Scalar true_string): + """ + Returns a new bool column by parsing boolean values from the strings + in the provided strings column. + + For details, see :cpp:func:`cudf::strings::to_booleans`. + + Parameters + ---------- + input : Column + Strings instance for this operation + + true_string : Scalar + String to expect for true. Non-matching strings are false + + Returns + ------- + Column + New bool column converted from strings. + """ + cdef unique_ptr[column] c_result + cdef const string_scalar* c_true_string = ( + true_string.c_obj.get() + ) + + with nogil: + c_result = move( + cpp_convert_booleans.to_booleans( + input.view(), + dereference(c_true_string) + ) + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column from_booleans(Column booleans, Scalar true_string, Scalar false_string): + """ + Returns a new strings column converting the boolean values from the + provided column into strings. + + For details, see :cpp:func:`cudf::strings::from_booleans`. + + Parameters + ---------- + booleans : Column + Boolean column to convert. + + true_string : Scalar + String to use for true in the output column. + + false_string : Scalar + String to use for false in the output column. + + Returns + ------- + Column + New strings column. + """ + cdef unique_ptr[column] c_result + cdef const string_scalar* c_true_string = ( + true_string.c_obj.get() + ) + cdef const string_scalar* c_false_string = ( + false_string.c_obj.get() + ) + + with nogil: + c_result = move( + cpp_convert_booleans.from_booleans( + booleans.view(), + dereference(c_true_string), + dereference(c_false_string), + ) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pxd index 07c84d263d6..be284406c45 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pxd +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pxd @@ -16,3 +16,8 @@ cpdef Column from_timestamps( const string& format, Column input_strings_names ) + +cpdef Column is_timestamp( + Column input, + const string& format, +) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx index fcacb096f87..d8a383cda6a 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx @@ -28,7 +28,7 @@ cpdef Column to_timestamps( return Column.from_libcudf(move(c_result)) cpdef Column from_timestamps( - Column input, + Column timestamps, const string& format, Column input_strings_names ): diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_booleans.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_booleans.py new file mode 100644 index 00000000000..117c59ff1b8 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_booleans.py @@ -0,0 +1,26 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pylibcudf as plc +from utils import assert_column_eq + + +def test_to_booleans(): + pa_array = pa.array(["true", None, "True"]) + result = plc.strings.convert.convert_booleans.to_booleans( + plc.interop.from_arrow(pa_array), + plc.interop.from_arrow(pa.scalar("True")), + ) + expected = pa.array([False, None, True]) + assert_column_eq(result, expected) + + +def test_from_booleans(): + pa_array = pa.array([True, None, False]) + result = plc.strings.convert.convert_booleans.from_booleans( + plc.interop.from_arrow(pa_array), + plc.interop.from_arrow(pa.scalar("A")), + plc.interop.from_arrow(pa.scalar("B")), + ) + expected = pa.array(["A", None, "B"]) + assert_column_eq(result, expected) From 8b475da489bede3f90f399636946cfec688260b6 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 1 Oct 2024 18:25:47 -0700 Subject: [PATCH 2/9] Finish convert_datetime --- python/cudf/cudf/_lib/string_casting.pyx | 28 ++----- .../groupby/test_ordering_pandas_compat.py | 29 ------- .../strings/convert/convert_datetime.pxd | 8 +- .../strings/convert/convert_datetime.pyx | 80 +++++++++++++++++-- .../tests/test_string_convert_datetime.py | 46 +++++++++++ 5 files changed, 130 insertions(+), 61 deletions(-) delete mode 100644 python/cudf/cudf/tests/groupby/test_ordering_pandas_compat.py create mode 100644 python/pylibcudf/pylibcudf/tests/test_string_convert_datetime.py diff --git a/python/cudf/cudf/_lib/string_casting.pyx b/python/cudf/cudf/_lib/string_casting.pyx index 900444732ee..f00915e6f86 100644 --- a/python/cudf/cudf/_lib/string_casting.pyx +++ b/python/cudf/cudf/_lib/string_casting.pyx @@ -3,9 +3,6 @@ from cudf._lib.column cimport Column from cudf._lib.scalar import as_device_scalar - -from cudf._lib.scalar cimport DeviceScalar - from cudf._lib.types import SUPPORTED_NUMPY_TO_LIBCUDF_TYPES from libcpp.memory cimport unique_ptr @@ -14,10 +11,6 @@ from libcpp.utility cimport move from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.convert.convert_datetime cimport ( - is_timestamp as cpp_is_timestamp, -) from pylibcudf.libcudf.strings.convert.convert_floats cimport ( from_floats as cpp_from_floats, to_floats as cpp_to_floats, @@ -460,11 +453,10 @@ def int2timestamp( A Column with date-time represented in string format """ - cdef string c_timestamp_format = format.encode("UTF-8") return Column.from_pylibcudf( plc.strings.convert.convert_datetime.from_timestamps( input_col.to_pylibcudf(mode="read"), - c_timestamp_format, + format, names.to_pylibcudf(mode="read") ) ) @@ -485,12 +477,11 @@ def timestamp2int(Column input_col, dtype, format): """ dtype = dtype_to_pylibcudf_type(dtype) - cdef string c_timestamp_format = format.encode('UTF-8') return Column.from_pylibcudf( plc.strings.convert.convert_datetime.to_timestamps( input_col.to_pylibcudf(mode="read"), dtype, - c_timestamp_format + format ) ) @@ -512,16 +503,11 @@ def istimestamp(Column input_col, str format): """ if input_col.size == 0: return cudf.core.column.column_empty(0, dtype=cudf.dtype("bool")) - cdef column_view input_column_view = input_col.view() - cdef string c_timestamp_format = str(format).encode('UTF-8') - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_is_timestamp( - input_column_view, - c_timestamp_format)) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_datetime.is_timestamp( + input_col.to_pylibcudf(mode="read"), + format + ) + return Column.from_pylibcudf(plc_column) def timedelta2int(Column input_col, dtype, format): diff --git a/python/cudf/cudf/tests/groupby/test_ordering_pandas_compat.py b/python/cudf/cudf/tests/groupby/test_ordering_pandas_compat.py deleted file mode 100644 index a009802bab0..00000000000 --- a/python/cudf/cudf/tests/groupby/test_ordering_pandas_compat.py +++ /dev/null @@ -1,29 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. -import numpy as np -import pytest - -import cudf -from cudf.testing import assert_eq - - -@pytest.fixture(params=[False, True], ids=["without_nulls", "with_nulls"]) -def with_nulls(request): - return request.param - - -@pytest.mark.parametrize("nrows", [30, 300, 300_000]) -@pytest.mark.parametrize("nkeys", [1, 2, 4]) -def test_groupby_maintain_order_random(nrows, nkeys, with_nulls): - key_names = [f"key{key}" for key in range(nkeys)] - key_values = [np.random.randint(100, size=nrows) for _ in key_names] - value = np.random.randint(-100, 100, size=nrows) - df = cudf.DataFrame(dict(zip(key_names, key_values), value=value)) - if with_nulls: - for key in key_names: - df.loc[df[key] == 1, key] = None - with cudf.option_context("mode.pandas_compatible", True): - got = df.groupby(key_names, sort=False).agg({"value": "sum"}) - expect = ( - df.to_pandas().groupby(key_names, sort=False).agg({"value": "sum"}) - ) - assert_eq(expect, got, check_index_type=not with_nulls) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pxd index be284406c45..80ec168644b 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pxd +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pxd @@ -8,16 +8,16 @@ from pylibcudf.types cimport DataType cpdef Column to_timestamps( Column input, DataType timestamp_type, - const string& format + str format ) cpdef Column from_timestamps( - Column input, - const string& format, + Column timestamps, + str format, Column input_strings_names ) cpdef Column is_timestamp( Column input, - const string& format, + str format, ) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx index d8a383cda6a..0ee60812e00 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx @@ -15,28 +15,74 @@ from pylibcudf.types import DataType cpdef Column to_timestamps( Column input, DataType timestamp_type, - const string& format + str format ): + """ + Returns a new timestamp column converting a strings column into + timestamps using the provided format pattern. + + For details, see cpp:`cudf::strings::to_timestamps`. + + Parameters + ---------- + input : Column + Strings instance for this operation. + + timestamp_type : DataType + The timestamp type used for creating the output column. + + format : str + String specifying the timestamp format in strings. + + Returns + ------- + Column + New datetime column + """ cdef unique_ptr[column] c_result + cdef string c_format = format.encode() with nogil: c_result = cpp_convert_datetime.to_timestamps( input.view(), timestamp_type.c_obj, - format + c_format ) return Column.from_libcudf(move(c_result)) cpdef Column from_timestamps( Column timestamps, - const string& format, + str format, Column input_strings_names ): + """ + Returns a new strings column converting a timestamp column into + strings using the provided format pattern. + + For details, see cpp:`cudf::strings::from_timestamps`. + + Parameters + ---------- + timestamps : Column + Timestamp values to convert + + format : str + The string specifying output format. + + input_strings_names : Column + The string names to use for weekdays ("%a", "%A") and months ("%b", "%B"). + + Returns + ------- + Column + New strings column with formatted timestamps. + """ cdef unique_ptr[column] c_result + cdef string c_format = format.encode() with nogil: c_result = cpp_convert_datetime.from_timestamps( - input.view(), - format, + timestamps.view(), + c_format, input_strings_names.view() ) @@ -44,13 +90,33 @@ cpdef Column from_timestamps( cpdef Column is_timestamp( Column input, - const string& format + str format ): + """ + Verifies the given strings column can be parsed to timestamps + using the provided format pattern. + + For details, see cpp:`cudf::strings::is_timestamp`. + + Parameters + ---------- + input : Column + Strings instance for this operation. + + format : str + String specifying the timestamp format in strings. + + Returns + ------- + Column + New bool column. + """ cdef unique_ptr[column] c_result + cdef string c_format = format.encode() with nogil: c_result = cpp_convert_datetime.is_timestamp( input.view(), - format + c_format ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_datetime.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_datetime.py new file mode 100644 index 00000000000..f3e84286a36 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_datetime.py @@ -0,0 +1,46 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +import datetime + +import pyarrow as pa +import pyarrow.compute as pc +import pylibcudf as plc +import pytest +from utils import assert_column_eq + + +@pytest.fixture +def fmt(): + return "%Y-%m-%dT%H:%M:%S" + + +def test_to_timestamp(fmt): + arr = pa.array(["2020-01-01T01:01:01", None]) + result = plc.strings.convert.convert_datetime.to_timestamps( + plc.interop.from_arrow(arr), + plc.DataType(plc.TypeId.TIMESTAMP_SECONDS), + fmt, + ) + expected = pc.strptime(arr, fmt, "s") + assert_column_eq(result, expected) + + +def test_from_timestamp(fmt): + arr = pa.array([datetime.datetime(2020, 1, 1, 1, 1, 1), None]) + result = plc.strings.convert.convert_datetime.from_timestamps( + plc.interop.from_arrow(arr), + fmt, + plc.interop.from_arrow(pa.array([], type=pa.string())), + ) + # pc.strftime will add the extra %f + expected = pa.array(["2020-01-01T01:01:01", None]) + assert_column_eq(result, expected) + + +def test_is_timestamp(fmt): + arr = pa.array(["2020-01-01T01:01:01", None, "2020-01-01"]) + result = plc.strings.convert.convert_datetime.is_timestamp( + plc.interop.from_arrow(arr), + fmt, + ) + expected = pa.array([True, None, False]) + assert_column_eq(result, expected) From 81b3bc9a88be9db09a4265ea20c9e72a7cd4ad78 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 1 Oct 2024 18:26:53 -0700 Subject: [PATCH 3/9] Unused import --- python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx | 1 - 1 file changed, 1 deletion(-) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx index 995ae61c53a..0c10f821ab6 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx @@ -1,7 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr -from libcpp.string cimport string from libcpp.utility cimport move from pylibcudf.column cimport Column from pylibcudf.libcudf.column.column cimport column From 3e393c564d636efc01fa6756b0c9e3ee4b1c05cc Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 2 Oct 2024 10:15:25 -0700 Subject: [PATCH 4/9] Add back test file --- .../groupby/test_ordering_pandas_compat.py | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 python/cudf/cudf/tests/groupby/test_ordering_pandas_compat.py diff --git a/python/cudf/cudf/tests/groupby/test_ordering_pandas_compat.py b/python/cudf/cudf/tests/groupby/test_ordering_pandas_compat.py new file mode 100644 index 00000000000..a009802bab0 --- /dev/null +++ b/python/cudf/cudf/tests/groupby/test_ordering_pandas_compat.py @@ -0,0 +1,29 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +import numpy as np +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.fixture(params=[False, True], ids=["without_nulls", "with_nulls"]) +def with_nulls(request): + return request.param + + +@pytest.mark.parametrize("nrows", [30, 300, 300_000]) +@pytest.mark.parametrize("nkeys", [1, 2, 4]) +def test_groupby_maintain_order_random(nrows, nkeys, with_nulls): + key_names = [f"key{key}" for key in range(nkeys)] + key_values = [np.random.randint(100, size=nrows) for _ in key_names] + value = np.random.randint(-100, 100, size=nrows) + df = cudf.DataFrame(dict(zip(key_names, key_values), value=value)) + if with_nulls: + for key in key_names: + df.loc[df[key] == 1, key] = None + with cudf.option_context("mode.pandas_compatible", True): + got = df.groupby(key_names, sort=False).agg({"value": "sum"}) + expect = ( + df.to_pandas().groupby(key_names, sort=False).agg({"value": "sum"}) + ) + assert_eq(expect, got, check_index_type=not with_nulls) From 88ab4b1015dda10b10047c8f691930b2d7fccfaf Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 2 Oct 2024 10:25:06 -0700 Subject: [PATCH 5/9] Don't hardcode 12 in test_wrap --- python/pylibcudf/pylibcudf/tests/test_string_wrap.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_wrap.py b/python/pylibcudf/pylibcudf/tests/test_string_wrap.py index 85abd3a2bae..a1c820cd586 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_wrap.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_wrap.py @@ -7,6 +7,7 @@ def test_wrap(): + width = 12 pa_array = pa.array( [ "the quick brown fox jumped over the lazy brown dog", @@ -14,10 +15,10 @@ def test_wrap(): None, ] ) - result = plc.strings.wrap.wrap(plc.interop.from_arrow(pa_array), 12) + result = plc.strings.wrap.wrap(plc.interop.from_arrow(pa_array), width) expected = pa.array( [ - textwrap.fill(val, 12) if isinstance(val, str) else val + textwrap.fill(val, width) if isinstance(val, str) else val for val in pa_array.to_pylist() ] ) From c4c230f7bbfdf6ce3888bb5d0fed4db9cfe07951 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 2 Oct 2024 10:27:21 -0700 Subject: [PATCH 6/9] Adjust test --- python/pylibcudf/pylibcudf/tests/test_string_convert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert.py b/python/pylibcudf/pylibcudf/tests/test_string_convert.py index e9e95459d0e..22bb4971cb1 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_convert.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert.py @@ -62,7 +62,7 @@ def test_to_datetime( got = plc.strings.convert.convert_datetime.to_timestamps( plc_timestamp_col, plc.interop.from_arrow(timestamp_type), - format.encode(), + format, ) assert_column_eq(expect, got) From 70513c02028ffb4ad0010236e3ccbb29323b03c2 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 2 Oct 2024 12:25:23 -0700 Subject: [PATCH 7/9] Remove .device_value --- python/cudf/cudf/_lib/string_casting.pyx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/_lib/string_casting.pyx b/python/cudf/cudf/_lib/string_casting.pyx index f00915e6f86..55ff38f472d 100644 --- a/python/cudf/cudf/_lib/string_casting.pyx +++ b/python/cudf/cudf/_lib/string_casting.pyx @@ -419,7 +419,7 @@ def stoul(Column input_col): def to_booleans(Column input_col): plc_column = plc.strings.convert.convert_booleans.to_booleans( input_col.to_pylibcudf(mode="read"), - as_device_scalar("True").device_value.c_value, + as_device_scalar("True").c_value, ) return Column.from_pylibcudf(plc_column) @@ -427,8 +427,8 @@ def to_booleans(Column input_col): def from_booleans(Column input_col): plc_column = plc.strings.convert.convert_booleans.from_booleans( input_col.to_pylibcudf(mode="read"), - as_device_scalar("True").device_value.c_value, - as_device_scalar("False").device_value.c_value, + as_device_scalar("True").c_value, + as_device_scalar("False").c_value, ) return Column.from_pylibcudf(plc_column) From 1d28722d1f64a9c9aae902cdaabeac20623b929e Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 2 Oct 2024 17:28:12 -0700 Subject: [PATCH 8/9] Fix polars usage --- python/cudf_polars/cudf_polars/dsl/expr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index c401e5a2f17..b600050c60d 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -937,7 +937,7 @@ def do_evaluate( ) return Column( plc.strings.convert.convert_datetime.to_timestamps( - res.columns()[0], self.dtype, format.encode() + res.columns()[0], self.dtype, format ) ) elif self.name == pl_expr.StringFunction.Replace: From 756accc6cb74040370244efe9894abf2f704e62a Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 3 Oct 2024 09:26:01 -0700 Subject: [PATCH 9/9] Remove another format.encode --- python/cudf_polars/cudf_polars/dsl/expr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index b600050c60d..54476b7fedc 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -914,7 +914,7 @@ def do_evaluate( col = self.children[0].evaluate(df, context=context, mapping=mapping) is_timestamps = plc.strings.convert.convert_datetime.is_timestamp( - col.obj, format.encode() + col.obj, format ) if strict: