From 3d0dc8abdbb5dd8d0e73ab542fcffefec95eb7bc Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 14 Oct 2024 06:56:29 -0700 Subject: [PATCH 01/19] fix --- .../cudf_polars/dsl/expressions/unary.py | 14 +++++++++- .../tests/expressions/test_stringfunction.py | 27 +++++++++++++++++++ 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/unary.py b/python/cudf_polars/cudf_polars/dsl/expressions/unary.py index 3d4d15be1ce..6614308b5d7 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/unary.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/unary.py @@ -33,7 +33,19 @@ class Cast(Expr): def __init__(self, dtype: plc.DataType, value: Expr) -> None: super().__init__(dtype) self.children = (value,) - if not dtypes.can_cast(value.dtype, self.dtype): + if ( + self.dtype.id() == plc.TypeId.STRING + or value.dtype.id() == plc.TypeId.STRING + ): + if not ( + self.dtype.id() == plc.TypeId.STRING + and value.dtype.id() in {plc.TypeId.FLOAT32, plc.TypeId.FLOAT64} + ) or ( + self.dtype.id() in {plc.TypeId.FLOAT32, plc.TypeId.FLOAT64} + and value.dtype.id() == plc.TypeId.STRING + ): + raise NotImplementedError("Only string to float cast is supported") + elif not dtypes.can_cast(value.dtype, self.dtype): raise NotImplementedError( f"Can't cast {self.dtype.id().name} to {value.dtype.id().name}" ) diff --git a/python/cudf_polars/tests/expressions/test_stringfunction.py b/python/cudf_polars/tests/expressions/test_stringfunction.py index 4f6850ac977..adf89f53d8d 100644 --- a/python/cudf_polars/tests/expressions/test_stringfunction.py +++ b/python/cudf_polars/tests/expressions/test_stringfunction.py @@ -40,6 +40,28 @@ def ldf(with_nulls): ) +@pytest.fixture +def str_to_float_data(with_nulls): + a = [ + "1.1", + "2.2", + "3.3", + "4.4", + "5.5", + "6.6", + "7.7", + "8.8", + "9.9", + "10.10", + "11.11", + "12.12", + ] + if with_nulls: + a[4] = None + a[-3] = None + return pl.LazyFrame({"a": a}) + + slice_cases = [ (1, 3), (0, 3), @@ -337,3 +359,8 @@ def test_unsupported_regex_raises(pattern): q = df.select(pl.col("a").str.contains(pattern, strict=True)) assert_ir_translation_raises(q, NotImplementedError) + + +def test_string_to_float(str_to_float_data): + query = str_to_float_data.select(pl.col("a").cast(pl.Float64)) + assert_gpu_result_equal(query) From 59ceb03f297008a5246914266f7f4a83ccd7dd71 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 14 Oct 2024 07:04:54 -0700 Subject: [PATCH 02/19] passing --- .../cudf_polars/dsl/expressions/unary.py | 24 ++++++++++++++----- 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/unary.py b/python/cudf_polars/cudf_polars/dsl/expressions/unary.py index 6614308b5d7..a7bdca631a1 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/unary.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/unary.py @@ -38,11 +38,14 @@ def __init__(self, dtype: plc.DataType, value: Expr) -> None: or value.dtype.id() == plc.TypeId.STRING ): if not ( - self.dtype.id() == plc.TypeId.STRING - and value.dtype.id() in {plc.TypeId.FLOAT32, plc.TypeId.FLOAT64} - ) or ( - self.dtype.id() in {plc.TypeId.FLOAT32, plc.TypeId.FLOAT64} - and value.dtype.id() == plc.TypeId.STRING + ( + self.dtype.id() == plc.TypeId.STRING + and value.dtype.id() in {plc.TypeId.FLOAT32, plc.TypeId.FLOAT64} + ) + or ( + self.dtype.id() in {plc.TypeId.FLOAT32, plc.TypeId.FLOAT64} + and value.dtype.id() == plc.TypeId.STRING + ) ): raise NotImplementedError("Only string to float cast is supported") elif not dtypes.can_cast(value.dtype, self.dtype): @@ -60,7 +63,16 @@ def do_evaluate( """Evaluate this expression given a dataframe for context.""" (child,) = self.children column = child.evaluate(df, context=context, mapping=mapping) - return Column(plc.unary.cast(column.obj, self.dtype)).sorted_like(column) + if ( + self.dtype.id() == plc.TypeId.STRING + or column.obj.type().id() == plc.TypeId.STRING + ): + result = plc.strings.convert.convert_floats.to_floats( + column.obj, self.dtype + ) + else: + result = plc.unary.cast(column.obj, self.dtype) + return Column(result).sorted_like(column) def collect_agg(self, *, depth: int) -> AggInfo: """Collect information about aggregations in groupbys.""" From be0fae9bdc34beb6fd2b8f984b8b64f07b9a2820 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 14 Oct 2024 07:44:12 -0700 Subject: [PATCH 03/19] more tests, needs refactor --- .../cudf_polars/dsl/expressions/unary.py | 31 +++++-- .../tests/expressions/test_stringfunction.py | 87 ++++++++++++++++++- 2 files changed, 108 insertions(+), 10 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/unary.py b/python/cudf_polars/cudf_polars/dsl/expressions/unary.py index a7bdca631a1..8cf28801986 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/unary.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/unary.py @@ -9,6 +9,7 @@ import pyarrow as pa import pylibcudf as plc +from pylibcudf.traits import is_floating_point, is_integral_not_bool from cudf_polars.containers import Column from cudf_polars.dsl.expressions.base import AggInfo, ExecutionContext, Expr @@ -23,6 +24,10 @@ __all__ = ["Cast", "UnaryFunction", "Len"] +def _is_int_or_float(dtype: plc.DataType) -> bool: + return is_integral_not_bool(dtype) or is_floating_point(dtype) + + class Cast(Expr): """Class representing a cast of an expression.""" @@ -38,12 +43,9 @@ def __init__(self, dtype: plc.DataType, value: Expr) -> None: or value.dtype.id() == plc.TypeId.STRING ): if not ( - ( - self.dtype.id() == plc.TypeId.STRING - and value.dtype.id() in {plc.TypeId.FLOAT32, plc.TypeId.FLOAT64} - ) + (self.dtype.id() == plc.TypeId.STRING and _is_int_or_float(value.dtype)) or ( - self.dtype.id() in {plc.TypeId.FLOAT32, plc.TypeId.FLOAT64} + _is_int_or_float(self.dtype) and value.dtype.id() == plc.TypeId.STRING ) ): @@ -67,9 +69,22 @@ def do_evaluate( self.dtype.id() == plc.TypeId.STRING or column.obj.type().id() == plc.TypeId.STRING ): - result = plc.strings.convert.convert_floats.to_floats( - column.obj, self.dtype - ) + if self.dtype.id() == plc.TypeId.STRING: + if is_floating_point(column.obj.type()): + result = plc.strings.convert.convert_floats.from_floats(column.obj) + else: + result = plc.strings.convert.convert_integers.from_integers( + column.obj + ) + else: + if is_floating_point(self.dtype): + result = plc.strings.convert.convert_floats.to_floats( + column.obj, self.dtype + ) + else: + result = plc.strings.convert.convert_integers.to_integers( + column.obj, self.dtype + ) else: result = plc.unary.cast(column.obj, self.dtype) return Column(result).sorted_like(column) diff --git a/python/cudf_polars/tests/expressions/test_stringfunction.py b/python/cudf_polars/tests/expressions/test_stringfunction.py index adf89f53d8d..f6c64ec02a7 100644 --- a/python/cudf_polars/tests/expressions/test_stringfunction.py +++ b/python/cudf_polars/tests/expressions/test_stringfunction.py @@ -62,6 +62,72 @@ def str_to_float_data(with_nulls): return pl.LazyFrame({"a": a}) +@pytest.fixture +def str_to_integer_data(with_nulls): + a = [ + "1", + "2", + "3", + "4", + "5", + "6", + "7", + "8", + "9", + "10", + "11", + "12", + ] + if with_nulls: + a[4] = None + a[-3] = None + return pl.LazyFrame({"a": a}) + + +@pytest.fixture +def str_from_float_data(with_nulls): + a = [ + 1.1, + 2.2, + 3.3, + 4.4, + 5.5, + 6.6, + 7.7, + 8.8, + 9.9, + 10.10, + 11.11, + 12.12, + ] + if with_nulls: + a[4] = None + a[-3] = None + return pl.LazyFrame({"a": a}) + + +@pytest.fixture +def str_from_integer_data(with_nulls): + a = [ + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + ] + if with_nulls: + a[4] = None + a[-3] = None + return pl.LazyFrame({"a": a}) + + slice_cases = [ (1, 3), (0, 3), @@ -361,6 +427,23 @@ def test_unsupported_regex_raises(pattern): assert_ir_translation_raises(q, NotImplementedError) -def test_string_to_float(str_to_float_data): - query = str_to_float_data.select(pl.col("a").cast(pl.Float64)) +@pytest.mark.parametrize("dtype", [pl.Float32, pl.Float64]) +def test_string_to_float(str_to_float_data, dtype): + query = str_to_float_data.select(pl.col("a").cast(dtype)) + assert_gpu_result_equal(query) + + +@pytest.mark.parametrize("dtype", [pl.Int8, pl.Int16, pl.Int32, pl.Int64]) +def test_string_to_integer(str_to_integer_data, dtype): + query = str_to_integer_data.select(pl.col("a").cast(dtype)) + assert_gpu_result_equal(query) + + +def test_string_from_float(str_from_float_data): + query = str_from_float_data.select(pl.col("a").cast(pl.String)) + assert_gpu_result_equal(query) + + +def test_string_from_integer(str_from_integer_data): + query = str_from_integer_data.select(pl.col("a").cast(pl.String)) assert_gpu_result_equal(query) From 209b9060baf20c5dca3613b988b390aa9e577df4 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 14 Oct 2024 08:58:35 -0700 Subject: [PATCH 04/19] refactor --- .../tests/expressions/test_stringfunction.py | 94 +++---------------- 1 file changed, 11 insertions(+), 83 deletions(-) diff --git a/python/cudf_polars/tests/expressions/test_stringfunction.py b/python/cudf_polars/tests/expressions/test_stringfunction.py index f6c64ec02a7..bf59dab08ab 100644 --- a/python/cudf_polars/tests/expressions/test_stringfunction.py +++ b/python/cudf_polars/tests/expressions/test_stringfunction.py @@ -40,74 +40,21 @@ def ldf(with_nulls): ) -@pytest.fixture -def str_to_float_data(with_nulls): - a = [ - "1.1", - "2.2", - "3.3", - "4.4", - "5.5", - "6.6", - "7.7", - "8.8", - "9.9", - "10.10", - "11.11", - "12.12", - ] - if with_nulls: - a[4] = None - a[-3] = None - return pl.LazyFrame({"a": a}) - - -@pytest.fixture -def str_to_integer_data(with_nulls): - a = [ - "1", - "2", - "3", - "4", - "5", - "6", - "7", - "8", - "9", - "10", - "11", - "12", - ] - if with_nulls: - a[4] = None - a[-3] = None - return pl.LazyFrame({"a": a}) +@pytest.fixture(params=[pl.Float32, pl.Float64, pl.Int8, pl.Int16, pl.Int32, pl.Int64]) +def numeric_type(request): + return request.param @pytest.fixture -def str_from_float_data(with_nulls): - a = [ - 1.1, - 2.2, - 3.3, - 4.4, - 5.5, - 6.6, - 7.7, - 8.8, - 9.9, - 10.10, - 11.11, - 12.12, - ] +def str_to_numeric_data(with_nulls): + a = ["1", "2", "3", "4", "5", "6"] if with_nulls: a[4] = None - a[-3] = None return pl.LazyFrame({"a": a}) @pytest.fixture -def str_from_integer_data(with_nulls): +def str_from_numeric_data(with_nulls, numeric_type): a = [ 1, 2, @@ -115,17 +62,10 @@ def str_from_integer_data(with_nulls): 4, 5, 6, - 7, - 8, - 9, - 10, - 11, - 12, ] if with_nulls: a[4] = None - a[-3] = None - return pl.LazyFrame({"a": a}) + return pl.LazyFrame({"a": pl.Series(a, dtype=numeric_type)}) slice_cases = [ @@ -427,23 +367,11 @@ def test_unsupported_regex_raises(pattern): assert_ir_translation_raises(q, NotImplementedError) -@pytest.mark.parametrize("dtype", [pl.Float32, pl.Float64]) -def test_string_to_float(str_to_float_data, dtype): - query = str_to_float_data.select(pl.col("a").cast(dtype)) - assert_gpu_result_equal(query) - - -@pytest.mark.parametrize("dtype", [pl.Int8, pl.Int16, pl.Int32, pl.Int64]) -def test_string_to_integer(str_to_integer_data, dtype): - query = str_to_integer_data.select(pl.col("a").cast(dtype)) - assert_gpu_result_equal(query) - - -def test_string_from_float(str_from_float_data): - query = str_from_float_data.select(pl.col("a").cast(pl.String)) +def test_string_to_numeric(str_to_numeric_data, numeric_type): + query = str_to_numeric_data.select(pl.col("a").cast(numeric_type)) assert_gpu_result_equal(query) -def test_string_from_integer(str_from_integer_data): - query = str_from_integer_data.select(pl.col("a").cast(pl.String)) +def test_string_from_numeric(str_from_numeric_data): + query = str_from_numeric_data.select(pl.col("a").cast(pl.String)) assert_gpu_result_equal(query) From c9199ec01359fcd97efbf506f6ac73e93fc504de Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 15 Oct 2024 11:35:30 -0700 Subject: [PATCH 05/19] address reviews --- .../cudf_polars/dsl/expressions/unary.py | 56 +++++++------------ .../cudf_polars/cudf_polars/utils/dtypes.py | 15 ++++- 2 files changed, 33 insertions(+), 38 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/unary.py b/python/cudf_polars/cudf_polars/dsl/expressions/unary.py index 8cf28801986..c9e52e5056f 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/unary.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/unary.py @@ -9,7 +9,7 @@ import pyarrow as pa import pylibcudf as plc -from pylibcudf.traits import is_floating_point, is_integral_not_bool +from pylibcudf.traits import is_floating_point from cudf_polars.containers import Column from cudf_polars.dsl.expressions.base import AggInfo, ExecutionContext, Expr @@ -24,10 +24,6 @@ __all__ = ["Cast", "UnaryFunction", "Len"] -def _is_int_or_float(dtype: plc.DataType) -> bool: - return is_integral_not_bool(dtype) or is_floating_point(dtype) - - class Cast(Expr): """Class representing a cast of an expression.""" @@ -38,21 +34,9 @@ class Cast(Expr): def __init__(self, dtype: plc.DataType, value: Expr) -> None: super().__init__(dtype) self.children = (value,) - if ( - self.dtype.id() == plc.TypeId.STRING - or value.dtype.id() == plc.TypeId.STRING - ): - if not ( - (self.dtype.id() == plc.TypeId.STRING and _is_int_or_float(value.dtype)) - or ( - _is_int_or_float(self.dtype) - and value.dtype.id() == plc.TypeId.STRING - ) - ): - raise NotImplementedError("Only string to float cast is supported") - elif not dtypes.can_cast(value.dtype, self.dtype): + if not dtypes.can_cast(value.dtype, self.dtype): raise NotImplementedError( - f"Can't cast {self.dtype.id().name} to {value.dtype.id().name}" + f"Can't cast {value.dtype.id().name} to {self.dtype.id().name}" ) def do_evaluate( @@ -69,26 +53,28 @@ def do_evaluate( self.dtype.id() == plc.TypeId.STRING or column.obj.type().id() == plc.TypeId.STRING ): - if self.dtype.id() == plc.TypeId.STRING: - if is_floating_point(column.obj.type()): - result = plc.strings.convert.convert_floats.from_floats(column.obj) - else: - result = plc.strings.convert.convert_integers.from_integers( - column.obj - ) - else: - if is_floating_point(self.dtype): - result = plc.strings.convert.convert_floats.to_floats( - column.obj, self.dtype - ) - else: - result = plc.strings.convert.convert_integers.to_integers( - column.obj, self.dtype - ) + result = self._handle_string_cast(column) else: result = plc.unary.cast(column.obj, self.dtype) return Column(result).sorted_like(column) + def _handle_string_cast(self, column: Column) -> plc.Column: + if self.dtype.id() == plc.TypeId.STRING: + if is_floating_point(column.obj.type()): + result = plc.strings.convert.convert_floats.from_floats(column.obj) + else: + result = plc.strings.convert.convert_integers.from_integers(column.obj) + else: + if is_floating_point(self.dtype): + result = plc.strings.convert.convert_floats.to_floats( + column.obj, self.dtype + ) + else: + result = plc.strings.convert.convert_integers.to_integers( + column.obj, self.dtype + ) + return result + def collect_agg(self, *, depth: int) -> AggInfo: """Collect information about aggregations in groupbys.""" # TODO: Could do with sort-based groupby and segmented filter diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py index 4154a404e98..fb7ed0aaf2b 100644 --- a/python/cudf_polars/cudf_polars/utils/dtypes.py +++ b/python/cudf_polars/cudf_polars/utils/dtypes.py @@ -9,6 +9,7 @@ import pyarrow as pa import pylibcudf as plc +from pylibcudf.traits import is_floating_point, is_integral_not_bool from typing_extensions import assert_never import polars as pl @@ -45,6 +46,10 @@ def downcast_arrow_lists(typ: pa.DataType) -> pa.DataType: return typ +def _is_int_or_float(dtype: plc.DataType) -> bool: + return is_integral_not_bool(dtype) or is_floating_point(dtype) + + def can_cast(from_: plc.DataType, to: plc.DataType) -> bool: """ Can we cast (via :func:`~.pylibcudf.unary.cast`) between two datatypes. @@ -61,9 +66,13 @@ def can_cast(from_: plc.DataType, to: plc.DataType) -> bool: True if casting is supported, False otherwise """ return ( - plc.traits.is_fixed_width(to) - and plc.traits.is_fixed_width(from_) - and plc.unary.is_supported_cast(from_, to) + ( + plc.traits.is_fixed_width(to) + and plc.traits.is_fixed_width(from_) + and plc.unary.is_supported_cast(from_, to) + ) + or (from_.id() == plc.TypeId.STRING and _is_int_or_float(to)) + or (_is_int_or_float(from_) and to.id() == plc.TypeId.STRING) ) From 7feb1f39668319d71daecd990af4cde11408028e Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 16 Oct 2024 11:10:27 -0700 Subject: [PATCH 06/19] update tests --- python/cudf_polars/tests/expressions/test_casting.py | 2 +- .../tests/expressions/test_numeric_binops.py | 10 ---------- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/python/cudf_polars/tests/expressions/test_casting.py b/python/cudf_polars/tests/expressions/test_casting.py index 3e003054338..0722a0f198a 100644 --- a/python/cudf_polars/tests/expressions/test_casting.py +++ b/python/cudf_polars/tests/expressions/test_casting.py @@ -14,7 +14,7 @@ _supported_dtypes = [(pl.Int8(), pl.Int64())] _unsupported_dtypes = [ - (pl.String(), pl.Int64()), + (pl.Datetime("ns"), pl.Int64()), ] diff --git a/python/cudf_polars/tests/expressions/test_numeric_binops.py b/python/cudf_polars/tests/expressions/test_numeric_binops.py index 8f68bbc460c..fa1ec3c19e4 100644 --- a/python/cudf_polars/tests/expressions/test_numeric_binops.py +++ b/python/cudf_polars/tests/expressions/test_numeric_binops.py @@ -8,7 +8,6 @@ from cudf_polars.testing.asserts import ( assert_gpu_result_equal, - assert_ir_translation_raises, ) dtypes = [ @@ -114,12 +113,3 @@ def test_binop_with_scalar(left_scalar, right_scalar): q = df.select(lop / rop) assert_gpu_result_equal(q) - - -def test_numeric_to_string_cast_fails(): - df = pl.DataFrame( - {"a": [1, 1, 2, 3, 3, 4, 1], "b": [None, 2, 3, 4, 5, 6, 7]} - ).lazy() - q = df.select(pl.col("a").cast(pl.String)) - - assert_ir_translation_raises(q, NotImplementedError) From 6d699ac5d09366e99e9bb3b7893c636e1216b033 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 16 Oct 2024 14:40:28 -0700 Subject: [PATCH 07/19] handle runtime conversion failure --- .../cudf_polars/dsl/expressions/unary.py | 38 +++++++++++++++---- .../tests/expressions/test_stringfunction.py | 10 +++++ 2 files changed, 40 insertions(+), 8 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/unary.py b/python/cudf_polars/cudf_polars/dsl/expressions/unary.py index c9e52e5056f..e6a93e3f656 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/unary.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/unary.py @@ -9,8 +9,16 @@ import pyarrow as pa import pylibcudf as plc +from pylibcudf.strings.convert.convert_floats import from_floats, is_float, to_floats +from pylibcudf.strings.convert.convert_integers import ( + from_integers, + is_integer, + to_integers, +) from pylibcudf.traits import is_floating_point +from polars.exceptions import InvalidOperationError + from cudf_polars.containers import Column from cudf_polars.dsl.expressions.base import AggInfo, ExecutionContext, Expr from cudf_polars.dsl.expressions.literal import Literal @@ -61,18 +69,32 @@ def do_evaluate( def _handle_string_cast(self, column: Column) -> plc.Column: if self.dtype.id() == plc.TypeId.STRING: if is_floating_point(column.obj.type()): - result = plc.strings.convert.convert_floats.from_floats(column.obj) + result = from_floats(column.obj) else: - result = plc.strings.convert.convert_integers.from_integers(column.obj) + result = from_integers(column.obj) else: if is_floating_point(self.dtype): - result = plc.strings.convert.convert_floats.to_floats( - column.obj, self.dtype - ) + floats = is_float(column.obj) + if not plc.interop.to_arrow( + plc.reduce.reduce( + floats, + plc.aggregation.all(), + plc.DataType(plc.TypeId.BOOL8), + ) + ).as_py(): + raise InvalidOperationError("Conversion from `str` failed.") + result = to_floats(column.obj, self.dtype) else: - result = plc.strings.convert.convert_integers.to_integers( - column.obj, self.dtype - ) + integers = is_integer(column.obj) + if not plc.interop.to_arrow( + plc.reduce.reduce( + integers, + plc.aggregation.all(), + plc.DataType(plc.TypeId.BOOL8), + ) + ).as_py(): + raise InvalidOperationError("Conversion from `str` failed.") + result = to_integers(column.obj, self.dtype) return result def collect_agg(self, *, depth: int) -> AggInfo: diff --git a/python/cudf_polars/tests/expressions/test_stringfunction.py b/python/cudf_polars/tests/expressions/test_stringfunction.py index bf59dab08ab..d1655e5e466 100644 --- a/python/cudf_polars/tests/expressions/test_stringfunction.py +++ b/python/cudf_polars/tests/expressions/test_stringfunction.py @@ -375,3 +375,13 @@ def test_string_to_numeric(str_to_numeric_data, numeric_type): def test_string_from_numeric(str_from_numeric_data): query = str_from_numeric_data.select(pl.col("a").cast(pl.String)) assert_gpu_result_equal(query) + + +def test_string_to_numeric_invalid(numeric_type): + df = pl.LazyFrame({"a": ["a", "b", "c"]}) + q = df.select(pl.col("a").cast(numeric_type)) + assert_collect_raises( + q, + polars_except=pl.exceptions.InvalidOperationError, + cudf_except=pl.exceptions.ComputeError, + ) From 735c9e319650243708eb660d62adca9cccec997f Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 24 Oct 2024 13:46:11 -0700 Subject: [PATCH 08/19] moving things --- .../cudf_polars/containers/column.py | 52 +++++++++++++++++-- .../cudf_polars/dsl/expressions/unary.py | 49 +---------------- 2 files changed, 48 insertions(+), 53 deletions(-) diff --git a/python/cudf_polars/cudf_polars/containers/column.py b/python/cudf_polars/cudf_polars/containers/column.py index 00186098e54..a30604abddf 100644 --- a/python/cudf_polars/cudf_polars/containers/column.py +++ b/python/cudf_polars/cudf_polars/containers/column.py @@ -9,6 +9,15 @@ from typing import TYPE_CHECKING import pylibcudf as plc +from pylibcudf.strings.convert.convert_floats import from_floats, is_float, to_floats +from pylibcudf.strings.convert.convert_integers import ( + from_integers, + is_integer, + to_integers, +) +from pylibcudf.traits import is_floating_point + +from polars.exceptions import InvalidOperationError if TYPE_CHECKING: from typing_extensions import Self @@ -129,11 +138,44 @@ def astype(self, dtype: plc.DataType) -> Column: This only produces a copy if the requested dtype doesn't match the current one. """ - if self.obj.type() != dtype: - return Column(plc.unary.cast(self.obj, dtype), name=self.name).sorted_like( - self - ) - return self + if self.obj.type() == dtype: + return self + + if dtype.id() == plc.TypeId.STRING or self.obj.type().id() == plc.TypeId.STRING: + result = self._handle_string_cast(dtype) + else: + result = plc.unary.cast(self.obj, dtype) + return Column(result).sorted_like(self) + + def _handle_string_cast(self, dtype: plc.DataType) -> plc.Column: + if dtype.id() == plc.TypeId.STRING: + if is_floating_point(self.obj.type()): + return from_floats(self.obj) + else: + return from_integers(self.obj) + else: + if is_floating_point(dtype): + floats = is_float(self.obj) + if not plc.interop.to_arrow( + plc.reduce.reduce( + floats, + plc.aggregation.all(), + plc.DataType(plc.TypeId.BOOL8), + ) + ).as_py(): + raise InvalidOperationError("Conversion from `str` failed.") + return to_floats(self.obj, dtype) + else: + integers = is_integer(self.obj) + if not plc.interop.to_arrow( + plc.reduce.reduce( + integers, + plc.aggregation.all(), + plc.DataType(plc.TypeId.BOOL8), + ) + ).as_py(): + raise InvalidOperationError("Conversion from `str` failed.") + return to_integers(self.obj, dtype) def copy_metadata(self, from_: pl.Series, /) -> Self: """ diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/unary.py b/python/cudf_polars/cudf_polars/dsl/expressions/unary.py index 2e0ab12eeb3..5d9f50a916d 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/unary.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/unary.py @@ -9,15 +9,6 @@ import pyarrow as pa import pylibcudf as plc -from pylibcudf.strings.convert.convert_floats import from_floats, is_float, to_floats -from pylibcudf.strings.convert.convert_integers import ( - from_integers, - is_integer, - to_integers, -) -from pylibcudf.traits import is_floating_point - -from polars.exceptions import InvalidOperationError from cudf_polars.containers import Column from cudf_polars.dsl.expressions.base import AggInfo, ExecutionContext, Expr @@ -56,45 +47,7 @@ def do_evaluate( """Evaluate this expression given a dataframe for context.""" (child,) = self.children column = child.evaluate(df, context=context, mapping=mapping) - if ( - self.dtype.id() == plc.TypeId.STRING - or column.obj.type().id() == plc.TypeId.STRING - ): - result = self._handle_string_cast(column) - else: - result = plc.unary.cast(column.obj, self.dtype) - return Column(result).sorted_like(column) - - def _handle_string_cast(self, column: Column) -> plc.Column: - if self.dtype.id() == plc.TypeId.STRING: - if is_floating_point(column.obj.type()): - result = from_floats(column.obj) - else: - result = from_integers(column.obj) - else: - if is_floating_point(self.dtype): - floats = is_float(column.obj) - if not plc.interop.to_arrow( - plc.reduce.reduce( - floats, - plc.aggregation.all(), - plc.DataType(plc.TypeId.BOOL8), - ) - ).as_py(): - raise InvalidOperationError("Conversion from `str` failed.") - result = to_floats(column.obj, self.dtype) - else: - integers = is_integer(column.obj) - if not plc.interop.to_arrow( - plc.reduce.reduce( - integers, - plc.aggregation.all(), - plc.DataType(plc.TypeId.BOOL8), - ) - ).as_py(): - raise InvalidOperationError("Conversion from `str` failed.") - result = to_integers(column.obj, self.dtype) - return result + return column.astype(self.dtype) def collect_agg(self, *, depth: int) -> AggInfo: """Collect information about aggregations in groupbys.""" From 9f2cc181f0b24b3a0c6622cb7e1e63f09c90c182 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Fri, 25 Oct 2024 09:18:28 -0700 Subject: [PATCH 09/19] implement and use is_numeric_not_bool --- cpp/include/cudf/utilities/traits.hpp | 24 +++++++++++++++++++ cpp/src/utilities/traits.cpp | 13 ++++++++++ .../cudf_polars/containers/column.py | 5 ++-- .../cudf_polars/cudf_polars/utils/dtypes.py | 10 +++----- .../pylibcudf/libcudf/utilities/traits.pxd | 1 + .../pylibcudf/pylibcudf/tests/test_traits.py | 5 ++++ python/pylibcudf/pylibcudf/traits.pyx | 6 +++++ 7 files changed, 54 insertions(+), 10 deletions(-) diff --git a/cpp/include/cudf/utilities/traits.hpp b/cpp/include/cudf/utilities/traits.hpp index 3f37ae02151..c37fa460ca1 100644 --- a/cpp/include/cudf/utilities/traits.hpp +++ b/cpp/include/cudf/utilities/traits.hpp @@ -303,6 +303,30 @@ constexpr inline bool is_integral_not_bool() */ bool is_integral_not_bool(data_type type); +/** + * @brief Indicates whether the type `T` is a numeric type but not bool type. + * + * @tparam T The type to verify + * @return true `T` is numeric but not bool + * @return false `T` is not numeric or is bool + */ +template +constexpr inline bool is_numeric_not_bool() +{ + return cudf::is_numeric() and not std::is_same_v; +} + +/** + * @brief Indicates whether `type` is a numeric `data_type` but not BOOL8 + * + * "Numeric" types are integral/floating point types such as `INT*` or `FLOAT*`. + * + * @param type The `data_type` to verify + * @return true `type` is numeric but not bool + * @return false `type` is not numeric or is bool + */ +bool is_numeric_not_bool(data_type type); + /** * @brief Indicates whether the type `T` is a floating point type. * diff --git a/cpp/src/utilities/traits.cpp b/cpp/src/utilities/traits.cpp index a68dc84e340..d65e46e3c80 100644 --- a/cpp/src/utilities/traits.cpp +++ b/cpp/src/utilities/traits.cpp @@ -171,6 +171,19 @@ bool is_integral_not_bool(data_type type) return cudf::type_dispatcher(type, is_integral_not_bool_impl{}); } +struct is_numeric_not_bool_impl { + template + constexpr bool operator()() + { + return is_numeric_not_bool(); + } +}; + +bool is_numeric_not_bool(data_type type) +{ + return cudf::type_dispatcher(type, is_numeric_not_bool_impl{}); +} + struct is_floating_point_impl { template constexpr bool operator()() diff --git a/python/cudf_polars/cudf_polars/containers/column.py b/python/cudf_polars/cudf_polars/containers/column.py index a30604abddf..e04c7ecf07f 100644 --- a/python/cudf_polars/cudf_polars/containers/column.py +++ b/python/cudf_polars/cudf_polars/containers/column.py @@ -142,10 +142,9 @@ def astype(self, dtype: plc.DataType) -> Column: return self if dtype.id() == plc.TypeId.STRING or self.obj.type().id() == plc.TypeId.STRING: - result = self._handle_string_cast(dtype) + return Column(self._handle_string_cast(dtype)) else: - result = plc.unary.cast(self.obj, dtype) - return Column(result).sorted_like(self) + return Column(plc.unary.cast(self.obj, dtype)).sorted_like(self) def _handle_string_cast(self, dtype: plc.DataType) -> plc.Column: if dtype.id() == plc.TypeId.STRING: diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py index fb7ed0aaf2b..52f4951dabc 100644 --- a/python/cudf_polars/cudf_polars/utils/dtypes.py +++ b/python/cudf_polars/cudf_polars/utils/dtypes.py @@ -9,7 +9,7 @@ import pyarrow as pa import pylibcudf as plc -from pylibcudf.traits import is_floating_point, is_integral_not_bool +from pylibcudf.traits import is_numeric_not_bool from typing_extensions import assert_never import polars as pl @@ -46,10 +46,6 @@ def downcast_arrow_lists(typ: pa.DataType) -> pa.DataType: return typ -def _is_int_or_float(dtype: plc.DataType) -> bool: - return is_integral_not_bool(dtype) or is_floating_point(dtype) - - def can_cast(from_: plc.DataType, to: plc.DataType) -> bool: """ Can we cast (via :func:`~.pylibcudf.unary.cast`) between two datatypes. @@ -71,8 +67,8 @@ def can_cast(from_: plc.DataType, to: plc.DataType) -> bool: and plc.traits.is_fixed_width(from_) and plc.unary.is_supported_cast(from_, to) ) - or (from_.id() == plc.TypeId.STRING and _is_int_or_float(to)) - or (_is_int_or_float(from_) and to.id() == plc.TypeId.STRING) + or (from_.id() == plc.TypeId.STRING and is_numeric_not_bool(to)) + or (is_numeric_not_bool(from_) and to.id() == plc.TypeId.STRING) ) diff --git a/python/pylibcudf/pylibcudf/libcudf/utilities/traits.pxd b/python/pylibcudf/pylibcudf/libcudf/utilities/traits.pxd index 69765e44274..5533530754e 100644 --- a/python/pylibcudf/pylibcudf/libcudf/utilities/traits.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/utilities/traits.pxd @@ -9,6 +9,7 @@ cdef extern from "cudf/utilities/traits.hpp" namespace "cudf" nogil: cdef bool is_relationally_comparable(data_type) cdef bool is_equality_comparable(data_type) cdef bool is_numeric(data_type) + cdef bool is_numeric_not_bool(data_type) cdef bool is_index_type(data_type) cdef bool is_unsigned(data_type) cdef bool is_integral(data_type) diff --git a/python/pylibcudf/pylibcudf/tests/test_traits.py b/python/pylibcudf/pylibcudf/tests/test_traits.py index 2570e8abd51..2c1708304eb 100644 --- a/python/pylibcudf/pylibcudf/tests/test_traits.py +++ b/python/pylibcudf/pylibcudf/tests/test_traits.py @@ -20,6 +20,11 @@ def test_is_numeric(): assert not plc.traits.is_numeric(plc.DataType(plc.TypeId.LIST)) +def test_is_numeric_not_bool(): + assert plc.traits.is_numeric_not_bool(plc.DataType(plc.TypeId.FLOAT64)) + assert not plc.traits.is_numeric_not_bool(plc.DataType(plc.TypeId.BOOL8)) + + def test_is_index_type(): assert plc.traits.is_index_type(plc.DataType(plc.TypeId.INT8)) assert not plc.traits.is_index_type(plc.DataType(plc.TypeId.BOOL8)) diff --git a/python/pylibcudf/pylibcudf/traits.pyx b/python/pylibcudf/pylibcudf/traits.pyx index 5a1c67e1f6c..9c52e0ac1ab 100644 --- a/python/pylibcudf/pylibcudf/traits.pyx +++ b/python/pylibcudf/pylibcudf/traits.pyx @@ -29,6 +29,12 @@ cpdef bool is_numeric(DataType typ): """ return traits.is_numeric(typ.c_obj) +cpdef bool is_numeric_not_bool(DataType typ): + """Checks if the given data type is numeric excluding booleans. + + For details, see :cpp:func:`is_numeric_not_bool`. + """ + return traits.is_numeric_not_bool(typ.c_obj) cpdef bool is_index_type(DataType typ): """Checks if the given data type is an index type. From 9ecac4142b7821efba6dac127d30fc46829a620a Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Fri, 25 Oct 2024 10:03:18 -0700 Subject: [PATCH 10/19] update tests --- .../cudf_polars/cudf_polars/utils/dtypes.py | 2 +- .../tests/expressions/test_stringfunction.py | 76 +++++++++++++++++-- 2 files changed, 69 insertions(+), 9 deletions(-) diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py index 52f4951dabc..491d57584b6 100644 --- a/python/cudf_polars/cudf_polars/utils/dtypes.py +++ b/python/cudf_polars/cudf_polars/utils/dtypes.py @@ -68,7 +68,7 @@ def can_cast(from_: plc.DataType, to: plc.DataType) -> bool: and plc.unary.is_supported_cast(from_, to) ) or (from_.id() == plc.TypeId.STRING and is_numeric_not_bool(to)) - or (is_numeric_not_bool(from_) and to.id() == plc.TypeId.STRING) + or (to.id() == plc.TypeId.STRING and is_numeric_not_bool(from_)) ) diff --git a/python/cudf_polars/tests/expressions/test_stringfunction.py b/python/cudf_polars/tests/expressions/test_stringfunction.py index d1655e5e466..641608ffb0e 100644 --- a/python/cudf_polars/tests/expressions/test_stringfunction.py +++ b/python/cudf_polars/tests/expressions/test_stringfunction.py @@ -40,13 +40,23 @@ def ldf(with_nulls): ) -@pytest.fixture(params=[pl.Float32, pl.Float64, pl.Int8, pl.Int16, pl.Int32, pl.Int64]) +@pytest.fixture(params=[pl.Int8, pl.Int16, pl.Int32, pl.Int64]) +def integer_type(request): + return request.param + + +@pytest.fixture(params=[pl.Float32, pl.Float64]) +def floating_type(request): + return request.param + + +@pytest.fixture(params=[pl.Int8, pl.Int16, pl.Int32, pl.Int64, pl.Float32, pl.Float64]) def numeric_type(request): return request.param @pytest.fixture -def str_to_numeric_data(with_nulls): +def str_to_integer_data(with_nulls): a = ["1", "2", "3", "4", "5", "6"] if with_nulls: a[4] = None @@ -54,7 +64,28 @@ def str_to_numeric_data(with_nulls): @pytest.fixture -def str_from_numeric_data(with_nulls, numeric_type): +def str_to_float_data(with_nulls): + a = [ + "1.1", + "2.2", + "3.3", + "4.4", + "5.5", + "6.6", + "inf", + "+inf", + "-inf", + "nan", + "-1.234", + "2e2", + ] + if with_nulls: + a[4] = None + return pl.LazyFrame({"a": a}) + + +@pytest.fixture +def str_from_integer_data(with_nulls, integer_type): a = [ 1, 2, @@ -65,7 +96,26 @@ def str_from_numeric_data(with_nulls, numeric_type): ] if with_nulls: a[4] = None - return pl.LazyFrame({"a": pl.Series(a, dtype=numeric_type)}) + return pl.LazyFrame({"a": pl.Series(a, dtype=integer_type)}) + + +@pytest.fixture +def str_from_float_data(with_nulls, floating_type): + a = [ + 1.1, + 2.2, + 3.3, + 4.4, + 5.5, + 6.6, + float("inf"), + float("+inf"), + float("-inf"), + float("nan"), + ] + if with_nulls: + a[4] = None + return pl.LazyFrame({"a": pl.Series(a, dtype=floating_type)}) slice_cases = [ @@ -367,13 +417,23 @@ def test_unsupported_regex_raises(pattern): assert_ir_translation_raises(q, NotImplementedError) -def test_string_to_numeric(str_to_numeric_data, numeric_type): - query = str_to_numeric_data.select(pl.col("a").cast(numeric_type)) +def test_string_to_integer(str_to_integer_data, integer_type): + query = str_to_integer_data.select(pl.col("a").cast(integer_type)) + assert_gpu_result_equal(query) + + +def test_string_from_integer(str_from_integer_data): + query = str_from_integer_data.select(pl.col("a").cast(pl.String)) + assert_gpu_result_equal(query) + + +def test_string_to_float(str_to_float_data, floating_type): + query = str_to_float_data.select(pl.col("a").cast(floating_type)) assert_gpu_result_equal(query) -def test_string_from_numeric(str_from_numeric_data): - query = str_from_numeric_data.select(pl.col("a").cast(pl.String)) +def test_string_from_float(str_from_float_data): + query = str_from_float_data.select(pl.col("a").cast(pl.String)) assert_gpu_result_equal(query) From cd800833f443dac342eecabf9bd339ca9f279ad4 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 28 Oct 2024 07:41:57 -0700 Subject: [PATCH 11/19] test, implement, and use is_order_preserving_cast --- .../cudf_polars/containers/column.py | 7 ++- .../cudf_polars/cudf_polars/utils/dtypes.py | 54 ++++++++++++++++++- python/cudf_polars/tests/utils/test_dtypes.py | 28 +++++++++- 3 files changed, 85 insertions(+), 4 deletions(-) diff --git a/python/cudf_polars/cudf_polars/containers/column.py b/python/cudf_polars/cudf_polars/containers/column.py index e04c7ecf07f..a472daf53ca 100644 --- a/python/cudf_polars/cudf_polars/containers/column.py +++ b/python/cudf_polars/cudf_polars/containers/column.py @@ -19,6 +19,8 @@ from polars.exceptions import InvalidOperationError +from cudf_polars.utils.dtypes import is_order_preserving_cast + if TYPE_CHECKING: from typing_extensions import Self @@ -144,7 +146,10 @@ def astype(self, dtype: plc.DataType) -> Column: if dtype.id() == plc.TypeId.STRING or self.obj.type().id() == plc.TypeId.STRING: return Column(self._handle_string_cast(dtype)) else: - return Column(plc.unary.cast(self.obj, dtype)).sorted_like(self) + result = Column(plc.unary.cast(self.obj, dtype)) + if is_order_preserving_cast(self.obj.type(), dtype): + return result.sorted_like(self) + return result def _handle_string_cast(self, dtype: plc.DataType) -> plc.Column: if dtype.id() == plc.TypeId.STRING: diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py index 491d57584b6..75b76e8e25e 100644 --- a/python/cudf_polars/cudf_polars/utils/dtypes.py +++ b/python/cudf_polars/cudf_polars/utils/dtypes.py @@ -9,12 +9,21 @@ import pyarrow as pa import pylibcudf as plc -from pylibcudf.traits import is_numeric_not_bool +from pylibcudf.traits import ( + is_floating_point, + is_integral_not_bool, + is_numeric_not_bool, +) from typing_extensions import assert_never import polars as pl -__all__ = ["from_polars", "downcast_arrow_lists", "can_cast"] +__all__ = [ + "from_polars", + "downcast_arrow_lists", + "can_cast", + "is_order_preserving_cast", +] def downcast_arrow_lists(typ: pa.DataType) -> pa.DataType: @@ -72,6 +81,47 @@ def can_cast(from_: plc.DataType, to: plc.DataType) -> bool: ) +def is_order_preserving_cast(from_: plc.DataType, to: plc.DataType) -> bool: + """ + Determine if a cast would preserve the order of the source data. + + Parameters + ---------- + from_ + Source datatype + to + Target datatype + + Returns + ------- + True if the cast is order-preserving, False otherwise + """ + if from_.id() == to.id(): + return True + + if is_integral_not_bool(from_) and is_integral_not_bool(to): + # True if signedness is the same and the target is larger + if plc.traits.is_unsigned(from_) == plc.traits.is_unsigned(to): + if plc.types.size_of(to) >= plc.types.size_of(from_): + return True + elif (plc.traits.is_unsigned(from_) and not plc.traits.is_unsigned(to)) and ( + plc.types.size_of(to) > plc.types.size_of(from_) + ): + # Unsigned to signed is order preserving if target is large enough + # But signed to unsigned is never order preserving due to negative values + return True + elif is_floating_point(from_) and is_floating_point(to): + # True if the target is larger + if plc.types.size_of(to) > plc.types.size_of(from_): + return True + elif (is_integral_not_bool(from_) and is_floating_point(to)) and ( + plc.types.size_of(to) > plc.types.size_of(from_) + ): + # Int64 fits in float64, but not in float32 + return True + return False + + @cache def from_polars(dtype: pl.DataType) -> plc.DataType: """ diff --git a/python/cudf_polars/tests/utils/test_dtypes.py b/python/cudf_polars/tests/utils/test_dtypes.py index bbdb4faa256..5b510830329 100644 --- a/python/cudf_polars/tests/utils/test_dtypes.py +++ b/python/cudf_polars/tests/utils/test_dtypes.py @@ -3,11 +3,23 @@ from __future__ import annotations +import pylibcudf as plc import pytest import polars as pl -from cudf_polars.utils.dtypes import from_polars +from cudf_polars.utils.dtypes import from_polars, is_order_preserving_cast + +INT8 = plc.DataType(plc.TypeId.INT8) +INT16 = plc.DataType(plc.TypeId.INT16) +INT32 = plc.DataType(plc.TypeId.INT32) +INT64 = plc.DataType(plc.TypeId.INT64) +UINT8 = plc.DataType(plc.TypeId.UINT8) +UINT16 = plc.DataType(plc.TypeId.UINT16) +UINT32 = plc.DataType(plc.TypeId.UINT32) +UINT64 = plc.DataType(plc.TypeId.UINT64) +FLOAT32 = plc.DataType(plc.TypeId.FLOAT32) +FLOAT64 = plc.DataType(plc.TypeId.FLOAT64) @pytest.mark.parametrize( @@ -30,3 +42,17 @@ def test_unhandled_dtype_conversion_raises(pltype): with pytest.raises(NotImplementedError): _ = from_polars(pltype) + + +def test_is_order_preserving_cast(): + assert is_order_preserving_cast(INT8, INT8) # Same type + assert is_order_preserving_cast(INT8, INT16) # Smaller type + assert is_order_preserving_cast(INT8, FLOAT32) # Int to large enough float + assert is_order_preserving_cast(UINT8, UINT16) # Unsigned to larger unsigned + assert is_order_preserving_cast(UINT8, FLOAT32) # Unsigned to large enough float + assert is_order_preserving_cast(FLOAT32, FLOAT64) # Float to larger float + + assert not is_order_preserving_cast(INT16, INT8) # Bigger type + assert not is_order_preserving_cast(INT8, UINT8) # Different signedness + assert not is_order_preserving_cast(INT64, FLOAT32) # Int to undersized float + assert not is_order_preserving_cast(FLOAT32, INT32) # Float to undersized int From f98f635f6f905e8921f6c766aefbcc786841613f Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 28 Oct 2024 07:49:06 -0700 Subject: [PATCH 12/19] small fix --- .../cudf_polars/tests/expressions/test_stringfunction.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/python/cudf_polars/tests/expressions/test_stringfunction.py b/python/cudf_polars/tests/expressions/test_stringfunction.py index 641608ffb0e..d2a1c12a109 100644 --- a/python/cudf_polars/tests/expressions/test_stringfunction.py +++ b/python/cudf_polars/tests/expressions/test_stringfunction.py @@ -86,14 +86,7 @@ def str_to_float_data(with_nulls): @pytest.fixture def str_from_integer_data(with_nulls, integer_type): - a = [ - 1, - 2, - 3, - 4, - 5, - 6, - ] + a = [1, 2, 3, 4, 5, 6] if with_nulls: a[4] = None return pl.LazyFrame({"a": pl.Series(a, dtype=integer_type)}) From 7f75375073b3e2eb89fc517efaf432dc69eecf85 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 28 Oct 2024 14:22:35 -0700 Subject: [PATCH 13/19] pass test_string_from_float --- .../tests/expressions/test_stringfunction.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/python/cudf_polars/tests/expressions/test_stringfunction.py b/python/cudf_polars/tests/expressions/test_stringfunction.py index d2a1c12a109..8d7d970eb07 100644 --- a/python/cudf_polars/tests/expressions/test_stringfunction.py +++ b/python/cudf_polars/tests/expressions/test_stringfunction.py @@ -75,6 +75,8 @@ def str_to_float_data(with_nulls): "inf", "+inf", "-inf", + "Inf", + "-Inf", "nan", "-1.234", "2e2", @@ -425,8 +427,22 @@ def test_string_to_float(str_to_float_data, floating_type): assert_gpu_result_equal(query) -def test_string_from_float(str_from_float_data): +def test_string_from_float(request, str_from_float_data): + if str_from_float_data.collect_schema()["a"] == pl.Float32: + # libcudf will return a string representing the precision out to + # a certain number of hardcoded decimal places. This results in + # the fractional part being thrown away which causes discrepancies + # for certain numbers. For instance, the float32 representation of + # 1.1 is 1.100000023841858. When cast to a string, this will become + # 1.100000024. But the float64 representation of 1.1 is + # 1.1000000000000000888 which will result in libcudf truncating the + # final value to 1.1. + request.applymarker(pytest.mark.xfail(reason="libcudf truncation")) query = str_from_float_data.select(pl.col("a").cast(pl.String)) + + # libcudf reads float('inf') -> "inf" + # but polars reads float('inf') -> "Inf" + query = query.select(pl.col("a").str.to_lowercase()) assert_gpu_result_equal(query) From 9c9d395e84fade005839a6d4eb9a0d2d5b775e28 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 28 Oct 2024 15:14:49 -0700 Subject: [PATCH 14/19] add failing test to plugin xfail list --- python/cudf_polars/cudf_polars/testing/plugin.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py index a3607159e01..a611fdab9f8 100644 --- a/python/cudf_polars/cudf_polars/testing/plugin.py +++ b/python/cudf_polars/cudf_polars/testing/plugin.py @@ -158,6 +158,7 @@ def pytest_configure(config: pytest.Config): "tests/unit/sql/test_cast.py::test_cast_errors[values0-values::uint8-conversion from `f64` to `u64` failed]": "Casting that raises not supported on GPU", "tests/unit/sql/test_cast.py::test_cast_errors[values1-values::uint4-conversion from `i64` to `u32` failed]": "Casting that raises not supported on GPU", "tests/unit/sql/test_cast.py::test_cast_errors[values2-values::int1-conversion from `i64` to `i8` failed]": "Casting that raises not supported on GPU", + "tests/unit/sql/test_cast.py::test_cast_errors[values5-values::int4-conversion from `str` to `i32` failed]": "Cast raises, but error user recieves is wrong", "tests/unit/sql/test_miscellaneous.py::test_read_csv": "Incorrect handling of missing_is_null in read_csv", "tests/unit/sql/test_wildcard_opts.py::test_select_wildcard_errors": "Raises correctly but with different exception", "tests/unit/streaming/test_streaming_io.py::test_parquet_eq_statistics": "Debug output on stderr doesn't match", From 00bd36c771ad8b2b2dfae2ca969198a5f2642cf5 Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Tue, 29 Oct 2024 09:58:39 -0500 Subject: [PATCH 15/19] Update python/cudf_polars/cudf_polars/testing/plugin.py Co-authored-by: Lawrence Mitchell --- python/cudf_polars/cudf_polars/testing/plugin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py index a611fdab9f8..34492858062 100644 --- a/python/cudf_polars/cudf_polars/testing/plugin.py +++ b/python/cudf_polars/cudf_polars/testing/plugin.py @@ -158,7 +158,7 @@ def pytest_configure(config: pytest.Config): "tests/unit/sql/test_cast.py::test_cast_errors[values0-values::uint8-conversion from `f64` to `u64` failed]": "Casting that raises not supported on GPU", "tests/unit/sql/test_cast.py::test_cast_errors[values1-values::uint4-conversion from `i64` to `u32` failed]": "Casting that raises not supported on GPU", "tests/unit/sql/test_cast.py::test_cast_errors[values2-values::int1-conversion from `i64` to `i8` failed]": "Casting that raises not supported on GPU", - "tests/unit/sql/test_cast.py::test_cast_errors[values5-values::int4-conversion from `str` to `i32` failed]": "Cast raises, but error user recieves is wrong", + "tests/unit/sql/test_cast.py::test_cast_errors[values5-values::int4-conversion from `str` to `i32` failed]": "Cast raises, but error user receives is wrong", "tests/unit/sql/test_miscellaneous.py::test_read_csv": "Incorrect handling of missing_is_null in read_csv", "tests/unit/sql/test_wildcard_opts.py::test_select_wildcard_errors": "Raises correctly but with different exception", "tests/unit/streaming/test_streaming_io.py::test_parquet_eq_statistics": "Debug output on stderr doesn't match", From c06f984e0c2602750406e3e65ddbb2e9cfde3bd2 Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Tue, 29 Oct 2024 10:00:01 -0500 Subject: [PATCH 16/19] Update python/cudf_polars/cudf_polars/utils/dtypes.py Co-authored-by: Lawrence Mitchell --- python/cudf_polars/cudf_polars/utils/dtypes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py index 75b76e8e25e..99f07b47ebd 100644 --- a/python/cudf_polars/cudf_polars/utils/dtypes.py +++ b/python/cudf_polars/cudf_polars/utils/dtypes.py @@ -112,7 +112,7 @@ def is_order_preserving_cast(from_: plc.DataType, to: plc.DataType) -> bool: return True elif is_floating_point(from_) and is_floating_point(to): # True if the target is larger - if plc.types.size_of(to) > plc.types.size_of(from_): + if plc.types.size_of(to) >= plc.types.size_of(from_): return True elif (is_integral_not_bool(from_) and is_floating_point(to)) and ( plc.types.size_of(to) > plc.types.size_of(from_) From a68011a774650bf7a2e2a2d4258494363b8f8a15 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 29 Oct 2024 09:03:47 -0700 Subject: [PATCH 17/19] minor fixups --- python/cudf_polars/cudf_polars/utils/dtypes.py | 14 ++++++-------- python/cudf_polars/tests/utils/test_dtypes.py | 3 ++- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py index 99f07b47ebd..81b360eb9fc 100644 --- a/python/cudf_polars/cudf_polars/utils/dtypes.py +++ b/python/cudf_polars/cudf_polars/utils/dtypes.py @@ -110,16 +110,14 @@ def is_order_preserving_cast(from_: plc.DataType, to: plc.DataType) -> bool: # Unsigned to signed is order preserving if target is large enough # But signed to unsigned is never order preserving due to negative values return True - elif is_floating_point(from_) and is_floating_point(to): - # True if the target is larger - if plc.types.size_of(to) >= plc.types.size_of(from_): - return True - elif (is_integral_not_bool(from_) and is_floating_point(to)) and ( - plc.types.size_of(to) > plc.types.size_of(from_) + elif ( + is_floating_point(from_) + and is_floating_point(to) + and (plc.types.size_of(to) >= plc.types.size_of(from_)) ): - # Int64 fits in float64, but not in float32 + # True if the target is larger return True - return False + return is_integral_not_bool(from_) and is_floating_point(to) @cache diff --git a/python/cudf_polars/tests/utils/test_dtypes.py b/python/cudf_polars/tests/utils/test_dtypes.py index 5b510830329..f9629ba8fb5 100644 --- a/python/cudf_polars/tests/utils/test_dtypes.py +++ b/python/cudf_polars/tests/utils/test_dtypes.py @@ -51,8 +51,9 @@ def test_is_order_preserving_cast(): assert is_order_preserving_cast(UINT8, UINT16) # Unsigned to larger unsigned assert is_order_preserving_cast(UINT8, FLOAT32) # Unsigned to large enough float assert is_order_preserving_cast(FLOAT32, FLOAT64) # Float to larger float + assert is_order_preserving_cast(INT64, FLOAT32) # Int any float assert not is_order_preserving_cast(INT16, INT8) # Bigger type assert not is_order_preserving_cast(INT8, UINT8) # Different signedness - assert not is_order_preserving_cast(INT64, FLOAT32) # Int to undersized float assert not is_order_preserving_cast(FLOAT32, INT32) # Float to undersized int + assert not is_order_preserving_cast(FLOAT32, INT64) # float to large int From cf62714f582370863c940ac27fed82b5aa59af26 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 4 Nov 2024 16:32:18 -0800 Subject: [PATCH 18/19] allow float to int --- python/cudf_polars/cudf_polars/utils/dtypes.py | 4 +++- python/cudf_polars/tests/utils/test_dtypes.py | 8 +++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py index e7952699154..a90c283ee54 100644 --- a/python/cudf_polars/cudf_polars/utils/dtypes.py +++ b/python/cudf_polars/cudf_polars/utils/dtypes.py @@ -118,7 +118,9 @@ def is_order_preserving_cast(from_: plc.DataType, to: plc.DataType) -> bool: ): # True if the target is larger return True - return is_integral_not_bool(from_) and is_floating_point(to) + return (is_integral_not_bool(from_) and is_floating_point(to)) or ( + is_floating_point(from_) and is_integral_not_bool(to) + ) @cache diff --git a/python/cudf_polars/tests/utils/test_dtypes.py b/python/cudf_polars/tests/utils/test_dtypes.py index f9629ba8fb5..f63c2079e04 100644 --- a/python/cudf_polars/tests/utils/test_dtypes.py +++ b/python/cudf_polars/tests/utils/test_dtypes.py @@ -3,11 +3,12 @@ from __future__ import annotations -import pylibcudf as plc import pytest import polars as pl +import pylibcudf as plc + from cudf_polars.utils.dtypes import from_polars, is_order_preserving_cast INT8 = plc.DataType(plc.TypeId.INT8) @@ -52,8 +53,9 @@ def test_is_order_preserving_cast(): assert is_order_preserving_cast(UINT8, FLOAT32) # Unsigned to large enough float assert is_order_preserving_cast(FLOAT32, FLOAT64) # Float to larger float assert is_order_preserving_cast(INT64, FLOAT32) # Int any float + assert is_order_preserving_cast(FLOAT32, INT32) # Float to undersized int + assert is_order_preserving_cast(FLOAT32, INT64) # float to large int assert not is_order_preserving_cast(INT16, INT8) # Bigger type assert not is_order_preserving_cast(INT8, UINT8) # Different signedness - assert not is_order_preserving_cast(FLOAT32, INT32) # Float to undersized int - assert not is_order_preserving_cast(FLOAT32, INT64) # float to large int + assert not is_order_preserving_cast(FLOAT64, FLOAT32) # Smaller float From 0344f53fe2a8bfc14ef2d1f211c5fdbb63b846c2 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 5 Nov 2024 08:35:58 -0800 Subject: [PATCH 19/19] small fixes --- python/cudf_polars/cudf_polars/containers/column.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf_polars/cudf_polars/containers/column.py b/python/cudf_polars/cudf_polars/containers/column.py index a472daf53ca..93d95346a37 100644 --- a/python/cudf_polars/cudf_polars/containers/column.py +++ b/python/cudf_polars/cudf_polars/containers/column.py @@ -8,6 +8,8 @@ import functools from typing import TYPE_CHECKING +from polars.exceptions import InvalidOperationError + import pylibcudf as plc from pylibcudf.strings.convert.convert_floats import from_floats, is_float, to_floats from pylibcudf.strings.convert.convert_integers import ( @@ -17,8 +19,6 @@ ) from pylibcudf.traits import is_floating_point -from polars.exceptions import InvalidOperationError - from cudf_polars.utils.dtypes import is_order_preserving_cast if TYPE_CHECKING: