From 7d49f75df9681dbe1653029e7d508355884a6d86 Mon Sep 17 00:00:00 2001 From: Mike Wendt <1915404+mike-wendt@users.noreply.github.com> Date: Tue, 30 Mar 2021 10:46:20 -0400 Subject: [PATCH 01/14] Update conda recipes pinning of repo dependencies (#7743) Ensure all conda packages created in this repo that depend on other packages are all version pinned to the same build number. This way it prevents a conda solve from picking mismatched versions of `cudf` and `libcudf` among others that can break this repo and others. Authors: - Mike Wendt (@mike-wendt) Approvers: - Ray Douglass (@raydouglass) URL: https://github.com/rapidsai/cudf/pull/7743 --- conda/recipes/cudf/meta.yaml | 2 +- conda/recipes/cudf_kafka/meta.yaml | 8 ++++---- conda/recipes/custreamz/meta.yaml | 8 ++++---- conda/recipes/dask-cudf/meta.yaml | 6 +++--- conda/recipes/libcudf_kafka/meta.yaml | 2 +- 5 files changed, 13 insertions(+), 13 deletions(-) diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index 5635f54ba20..a119040bbcf 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -28,7 +28,7 @@ requirements: - numba >=0.49.0 - dlpack - pyarrow 1.0.1 - - libcudf {{ version }} + - libcudf {{ version }}=*_{{ GIT_DESCRIBE_NUMBER }} - rmm {{ minor_version }} - cudatoolkit {{ cuda_version }} run: diff --git a/conda/recipes/cudf_kafka/meta.yaml b/conda/recipes/cudf_kafka/meta.yaml index 0acd9ec4bb2..cc3f30091bf 100644 --- a/conda/recipes/cudf_kafka/meta.yaml +++ b/conda/recipes/cudf_kafka/meta.yaml @@ -29,12 +29,12 @@ requirements: - python - cython >=0.29,<0.30 - setuptools - - cudf {{ version }} - - libcudf_kafka {{ version }} + - cudf {{ version }}=*_{{ GIT_DESCRIBE_NUMBER }} + - libcudf_kafka {{ version }}=*_{{ GIT_DESCRIBE_NUMBER }} run: - - libcudf_kafka {{ version }} + - libcudf_kafka {{ version }}=*_{{ GIT_DESCRIBE_NUMBER }} - python-confluent-kafka - - cudf {{ version }} + - cudf {{ version }}=*_{{ GIT_DESCRIBE_NUMBER }} test: requires: diff --git a/conda/recipes/custreamz/meta.yaml b/conda/recipes/custreamz/meta.yaml index ffda6d0c3c6..8edca7a51d0 100644 --- a/conda/recipes/custreamz/meta.yaml +++ b/conda/recipes/custreamz/meta.yaml @@ -23,15 +23,15 @@ requirements: host: - python - python-confluent-kafka - - cudf_kafka {{ version }} + - cudf_kafka {{ version }}=*_{{ GIT_DESCRIBE_NUMBER }} run: - python - - streamz - - cudf {{ version }} + - streamz + - cudf {{ version }}=*_{{ GIT_DESCRIBE_NUMBER }} - dask >=2.22.0 - distributed >=2.22.0 - python-confluent-kafka - - cudf_kafka {{ version }} + - cudf_kafka {{ version }}=*_{{ GIT_DESCRIBE_NUMBER }} test: requires: diff --git a/conda/recipes/dask-cudf/meta.yaml b/conda/recipes/dask-cudf/meta.yaml index 170075743bd..04992f8e481 100644 --- a/conda/recipes/dask-cudf/meta.yaml +++ b/conda/recipes/dask-cudf/meta.yaml @@ -22,15 +22,15 @@ build: requirements: host: - python - - cudf {{ version }} + - cudf {{ version }}=*_{{ GIT_DESCRIBE_NUMBER }} - dask >=2.22.0 - distributed >=2.22.0 run: - python - - cudf {{ version }} + - cudf {{ version }}=*_{{ GIT_DESCRIBE_NUMBER }} - dask >=2.22.0 - distributed >=2.22.0 - + test: requires: - cudatoolkit {{ cuda_version }}.* diff --git a/conda/recipes/libcudf_kafka/meta.yaml b/conda/recipes/libcudf_kafka/meta.yaml index 5348ec471e9..81ff922b8d7 100644 --- a/conda/recipes/libcudf_kafka/meta.yaml +++ b/conda/recipes/libcudf_kafka/meta.yaml @@ -25,7 +25,7 @@ requirements: build: - cmake >=3.17.0 host: - - libcudf {{ version }} + - libcudf {{ version }}=*_{{ GIT_DESCRIBE_NUMBER }} - librdkafka 
>=1.5.0,<1.5.3 run: - {{ pin_compatible('librdkafka', max_pin='x.x') }} #TODO: librdkafka should be automatically included here by run_exports but is not From ad9212b443167166b4ee83277a117c863506c6be Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Tue, 30 Mar 2021 15:11:28 -0500 Subject: [PATCH 02/14] Enable dask dispatch to cuDF's `is_categorical_dtype` for cuDF objects (#7740) Fixes https://github.com/rapidsai/cudf/issues/7111 Closes https://github.com/rapidsai/cudf/issues/7400 Authors: - @brandon-b-miller - Keith Kraus (@kkraus14) Approvers: - Keith Kraus (@kkraus14) - GALI PREM SAGAR (@galipremsagar) - Ray Douglass (@raydouglass) URL: https://github.com/rapidsai/cudf/pull/7740 --- conda/environments/cudf_dev_cuda10.1.yml | 2 +- conda/environments/cudf_dev_cuda10.2.yml | 2 +- conda/environments/cudf_dev_cuda11.0.yml | 2 +- conda/recipes/dask-cudf/meta.yaml | 4 ++-- python/dask_cudf/dask_cudf/backends.py | 13 ++++++++++- .../dask_cudf/tests/test_dispatch.py | 16 +++++++++++++ .../dask_cudf/dask_cudf/tests/test_onehot.py | 23 +++++++++++++++++-- 7 files changed, 54 insertions(+), 8 deletions(-) create mode 100644 python/dask_cudf/dask_cudf/tests/test_dispatch.py diff --git a/conda/environments/cudf_dev_cuda10.1.yml b/conda/environments/cudf_dev_cuda10.1.yml index 35108ddd8ca..fa0b1126190 100644 --- a/conda/environments/cudf_dev_cuda10.1.yml +++ b/conda/environments/cudf_dev_cuda10.1.yml @@ -43,7 +43,7 @@ dependencies: - mypy=0.782 - typing_extensions - pre_commit - - dask>=2.22.0 + - dask>=2021.3.1 - distributed>=2.22.0 - streamz - dlpack diff --git a/conda/environments/cudf_dev_cuda10.2.yml b/conda/environments/cudf_dev_cuda10.2.yml index 3a24e38a397..52d82c4f4ef 100644 --- a/conda/environments/cudf_dev_cuda10.2.yml +++ b/conda/environments/cudf_dev_cuda10.2.yml @@ -43,7 +43,7 @@ dependencies: - mypy=0.782 - typing_extensions - pre_commit - - dask>=2.22.0 + - dask>=2021.3.1 - distributed>=2.22.0 - streamz - dlpack diff --git a/conda/environments/cudf_dev_cuda11.0.yml b/conda/environments/cudf_dev_cuda11.0.yml index 821c6f5320d..2e64365bdf6 100644 --- a/conda/environments/cudf_dev_cuda11.0.yml +++ b/conda/environments/cudf_dev_cuda11.0.yml @@ -43,7 +43,7 @@ dependencies: - mypy=0.782 - typing_extensions - pre_commit - - dask>=2.22.0 + - dask>=2021.3.1 - distributed>=2.22.0 - streamz - dlpack diff --git a/conda/recipes/dask-cudf/meta.yaml b/conda/recipes/dask-cudf/meta.yaml index 04992f8e481..a8768e26056 100644 --- a/conda/recipes/dask-cudf/meta.yaml +++ b/conda/recipes/dask-cudf/meta.yaml @@ -23,12 +23,12 @@ requirements: host: - python - cudf {{ version }}=*_{{ GIT_DESCRIBE_NUMBER }} - - dask >=2.22.0 + - dask>=2021.3.1 - distributed >=2.22.0 run: - python - cudf {{ version }}=*_{{ GIT_DESCRIBE_NUMBER }} - - dask >=2.22.0 + - dask>=2021.3.1 - distributed >=2.22.0 test: diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index bb52ebce262..2a43aa06a8f 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -6,7 +6,11 @@ from dask.dataframe.categorical import categorical_dtype_dispatch from dask.dataframe.core import get_parallel_type, make_meta, meta_nonempty -from dask.dataframe.methods import concat_dispatch, tolist_dispatch +from dask.dataframe.methods import ( + concat_dispatch, + is_categorical_dtype_dispatch, + tolist_dispatch, +) from dask.dataframe.utils import ( UNKNOWN_CATEGORIES, _nonempty_scalar, @@ -220,6 
+224,13 @@ def tolist_cudf(obj): return obj.to_arrow().to_pylist() +@is_categorical_dtype_dispatch.register( + (cudf.Series, cudf.Index, cudf.CategoricalDtype, Series) +) +def is_categorical_dtype_cudf(obj): + return cudf.utils.dtypes.is_categorical_dtype(obj) + + try: from dask.dataframe.utils import group_split_dispatch, hash_object_dispatch diff --git a/python/dask_cudf/dask_cudf/tests/test_dispatch.py b/python/dask_cudf/dask_cudf/tests/test_dispatch.py new file mode 100644 index 00000000000..6bf4b956404 --- /dev/null +++ b/python/dask_cudf/dask_cudf/tests/test_dispatch.py @@ -0,0 +1,16 @@ +import pandas as pd + +from dask.dataframe.methods import is_categorical_dtype + +import cudf + + +def test_is_categorical_dispatch(): + assert is_categorical_dtype(pd.CategoricalDtype([1, 2, 3])) + assert is_categorical_dtype(cudf.CategoricalDtype([1, 2, 3])) + + assert is_categorical_dtype(cudf.Series([1, 2, 3], dtype="category")) + assert is_categorical_dtype(pd.Series([1, 2, 3], dtype="category")) + + assert is_categorical_dtype(pd.Index([1, 2, 3], dtype="category")) + assert is_categorical_dtype(cudf.Index([1, 2, 3], dtype="category")) diff --git a/python/dask_cudf/dask_cudf/tests/test_onehot.py b/python/dask_cudf/dask_cudf/tests/test_onehot.py index d5fb9e9a110..a9d88b5203c 100644 --- a/python/dask_cudf/dask_cudf/tests/test_onehot.py +++ b/python/dask_cudf/dask_cudf/tests/test_onehot.py @@ -3,10 +3,10 @@ from dask import dataframe as dd -import dask_cudf - import cudf +import dask_cudf + def test_get_dummies_cat(): df = pd.DataFrame({"C": [], "A": []}) @@ -101,3 +101,22 @@ def test_get_dummies_large(): dd.get_dummies(gddf).compute(), check_dtype=False, ) + + +def test_get_dummies_categorical(): + # https://github.com/rapidsai/cudf/issues/7111 + gdf = cudf.DataFrame({"A": ["a", "b", "b"], "B": [1, 2, 3]}) + pdf = gdf.to_pandas() + + gddf = dask_cudf.from_cudf(gdf, npartitions=1) + gddf = gddf.categorize(columns=["B"]) + + pddf = dd.from_pandas(pdf, npartitions=1) + pddf = pddf.categorize(columns=["B"]) + + expect = dd.get_dummies(pddf, columns=["B"]) + got = dd.get_dummies(gddf, columns=["B"]) + + dd.assert_eq( + expect, got, + ) From 635dc9c64005fe254608222d8c21fa31ddeef048 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Tue, 30 Mar 2021 19:12:12 -0400 Subject: [PATCH 03/14] Fixing issue with explode_outer position not nulling position entries of null rows (#7754) `explode_outer` supports writing a position column, but if the row was null it would incorrectly set the position to 0 and the row valid. Instead, it should null that position row as well. Luckily the null column matches 100% with the null column of the exploded column, so we can just copy it after it is created. Fixes #7721 Authors: - Mike Wilson (@hyperbolic2346) Approvers: - Conor Hoekstra (@codereport) - Jake Hemstad (@jrhemstad) URL: https://github.com/rapidsai/cudf/pull/7754 --- cpp/src/lists/explode.cu | 29 +++++++++++++++++------------ cpp/tests/lists/explode_tests.cpp | 21 +++++++++++---------- 2 files changed, 28 insertions(+), 22 deletions(-) diff --git a/cpp/src/lists/explode.cu b/cpp/src/lists/explode.cu index 2b495deb47f..5f6f1c308ac 100644 --- a/cpp/src/lists/explode.cu +++ b/cpp/src/lists/explode.cu @@ -62,22 +62,27 @@ std::unique_ptr build_table( std::vector> columns = gathered_table.release()->release(); - columns.insert(columns.begin() + explode_column_idx, - explode_col_gather_map - ? 
std::move(detail::gather(table_view({sliced_child}), - explode_col_gather_map->begin(), - explode_col_gather_map->end(), - cudf::out_of_bounds_policy::NULLIFY, - stream, - mr) - ->release()[0]) - : std::make_unique(sliced_child, stream, mr)); + auto inserted = columns.insert(columns.begin() + explode_column_idx, + explode_col_gather_map + ? std::move(detail::gather(table_view({sliced_child}), + explode_col_gather_map->begin(), + explode_col_gather_map->end(), + cudf::out_of_bounds_policy::NULLIFY, + stream, + mr) + ->release()[0]) + : std::make_unique(sliced_child, stream, mr)); if (position_array) { size_type position_size = position_array->size(); + // the null mask for position matches the exploded column's gather map, so copy it over + rmm::device_buffer nullmask = + explode_col_gather_map ? copy_bitmask(*inserted->get()) : rmm::device_buffer(0, stream); columns.insert(columns.begin() + explode_column_idx, - std::make_unique( - data_type(type_to_id()), position_size, position_array->release())); + std::make_unique(data_type(type_to_id()), + position_size, + position_array->release(), + std::move(nullmask))); } return std::make_unique
(std::move(columns)); diff --git a/cpp/tests/lists/explode_tests.cpp b/cpp/tests/lists/explode_tests.cpp index 4c7ded0efd7..ded3d2b9193 100644 --- a/cpp/tests/lists/explode_tests.cpp +++ b/cpp/tests/lists/explode_tests.cpp @@ -530,7 +530,7 @@ TEST_F(ExplodeOuterTest, SingleNull) auto ret = cudf::explode_outer(t, 0); CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); - FCW expected_pos_col{0, 0, 1, 0, 0, 1}; + FCW expected_pos_col{{0, 0, 1, 0, 0, 1}, {0, 1, 1, 0, 1, 1}}; cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); auto pos_ret = cudf::explode_outer_position(t, 0); @@ -561,7 +561,7 @@ TEST_F(ExplodeOuterTest, Nulls) auto ret = cudf::explode_outer(t, 0); CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); - FCW expected_pos_col{0, 1, 2, 0, 0, 1}; + FCW expected_pos_col{{0, 1, 2, 0, 0, 1}, {1, 1, 1, 0, 1, 1}}; cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); auto pos_ret = cudf::explode_outer_position(t, 0); @@ -591,7 +591,7 @@ TEST_F(ExplodeOuterTest, AllNulls) auto ret = cudf::explode_outer(t, 0); CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); - FCW expected_pos_col{0, 0, 0}; + FCW expected_pos_col{{0, 0, 0}, {0, 0, 0}}; cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); auto pos_ret = cudf::explode_outer_position(t, 0); @@ -624,7 +624,7 @@ TEST_F(ExplodeOuterTest, SequentialNulls) auto ret = cudf::explode_outer(t, 0); CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); - FCW expected_pos_col{0, 1, 2, 0, 1, 0, 0, 0, 1, 2}; + FCW expected_pos_col{{0, 1, 2, 0, 1, 0, 0, 0, 1, 2}, {1, 1, 0, 1, 1, 0, 0, 1, 1, 1}}; cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); auto pos_ret = cudf::explode_outer_position(t, 0); @@ -655,7 +655,7 @@ TEST_F(ExplodeOuterTest, MoreEmptyThanData) auto ret = cudf::explode_outer(t, 0); CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); - FCW expected_pos_col{0, 1, 0, 0, 0, 0, 0}; + FCW expected_pos_col{{0, 1, 0, 0, 0, 0, 0}, {1, 1, 0, 0, 0, 0, 1}}; cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); auto pos_ret = cudf::explode_outer_position(t, 0); @@ -685,7 +685,7 @@ TEST_F(ExplodeOuterTest, TrailingEmptys) auto ret = cudf::explode_outer(t, 0); CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); - FCW expected_pos_col{0, 1, 0, 0, 0, 0}; + FCW expected_pos_col{{0, 1, 0, 0, 0, 0}, {1, 1, 0, 0, 0, 0}}; cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); auto pos_ret = cudf::explode_outer_position(t, 0); @@ -718,7 +718,7 @@ TEST_F(ExplodeOuterTest, LeadingNulls) auto ret = cudf::explode_outer(t, 0); CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); - FCW expected_pos_col{0, 0, 0, 0, 0, 1}; + FCW expected_pos_col{{0, 0, 0, 0, 0, 1}, {0, 0, 0, 0, 1, 1}}; cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); auto pos_ret = cudf::explode_outer_position(t, 0); @@ -753,7 +753,7 @@ TEST_F(ExplodeOuterTest, NullsInList) CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); - FCW expected_pos_col{0, 1, 2, 0, 1, 2, 3, 0, 0, 1, 2}; + FCW expected_pos_col{{0, 1, 2, 0, 1, 2, 3, 0, 0, 1, 2}, {1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1}}; cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); auto pos_ret = cudf::explode_outer_position(t, 0); @@ -813,7 +813,7 @@ TEST_F(ExplodeOuterTest, NestedNulls) auto ret = cudf::explode_outer(t, 0); CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); - FCW expected_pos_col{0, 1, 0, 0, 1, 2}; + FCW expected_pos_col{{0, 1, 0, 0, 1, 2}, {1, 1, 0, 
1, 1, 1}}; cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); auto pos_ret = cudf::explode_outer_position(t, 0); @@ -884,7 +884,8 @@ TEST_F(ExplodeOuterTest, NullsInNestedDoubleExplode) CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); - FCW expected_pos_col{0, 1, 0, 0, 1, 2, 0, 1, 0, 1, 0, 0, 1}; + FCW expected_pos_col{{0, 1, 0, 0, 1, 2, 0, 1, 0, 1, 0, 0, 1}, + {1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0}}; cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); auto pos_ret = cudf::explode_outer_position(first_explode_ret->view(), 0); From 4ee52f3dd6b26d76bddd5890d8ed7d70e1142aef Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 30 Mar 2021 19:46:29 -0500 Subject: [PATCH 04/14] Add `ignore_order` parameter to dask-cudf concat dispatch (#7765) A new parameter was added to dask upstream which is breaking dask-cudf: https://github.com/dask/dask/issues/7398 In this PR added that parameter to unblock the breakage. Authors: - GALI PREM SAGAR (@galipremsagar) Approvers: - @jakirkham - Keith Kraus (@kkraus14) URL: https://github.com/rapidsai/cudf/pull/7765 --- python/dask_cudf/dask_cudf/backends.py | 58 ++++++++++++++++++++------ 1 file changed, 45 insertions(+), 13 deletions(-) diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index 2a43aa06a8f..66b06acc858 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -1,9 +1,13 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. + +from distutils.version import LooseVersion + import cupy as cp import numpy as np import pandas as pd import pyarrow as pa +import dask from dask.dataframe.categorical import categorical_dtype_dispatch from dask.dataframe.core import get_parallel_type, make_meta, meta_nonempty from dask.dataframe.methods import ( @@ -27,6 +31,7 @@ get_parallel_type.register(cudf.DataFrame, lambda _: DataFrame) get_parallel_type.register(cudf.Series, lambda _: Series) get_parallel_type.register(cudf.Index, lambda _: Index) +DASK_VERSION = LooseVersion(dask.__version__) @meta_nonempty.register(cudf.Index) @@ -200,18 +205,45 @@ def make_meta_object(x, index=None): raise TypeError(f"Don't know how to create metadata from {x}") -@concat_dispatch.register((cudf.DataFrame, cudf.Series, cudf.Index)) -def concat_cudf( - dfs, - axis=0, - join="outer", - uniform=False, - filter_warning=True, - sort=None, - ignore_index=False, -): - assert join == "outer" - return cudf.concat(dfs, axis=axis, ignore_index=ignore_index) +if DASK_VERSION > "2021.03.1": + + @concat_dispatch.register((cudf.DataFrame, cudf.Series, cudf.Index)) + def concat_cudf( + dfs, + axis=0, + join="outer", + uniform=False, + filter_warning=True, + sort=None, + ignore_index=False, + **kwargs, + ): + assert join == "outer" + + ignore_order = kwargs.get("ignore_order", False) + if ignore_order: + raise NotImplementedError( + "ignore_order parameter is not yet supported in dask-cudf" + ) + + return cudf.concat(dfs, axis=axis, ignore_index=ignore_index) + + +else: + + @concat_dispatch.register((cudf.DataFrame, cudf.Series, cudf.Index)) + def concat_cudf( + dfs, + axis=0, + join="outer", + uniform=False, + filter_warning=True, + sort=None, + ignore_index=False, + ): + assert join == "outer" + + return cudf.concat(dfs, axis=axis, ignore_index=ignore_index) @categorical_dtype_dispatch.register((cudf.DataFrame, cudf.Series, cudf.Index)) From bd11dbea741428f4463dc1f0eb0e084e0d77c853 Mon Sep 17 00:00:00 
2001 From: Nghia Truong Date: Tue, 30 Mar 2021 22:14:49 -0600 Subject: [PATCH 05/14] Fix NaN handling in drop_list_duplicates (#7662) This PR modifies the behavior of `drop_list_duplicates` to satisfy both Apache Spark and Pandas behavior when dealing with `NaN` value in floating-point columns data: * In Apache Spark, `NaNs` are treated as different values, thus no `NaN` entry should be removed after calling `drop_list_duplicates`. * In Pandas, `NaNs` are considered as the same value, and even `-NaN` is considered as the same as `NaN`. Thus, only one `NaN` entry per list will be kept. New tests have also been added to verify such desired behavior. Authors: - Nghia Truong (@ttnghia) Approvers: - Jake Hemstad (@jrhemstad) - @nvdbaranec - Keith Kraus (@kkraus14) URL: https://github.com/rapidsai/cudf/pull/7662 --- cpp/include/cudf/aggregation.hpp | 5 +- .../cudf/detail/aggregation/aggregation.hpp | 18 +- .../lists/detail/drop_list_duplicates.hpp | 1 + .../cudf/lists/drop_list_duplicates.hpp | 3 + cpp/include/cudf/types.hpp | 9 + cpp/src/aggregation/aggregation.cpp | 5 +- cpp/src/groupby/sort/aggregate.cpp | 13 +- cpp/src/lists/drop_list_duplicates.cu | 373 +++++++++++++++--- .../lists/drop_list_duplicates_tests.cpp | 305 ++++++++------ 9 files changed, 535 insertions(+), 197 deletions(-) diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp index 3c454c85720..74ce6e42d7e 100644 --- a/cpp/include/cudf/aggregation.hpp +++ b/cpp/include/cudf/aggregation.hpp @@ -230,10 +230,13 @@ std::unique_ptr make_collect_list_aggregation( * @param null_handling Indicates whether to include/exclude nulls during collection * @param nulls_equal Flag to specify whether null entries within each list should be considered * equal + * @param nans_equal Flag to specify whether NaN values in floating point column should be + * considered equal */ std::unique_ptr make_collect_set_aggregation( null_policy null_handling = null_policy::INCLUDE, - null_equality null_equal = null_equality::EQUAL); + null_equality nulls_equal = null_equality::EQUAL, + nan_equality nans_equal = nan_equality::UNEQUAL); /// Factory to create a LAG aggregation std::unique_ptr make_lag_aggregation(size_type offset); diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp index 18bef301e03..0bfe6b84ae2 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.hpp +++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp @@ -345,24 +345,32 @@ struct collect_list_aggregation final : derived_aggregation */ struct collect_set_aggregation final : derived_aggregation { explicit collect_set_aggregation(null_policy null_handling = null_policy::INCLUDE, - null_equality null_equal = null_equality::EQUAL) - : derived_aggregation{COLLECT_SET}, _null_handling{null_handling}, _null_equal(null_equal) + null_equality nulls_equal = null_equality::EQUAL, + nan_equality nans_equal = nan_equality::UNEQUAL) + : derived_aggregation{COLLECT_SET}, + _null_handling{null_handling}, + _nulls_equal(nulls_equal), + _nans_equal(nans_equal) { } null_policy _null_handling; ///< include or exclude nulls - null_equality _null_equal; ///< whether to consider nulls as equal values + null_equality _nulls_equal; ///< whether to consider nulls as equal values + nan_equality _nans_equal; ///< whether to consider NaNs as equal value (applicable only to + ///< floating point types) protected: friend class derived_aggregation; bool operator==(collect_set_aggregation const& other) const { - 
return _null_handling == other._null_handling && _null_equal == other._null_equal; + return _null_handling == other._null_handling && _nulls_equal == other._nulls_equal && + _nans_equal == other._nans_equal; } size_t hash_impl() const { - return std::hash{}(static_cast(_null_handling) ^ static_cast(_null_equal)); + return std::hash{}(static_cast(_null_handling) ^ static_cast(_nulls_equal) ^ + static_cast(_nans_equal)); } }; diff --git a/cpp/include/cudf/lists/detail/drop_list_duplicates.hpp b/cpp/include/cudf/lists/detail/drop_list_duplicates.hpp index ba3e1d17d7f..53b31015145 100644 --- a/cpp/include/cudf/lists/detail/drop_list_duplicates.hpp +++ b/cpp/include/cudf/lists/detail/drop_list_duplicates.hpp @@ -31,6 +31,7 @@ namespace detail { std::unique_ptr drop_list_duplicates( lists_column_view const& lists_column, null_equality nulls_equal, + nan_equality nans_equal, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail diff --git a/cpp/include/cudf/lists/drop_list_duplicates.hpp b/cpp/include/cudf/lists/drop_list_duplicates.hpp index 0939bd7956a..f1ce3b7f0e3 100644 --- a/cpp/include/cudf/lists/drop_list_duplicates.hpp +++ b/cpp/include/cudf/lists/drop_list_duplicates.hpp @@ -41,6 +41,8 @@ namespace lists { * * @param lists_column The input lists_column_view * @param nulls_equal Flag to specify whether null entries should be considered equal + * @param nans_equal Flag to specify whether NaN entries should be considered as equal value (only + * applicable for floating point data column) * @param mr Device resource used to allocate memory * * @code{.pseudo} @@ -56,6 +58,7 @@ namespace lists { std::unique_ptr drop_list_duplicates( lists_column_view const& lists_column, null_equality nulls_equal = null_equality::EQUAL, + nan_equality nans_equal = nan_equality::UNEQUAL, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/types.hpp b/cpp/include/cudf/types.hpp index 1b8d83883b3..789bb3037f4 100644 --- a/cpp/include/cudf/types.hpp +++ b/cpp/include/cudf/types.hpp @@ -137,6 +137,15 @@ enum class nan_policy : bool { NAN_IS_VALID ///< treat nans as valid elements (non-null) }; +/** + * @brief Enum to consider different elements (of floating point types) holding NaN value as equal + * or unequal + */ +enum class nan_equality /*unspecified*/ { + ALL_EQUAL, ///< All NaNs compare equal, regardless of sign + UNEQUAL ///< All NaNs compare unequal (IEEE754 behavior) +}; + /** * @brief */ diff --git a/cpp/src/aggregation/aggregation.cpp b/cpp/src/aggregation/aggregation.cpp index 33c19617308..3a044a42101 100644 --- a/cpp/src/aggregation/aggregation.cpp +++ b/cpp/src/aggregation/aggregation.cpp @@ -132,9 +132,10 @@ std::unique_ptr make_collect_list_aggregation(null_policy null_hand } /// Factory to create a COLLECT_SET aggregation std::unique_ptr make_collect_set_aggregation(null_policy null_handling, - null_equality null_equal) + null_equality nulls_equal, + nan_equality nans_equal) { - return std::make_unique(null_handling, null_equal); + return std::make_unique(null_handling, nulls_equal, nans_equal); } /// Factory to create a LAG aggregation std::unique_ptr make_lag_aggregation(size_type offset) diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp index 4e2303c8b9b..46185e07600 100644 --- a/cpp/src/groupby/sort/aggregate.cpp +++ b/cpp/src/groupby/sort/aggregate.cpp @@ -379,11 +379,14 @@ void 
aggregrate_result_functor::operator()(aggregation auto const collect_result = detail::group_collect( get_grouped_values(), helper.group_offsets(stream), helper.num_groups(stream), stream, mr); auto const nulls_equal = - static_cast(agg)._null_equal; - cache.add_result(col_idx, - agg, - lists::detail::drop_list_duplicates( - lists_column_view(collect_result->view()), nulls_equal, stream, mr)); + static_cast(agg)._nulls_equal; + auto const nans_equal = + static_cast(agg)._nans_equal; + cache.add_result( + col_idx, + agg, + lists::detail::drop_list_duplicates( + lists_column_view(collect_result->view()), nulls_equal, nans_equal, stream, mr)); }; } // namespace detail diff --git a/cpp/src/lists/drop_list_duplicates.cu b/cpp/src/lists/drop_list_duplicates.cu index 584b9791d19..564d919b65d 100644 --- a/cpp/src/lists/drop_list_duplicates.cu +++ b/cpp/src/lists/drop_list_duplicates.cu @@ -16,15 +16,16 @@ #include #include -#include +#include #include #include #include #include #include -#include +#include #include +#include #include #include @@ -34,62 +35,100 @@ namespace cudf { namespace lists { namespace detail { namespace { +template +struct has_negative_nans { + column_device_view const d_entries; + bool const has_nulls; + + __device__ Type operator()(size_type idx) const noexcept + { + if (has_nulls && d_entries.is_null_nocheck(idx)) { return false; } + + auto const val = d_entries.element(idx); + return std::isnan(val) && std::signbit(val); // std::signbit(x) == true if x is negative + } +}; /** - * @brief Copy list entries and entry list offsets ignoring duplicates - * - * Given an array of all entries flattened from a list column and an array that maps each entry to - * the offset of the list containing that entry, those entries and list offsets are copied into - * new arrays such that the duplicated entries within each list will be ignored. 
- * - * @param all_lists_entries The input array containing all list entries - * @param entries_list_offsets A map from list entries to their corresponding list offsets - * @param nulls_equal Flag to specify whether null entries should be considered equal - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device resource used to allocate memory - * - * @return A pair of columns, the first one contains unique list entries and the second one - * contains their corresponding list offsets + * @brief A structure to be used along with type_dispatcher to check if a + * `column_view` has any negative NaN entry */ -template -std::vector> get_unique_entries_and_list_offsets( - column_view const& all_lists_entries, - column_view const& entries_list_offsets, - null_equality nulls_equal, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - // Create an intermediate table, since the comparator only work on tables - auto const device_input_table = - cudf::table_device_view::create(table_view{{all_lists_entries}}, stream); - auto const comp = row_equality_comparator( - *device_input_table, *device_input_table, nulls_equal == null_equality::EQUAL); +struct has_negative_nans_fn { + template >* = nullptr> + bool operator()(column_view const& lists_entries, rmm::cuda_stream_view stream) const noexcept + { + auto const d_entries = column_device_view::create(lists_entries, stream); + return thrust::count_if(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(lists_entries.size()), + detail::has_negative_nans{*d_entries, lists_entries.has_nulls()}); + } - auto const num_entries = all_lists_entries.size(); - // Allocate memory to store the indices of the unique entries - auto const unique_indices = cudf::make_numeric_column( - entries_list_offsets.type(), num_entries, mask_state::UNALLOCATED, stream); - auto const unique_indices_begin = unique_indices->mutable_view().begin(); + template >* = nullptr> + bool operator()(column_view const&, rmm::cuda_stream_view) const noexcept + { + // Columns of non floating-point data will never contain NaN + return false; + } +}; - auto const copy_end = thrust::unique_copy( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_entries), - unique_indices_begin, - [list_offsets = entries_list_offsets.begin(), comp] __device__(auto i, auto j) { - return list_offsets[i] == list_offsets[j] && comp(i, j); - }); +template +struct replace_negative_nans { + __device__ Type operator()(Type val) const noexcept + { + return std::isnan(val) ? 
std::numeric_limits::quiet_NaN() : val; + } +}; - // Collect unique entries and entry list offsets - auto const indices = cudf::detail::slice( - unique_indices->view(), 0, thrust::distance(unique_indices_begin, copy_end)); - return cudf::detail::gather(table_view{{all_lists_entries, entries_list_offsets}}, - indices, - cudf::out_of_bounds_policy::DONT_CHECK, - cudf::detail::negative_index_policy::NOT_ALLOWED, - stream, - mr) - ->release(); +/** + * @brief A structure to be used along with type_dispatcher to replace -NaN by NaN for all entries + * of a floating-point data column + */ +struct replace_negative_nans_fn { + template >* = nullptr> + void operator()(column_view const&, mutable_column_view const&, rmm::cuda_stream_view) const + { + CUDF_FAIL("Cannot operate on a type that is not floating-point."); + } + + template >* = nullptr> + void operator()(column_view const& lists_entries, + mutable_column_view const& new_entries, + rmm::cuda_stream_view stream) const noexcept + { + // Do not care whether an entry is null or not, just consider it as a floating-point value + thrust::transform(rmm::exec_policy(stream), + lists_entries.begin(), + lists_entries.end(), + new_entries.begin(), + detail::replace_negative_nans{}); + } +}; + +/** + * @brief Transform a given lists column to a new lists column in which all the list entries holding + * -NaN value are replaced by (positive) NaN + */ +std::unique_ptr replace_negative_nans_entries(column_view const& lists_entries, + lists_column_view const& lists_column, + rmm::cuda_stream_view stream) +{ + auto new_offsets = std::make_unique(lists_column.offsets()); + auto new_entries = std::make_unique(lists_entries); + + type_dispatcher(lists_entries.type(), + detail::replace_negative_nans_fn{}, + lists_entries, + new_entries->mutable_view(), + stream); + + return make_lists_column( + lists_column.size(), + std::move(new_offsets), + std::move(new_entries), + lists_column.null_count(), + cudf::detail::copy_bitmask( + lists_column.parent(), stream, rmm::mr::get_current_device_resource())); } /** @@ -165,6 +204,189 @@ std::unique_ptr generate_entry_list_offsets(size_type num_entries, return entry_list_offsets; } +/** + * @brief Performs an equality comparison between two entries in a lists column + * + * For the two elements that are in the same list in the lists column, they will always be + * considered as different. If they are from the same list and their type is one of floating + * point types, this functor will return the same comparison result as + * `cudf::element_equality_comparator`. + * + * For floating-point types, entries holding NaN value can be considered as different values or the + * same value depending on the nans_equal parameter. 
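+ * For example, when nans_equal == true, two NaN entries within the same list compare equal (the Pandas behavior: at most one NaN is kept per list); when nans_equal == false, they compare unequal (the Apache Spark behavior: every NaN is kept).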
+ * + * @tparam Type The data type of entries + * @tparam nans_equal Flag to specify whether NaN entries should be considered as equal value (only + * applicable for floating-point data column) + */ +template +class list_entry_comparator { + public: + list_entry_comparator(offset_type const* list_offsets, + column_device_view d_view, + null_equality nulls_equal, + bool has_nulls) + : list_offsets(list_offsets), d_view{d_view}, nulls_equal{nulls_equal}, has_nulls(has_nulls) + { + } + + template + std::enable_if_t and nans_equal_, bool> __device__ + operator()(size_type i, size_type j) const noexcept + { + // Two entries are not considered for equality if they belong to different lists + if (list_offsets[i] != list_offsets[j]) { return false; } + + if (has_nulls) { + bool const nullable = d_view.nullable(); + bool const lhs_is_null{nullable and d_view.is_null_nocheck(i)}; + bool const rhs_is_null{nullable and d_view.is_null_nocheck(j)}; + if (lhs_is_null and rhs_is_null) { + return nulls_equal == null_equality::EQUAL; + } else if (lhs_is_null != rhs_is_null) { + return false; + } + } + + // For floating-point types, if both element(i) and element(j) are NaNs then this comparison + // will return `true`. This is the desired behavior in Pandas. + auto const lhs = d_view.element(i); + auto const rhs = d_view.element(j); + if (std::isnan(lhs) and std::isnan(rhs)) { return true; } + return lhs == rhs; + } + + template + std::enable_if_t or not nans_equal_, bool> __device__ + operator()(size_type i, size_type j) const noexcept + { + // Two entries are not considered for equality if they belong to different lists + if (list_offsets[i] != list_offsets[j]) { return false; } + + if (has_nulls) { + bool const nullable = d_view.nullable(); + bool const lhs_is_null{nullable and d_view.is_null_nocheck(i)}; + bool const rhs_is_null{nullable and d_view.is_null_nocheck(j)}; + if (lhs_is_null and rhs_is_null) { + return nulls_equal == null_equality::EQUAL; + } else if (lhs_is_null != rhs_is_null) { + return false; + } + } + + // For floating-point types, if both element(i) and element(j) are NaNs then this comparison + // will return `false`. This is the desired behavior in Apache Spark. 
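+ // No explicit NaN check is needed on this path: IEEE-754 comparison already yields false for NaN == NaN.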
+ return d_view.element(i) == d_view.element(j); + } + + private: + offset_type const* list_offsets; + column_device_view d_view; + null_equality nulls_equal; + bool has_nulls; +}; + +/** + * @brief Construct type-dispatched function object for copying indices of the list entries + * ignoring duplicates + */ +struct get_unique_entries_fn { + template ()>* = nullptr> + offset_type* operator()(offset_type const*, + column_device_view&, + size_type, + offset_type*, + null_equality, + nan_equality, + bool, + rmm::cuda_stream_view) const + { + CUDF_FAIL("Cannot operate on types that are not equally comparable."); + } + + template ()>* = nullptr> + offset_type* operator()(offset_type const* list_offsets, + column_device_view& d_view, + size_type num_entries, + offset_type* output_begin, + null_equality nulls_equal, + nan_equality nans_equal, + bool has_nulls, + rmm::cuda_stream_view stream) const noexcept + { + if (nans_equal == nan_equality::ALL_EQUAL) { + list_entry_comparator const comp{list_offsets, d_view, nulls_equal, has_nulls}; + return thrust::unique_copy(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_entries), + output_begin, + comp); + } else { + list_entry_comparator const comp{list_offsets, d_view, nulls_equal, has_nulls}; + return thrust::unique_copy(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_entries), + output_begin, + comp); + } + } +}; + +/** + * @brief Copy list entries and entry list offsets ignoring duplicates + * + * Given an array of all entries flattened from a list column and an array that maps each entry to + * the offset of the list containing that entry, those entries and list offsets are copied into + * new arrays such that the duplicated entries within each list will be ignored. 
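+ * For example, the flattened entries [1, 1, 2, 2, 2] coming from the two lists {1, 1, 2} and {2, 2} have entries_list_offsets [0, 0, 0, 3, 3]; the unique copy keeps the entries [1, 2, 2] together with their offsets [0, 0, 3].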
+ * + * @param all_lists_entries The input array containing all list entries + * @param entries_list_offsets A map from list entries to their corresponding list offsets + * @param nulls_equal Flag to specify whether null entries should be considered equal + * @param nans_equal Flag to specify whether NaN entries should be considered as equal + * value (only applicable for floating-point data column) + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device resource used to allocate memory + * + * @return A pair of columns, the first one contains unique list entries and the second one + * contains their corresponding list offsets + */ +std::vector> get_unique_entries_and_list_offsets( + column_view const& all_lists_entries, + column_view const& entries_list_offsets, + null_equality nulls_equal, + nan_equality nans_equal, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto const num_entries = all_lists_entries.size(); + auto const d_view_entries = column_device_view::create(all_lists_entries, stream); + + // Allocate memory to store the indices of the unique entries + auto unique_indices = rmm::device_uvector(num_entries, stream); + auto const output_begin = unique_indices.begin(); + auto const output_end = type_dispatcher(all_lists_entries.type(), + get_unique_entries_fn{}, + entries_list_offsets.begin(), + *d_view_entries, + num_entries, + output_begin, + nulls_equal, + nans_equal, + all_lists_entries.has_nulls(), + stream); + + // Collect unique entries and entry list offsets + // The new null_count and bitmask of the unique entries will also be generated + // by the gather function + return cudf::detail::gather(table_view{{all_lists_entries, entries_list_offsets}}, + output_begin, + output_end, + cudf::out_of_bounds_policy::DONT_CHECK, + stream, + mr) + ->release(); +} + /** * @brief Generate list offsets from entry offsets * @@ -225,6 +447,7 @@ void generate_offsets(size_type num_entries, return offsets[i - prefix_sum_empty_lists[i]]; }); } + } // anonymous namespace /** @@ -234,6 +457,7 @@ void generate_offsets(size_type num_entries, */ std::unique_ptr drop_list_duplicates(lists_column_view const& lists_column, null_equality nulls_equal, + nan_equality nans_equal, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -242,27 +466,40 @@ std::unique_ptr drop_list_duplicates(lists_column_view const& lists_colu CUDF_FAIL("Nested types are not supported in drop_list_duplicates."); } - // Call segmented sort on the list elements and store them in a temporary column sorted_list - auto const sorted_lists = - detail::sort_lists(lists_column, order::ASCENDING, null_order::AFTER, stream); - // Flatten all entries (depth = 1) of the lists column - auto const all_lists_entries = lists_column_view(sorted_lists->view()).get_sliced_child(stream); + auto const lists_entries = lists_column.get_sliced_child(stream); + + // sorted_lists will store the results of the original lists after calling segmented_sort + auto const sorted_lists = [&]() { + // If nans_equal == ALL_EQUAL and the column contains lists of floating-point data type, + // we need to replace -NaN by NaN before sorting + auto const replace_negative_nan = + nans_equal == nan_equality::ALL_EQUAL and + type_dispatcher(lists_entries.type(), detail::has_negative_nans_fn{}, lists_entries, stream); + if (replace_negative_nan) { + // The column new_lists_column is temporary, thus we will not pass in `mr` + auto const new_lists_column = + 
detail::replace_negative_nans_entries(lists_entries, lists_column, stream); + return detail::sort_lists( + lists_column_view(new_lists_column->view()), order::ASCENDING, null_order::AFTER, stream); + } else { + return detail::sort_lists(lists_column, order::ASCENDING, null_order::AFTER, stream); + } + }(); + + auto const sorted_lists_entries = + lists_column_view(sorted_lists->view()).get_sliced_child(stream); // Generate a 0-based offset column auto lists_offsets = detail::generate_clean_offsets(lists_column, stream, mr); // Generate a mapping from list entries to offsets of the lists containing those entries auto const entries_list_offsets = - detail::generate_entry_list_offsets(all_lists_entries.size(), lists_offsets->view(), stream); + detail::generate_entry_list_offsets(sorted_lists_entries.size(), lists_offsets->view(), stream); // Copy non-duplicated entries (along with their list offsets) to new arrays - auto unique_entries_and_list_offsets = - all_lists_entries.has_nulls() - ? detail::get_unique_entries_and_list_offsets( - all_lists_entries, entries_list_offsets->view(), nulls_equal, stream, mr) - : detail::get_unique_entries_and_list_offsets( - all_lists_entries, entries_list_offsets->view(), nulls_equal, stream, mr); + auto unique_entries_and_list_offsets = detail::get_unique_entries_and_list_offsets( + sorted_lists_entries, entries_list_offsets->view(), nulls_equal, nans_equal, stream, mr); // Generate offsets for the new lists column detail::generate_offsets(unique_entries_and_list_offsets.front()->size(), @@ -271,6 +508,10 @@ std::unique_ptr drop_list_duplicates(lists_column_view const& lists_colu stream); // Construct a new lists column without duplicated entries + // Reuse the null_count and bitmask of the lists_column: those are the null information for + // the list elements (rows) + // For the entries of those lists (rows), their null_count and bitmask were generated separately + // during the step `get_unique_entries_and_list_offsets` above return make_lists_column(lists_column.size(), std::move(lists_offsets), std::move(unique_entries_and_list_offsets.front()), @@ -285,10 +526,12 @@ std::unique_ptr drop_list_duplicates(lists_column_view const& lists_colu */ std::unique_ptr drop_list_duplicates(lists_column_view const& lists_column, null_equality nulls_equal, + nan_equality nans_equal, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::drop_list_duplicates(lists_column, nulls_equal, rmm::cuda_stream_default, mr); + return detail::drop_list_duplicates( + lists_column, nulls_equal, nans_equal, rmm::cuda_stream_default, mr); } } // namespace lists diff --git a/cpp/tests/lists/drop_list_duplicates_tests.cpp b/cpp/tests/lists/drop_list_duplicates_tests.cpp index 0948ba96f62..bc413fd220a 100644 --- a/cpp/tests/lists/drop_list_duplicates_tests.cpp +++ b/cpp/tests/lists/drop_list_duplicates_tests.cpp @@ -14,174 +14,241 @@ * limitations under the License. 
*/ -#include - #include #include +#include + +#include +#include + +#include +#include -using float_type = float; using int_type = int32_t; -using INT_LCW = cudf::test::lists_column_wrapper; -using FLT_LCW = cudf::test::lists_column_wrapper; -using STR_LCW = cudf::test::lists_column_wrapper; +using float_type = float; + +using LIST_COL_FLT = cudf::test::lists_column_wrapper; +using LIST_COL_STR = cudf::test::lists_column_wrapper; -template +auto constexpr neg_NaN = -std::numeric_limits::quiet_NaN(); +auto constexpr neg_Inf = -std::numeric_limits::infinity(); +auto constexpr NaN = std::numeric_limits::quiet_NaN(); +auto constexpr Inf = std::numeric_limits::infinity(); + +template void test_once(cudf::column_view const& input, LCW const& expected, cudf::null_equality nulls_equal = cudf::null_equality::EQUAL) { auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{input}, nulls_equal); - if (equal_test) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, true); + if (cudf::is_floating_point(input.type())) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); } else { - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); } } struct DropListDuplicatesTest : public cudf::test::BaseFixture { }; -TEST_F(DropListDuplicatesTest, InvalidCasesTests) +TEST_F(DropListDuplicatesTest, FloatingPointTestsWithSignedZero) { - // Lists of nested types are not supported - EXPECT_THROW( - cudf::lists::drop_list_duplicates(cudf::lists_column_view{INT_LCW{INT_LCW{{1, 2}, {3}}}}), - cudf::logic_error); - EXPECT_THROW( - cudf::lists::drop_list_duplicates(cudf::lists_column_view{FLT_LCW{FLT_LCW{{1, 2}, {3}}}}), - cudf::logic_error); - EXPECT_THROW( - cudf::lists::drop_list_duplicates(cudf::lists_column_view{STR_LCW{STR_LCW{STR_LCW{"string"}}}}), - cudf::logic_error); + // -0.0 and 0.0 should be considered equal + test_once(LIST_COL_FLT{0.0, 1, 2, -0.0, 1, 2, 0.0, 1, 2, -0.0, -0.0, 0.0, 0.0}, + LIST_COL_FLT{0, 1, 2}); +} + +TEST_F(DropListDuplicatesTest, FloatingPointTestsWithInf) +{ + // Lists contain inf + test_once(LIST_COL_FLT{0, 1, 2, 0, 1, 2, 0, 1, 2, Inf, Inf, Inf}, LIST_COL_FLT{0, 1, 2, Inf}); + test_once(LIST_COL_FLT{Inf, 0, neg_Inf, 0, Inf, 0, neg_Inf, 0, Inf, 0, neg_Inf}, + LIST_COL_FLT{neg_Inf, 0, Inf}); +} + +// The position of NaN is undefined after sorting, thus we need to offload the data to CPU to +// check for validity +// We will not store NaN in the results_expected variable (an unordered_set) because we can't check +// for NaN existence in a set. Instead, we will count the number of NaNs in the input and compare +// with the number of NaNs in the output. +static void test_floating_point(std::vector const& h_input, + std::unordered_set const& results_expected, + cudf::nan_equality nans_equal) +{ + // If NaNs are considered as equal value, the final result should always contain at max ONE NaN + // entry per list + std::size_t const num_NaNs = + nans_equal == cudf::nan_equality::ALL_EQUAL + ? 
std::size_t{1} + : std::count_if(h_input.begin(), h_input.end(), [](auto x) { return std::isnan(x); }); + + auto const results_col = cudf::lists::drop_list_duplicates( + cudf::lists_column_view{LIST_COL_FLT(h_input.begin(), h_input.end())}, + cudf::null_equality::EQUAL, + nans_equal); + auto const results_arr = + cudf::test::to_host(cudf::lists_column_view(results_col->view()).child()).first; + + EXPECT_EQ(results_arr.size(), results_expected.size() + num_NaNs); + + std::size_t NaN_count{0}; + std::unordered_set results; + for (auto const x : results_arr) { + if (std::isnan(x)) { + ++NaN_count; + } else { + results.insert(x); + } + } + EXPECT_TRUE(results_expected.size() == results.size() && NaN_count == num_NaNs); } -TEST_F(DropListDuplicatesTest, FloatingPointTestsNonNull) +TEST_F(DropListDuplicatesTest, FloatingPointTestsWithNaNs) +{ + std::vector h_input{ + 0, -1, 1, NaN, 2, 0, neg_NaN, 1, -2, 2, 0, 1, 2, neg_NaN, NaN, NaN, NaN, neg_NaN}; + std::unordered_set results_expected{-2, -1, 0, 1, 2}; + test_floating_point(h_input, results_expected, cudf::nan_equality::UNEQUAL); + test_floating_point(h_input, results_expected, cudf::nan_equality::ALL_EQUAL); +} + +TEST_F(DropListDuplicatesTest, FloatingPointTestsWithInfsAndNaNs) +{ + std::vector h_input{neg_Inf, 0, neg_NaN, 1, -1, -2, NaN, NaN, Inf, NaN, + neg_NaN, 2, -1, 0, neg_NaN, 1, 2, Inf, 0, 1, + neg_Inf, 2, neg_NaN, Inf, neg_NaN, neg_NaN, NaN, neg_Inf}; + std::unordered_set results_expected{-2, -1, 0, 1, 2, neg_Inf, Inf}; + test_floating_point(h_input, results_expected, cudf::nan_equality::UNEQUAL); + test_floating_point(h_input, results_expected, cudf::nan_equality::ALL_EQUAL); +} + +TEST_F(DropListDuplicatesTest, StringTestsNonNull) { // Trivial cases - test_once(FLT_LCW{{}}, FLT_LCW{{}}); - test_once(FLT_LCW{{0, 1, 2, 3, 4, 5}, {}}, FLT_LCW{{0, 1, 2, 3, 4, 5}, {}}); + test_once(LIST_COL_STR{{}}, LIST_COL_STR{{}}); + test_once(LIST_COL_STR{"this", "is", "a", "string"}, LIST_COL_STR{"a", "is", "string", "this"}); - // Multiple empty lists - test_once(FLT_LCW{{}, {}, {5, 4, 3, 2, 1, 0}, {}, {6}, {}}, - FLT_LCW{{}, {}, {0, 1, 2, 3, 4, 5}, {}, {6}, {}}); + // One list column + test_once(LIST_COL_STR{"this", "is", "is", "is", "a", "string", "string"}, + LIST_COL_STR{"a", "is", "string", "this"}); - auto constexpr p_inf = std::numeric_limits::infinity(); - auto constexpr m_inf = -std::numeric_limits::infinity(); + // Multiple lists column + test_once( + LIST_COL_STR{LIST_COL_STR{"this", "is", "a", "no duplicate", "string"}, + LIST_COL_STR{"this", "is", "is", "a", "one duplicate", "string"}, + LIST_COL_STR{"this", "is", "is", "is", "a", "two duplicates", "string"}, + LIST_COL_STR{"this", "is", "is", "is", "is", "a", "three duplicates", "string"}}, + LIST_COL_STR{LIST_COL_STR{"a", "is", "no duplicate", "string", "this"}, + LIST_COL_STR{"a", "is", "one duplicate", "string", "this"}, + LIST_COL_STR{"a", "is", "string", "this", "two duplicates"}, + LIST_COL_STR{"a", "is", "string", "this", "three duplicates"}}); +} - // Lists contain inf - // We can't test for lists containing nan because the order of nan is - // undefined after sorting - test_once(FLT_LCW{0, 1, 2, 0, 1, 2, 0, 1, 2, p_inf, p_inf, p_inf}, - FLT_LCW{0, 1, 2, p_inf}); - test_once(FLT_LCW{p_inf, 0, m_inf, 0, p_inf, 0, m_inf, 0, p_inf, 0, m_inf}, - FLT_LCW{m_inf, 0, p_inf}); +TEST_F(DropListDuplicatesTest, StringTestsWithNulls) +{ + auto const null = std::string(""); + + // One list column with null entries + test_once( + LIST_COL_STR{{"this", null, "is", "is", "is", "a", null, 
"string", null, "string"}, + cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i != 1 && i != 6 && i != 8; })}, + LIST_COL_STR{{"a", "is", "string", "this", null}, + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 4; })}); + + // Multiple lists column with null lists and null entries + test_once( + LIST_COL_STR{ + {LIST_COL_STR{ + {"this", null, "is", null, "a", null, "no duplicate", null, "string"}, + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0; })}, + LIST_COL_STR{}, + LIST_COL_STR{"this", "is", "is", "a", "one duplicate", "string"}}, + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 1; })}, + LIST_COL_STR{{LIST_COL_STR{{"a", "is", "no duplicate", "string", "this", null}, + cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i <= 4; })}, + LIST_COL_STR{}, + LIST_COL_STR{"a", "is", "one duplicate", "string", "this"}}, + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 1; })}); } -TEST_F(DropListDuplicatesTest, IntegerTestsNonNull) +template +struct DropListDuplicatesTypedTest : public cudf::test::BaseFixture { +}; +#define LIST_COL cudf::test::lists_column_wrapper + +using TypesForTest = + cudf::test::Concat; +TYPED_TEST_CASE(DropListDuplicatesTypedTest, TypesForTest); + +TYPED_TEST(DropListDuplicatesTypedTest, InvalidInputTests) { + // Lists of nested types are not supported + EXPECT_THROW( + cudf::lists::drop_list_duplicates(cudf::lists_column_view{LIST_COL{LIST_COL{{1, 2}, {3}}}}), + cudf::logic_error); +} + +TYPED_TEST(DropListDuplicatesTypedTest, TrivialInputTests) +{ + // Empty input + test_once(LIST_COL{{}}, LIST_COL{{}}); + // Trivial cases - test_once(INT_LCW{{}}, INT_LCW{{}}); - test_once(INT_LCW{{0, 1, 2, 3, 4, 5}, {}}, INT_LCW{{0, 1, 2, 3, 4, 5}, {}}); + test_once(LIST_COL{0, 1, 2, 3, 4, 5}, LIST_COL{0, 1, 2, 3, 4, 5}); // Multiple empty lists - test_once(INT_LCW{{}, {}, {5, 4, 3, 2, 1, 0}, {}, {6}, {}}, - INT_LCW{{}, {}, {0, 1, 2, 3, 4, 5}, {}, {6}, {}}); + test_once(LIST_COL{{}, {}, {5, 4, 3, 2, 1, 0}, {}, {6}, {}}, + LIST_COL{{}, {}, {0, 1, 2, 3, 4, 5}, {}, {6}, {}}); +} +TYPED_TEST(DropListDuplicatesTypedTest, NonNullInputTests) +{ // Adjacent lists containing the same entries - test_once( - INT_LCW{{1, 1, 1, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 1, 2, 2, 2}, {2, 2, 2, 2, 3, 3, 3, 3}}, - INT_LCW{{1}, {1, 2}, {2, 3}}); + test_once(LIST_COL{{1, 1, 1, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 1, 2, 2, 2}, {2, 2, 2, 2, 3, 3, 3, 3}}, + LIST_COL{{1}, {1, 2}, {2, 3}}); // Sliced list column - auto const list0 = INT_LCW{{1, 2, 3, 2, 3, 2, 3, 2, 3}, {3, 2, 1, 4, 1}, {5}, {10, 8, 9}, {6, 7}}; + auto const list0 = + LIST_COL{{1, 2, 3, 2, 3, 2, 3, 2, 3}, {3, 2, 1, 4, 1}, {5}, {10, 8, 9}, {6, 7}}; auto const list1 = cudf::slice(list0, {0, 5})[0]; auto const list2 = cudf::slice(list0, {1, 5})[0]; auto const list3 = cudf::slice(list0, {1, 3})[0]; auto const list4 = cudf::slice(list0, {0, 3})[0]; - test_once(list0, INT_LCW{{1, 2, 3}, {1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}); - test_once(list1, INT_LCW{{1, 2, 3}, {1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}); - test_once(list2, INT_LCW{{1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}); - test_once(list3, INT_LCW{{1, 2, 3, 4}, {5}}); - test_once(list4, INT_LCW{{1, 2, 3}, {1, 2, 3, 4}, {5}}); + test_once(list0, LIST_COL{{1, 2, 3}, {1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}); + test_once(list1, LIST_COL{{1, 2, 3}, {1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}); + test_once(list2, LIST_COL{{1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}); + 
test_once(list3, LIST_COL{{1, 2, 3, 4}, {5}}); + test_once(list4, LIST_COL{{1, 2, 3}, {1, 2, 3, 4}, {5}}); } -TEST_F(DropListDuplicatesTest, IntegerTestsWithNulls) +TYPED_TEST(DropListDuplicatesTypedTest, WithNullInputTests) { - auto constexpr null = std::numeric_limits::max(); + auto constexpr null = TypeParam{0}; // null lists - test_once(INT_LCW{{{3, 2, 1, 4, 1}, {5}, {}, {}, {10, 8, 9}, {6, 7}}, - cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return i != 2 && i != 3; })}, - INT_LCW{{{1, 2, 3, 4}, {5}, {}, {}, {8, 9, 10}, {6, 7}}, - cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return i != 2 && i != 3; })}); + test_once(LIST_COL{{{3, 2, 1, 4, 1}, {5}, {}, {}, {10, 8, 9}, {6, 7}}, + cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i != 2 && i != 3; })}, + LIST_COL{{{1, 2, 3, 4}, {5}, {}, {}, {8, 9, 10}, {6, 7}}, + cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i != 2 && i != 3; })}); // null entries are equal - test_once( - INT_LCW{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; })}, - INT_LCW{{1, 3, 5, 7, 9, null}, - std::initializer_list{true, true, true, true, true, false}}); + test_once( + LIST_COL{std::initializer_list{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; })}, + LIST_COL{std::initializer_list{1, 3, 5, 7, 9, null}, + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 5; })}); // nulls entries are not equal - test_once( - INT_LCW{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; })}, - INT_LCW{ - {1, 3, 5, 7, 9, null, null, null, null, null}, - std::initializer_list{true, true, true, true, true, false, false, false, false, false}}, + test_once( + LIST_COL{std::initializer_list{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; })}, + LIST_COL{std::initializer_list{1, 3, 5, 7, 9, null, null, null, null, null}, + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i < 5; })}, cudf::null_equality::UNEQUAL); } - -TEST_F(DropListDuplicatesTest, StringTestsNonNull) -{ - // Trivial cases - test_once(STR_LCW{{}}, STR_LCW{{}}); - test_once(STR_LCW{"this", "is", "a", "string"}, STR_LCW{"a", "is", "string", "this"}); - - // One list column - test_once(STR_LCW{"this", "is", "is", "is", "a", "string", "string"}, - STR_LCW{"a", "is", "string", "this"}); - - // Multiple lists column - test_once( - STR_LCW{STR_LCW{"this", "is", "a", "no duplicate", "string"}, - STR_LCW{"this", "is", "is", "a", "one duplicate", "string"}, - STR_LCW{"this", "is", "is", "is", "a", "two duplicates", "string"}, - STR_LCW{"this", "is", "is", "is", "is", "a", "three duplicates", "string"}}, - STR_LCW{STR_LCW{"a", "is", "no duplicate", "string", "this"}, - STR_LCW{"a", "is", "one duplicate", "string", "this"}, - STR_LCW{"a", "is", "string", "this", "two duplicates"}, - STR_LCW{"a", "is", "string", "this", "three duplicates"}}); -} - -TEST_F(DropListDuplicatesTest, StringTestsWithNulls) -{ - auto const null = std::string(""); - - // One list column with null entries - test_once( - STR_LCW{{"this", null, "is", "is", "is", "a", null, "string", null, "string"}, - cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return i != 1 && i != 6 && i != 8; })}, - STR_LCW{{"a", "is", "string", "this", null}, - 
  cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 4; })});
-
-  // Multiple lists column with null lists and null entries
-  test_once(
-    STR_LCW{{STR_LCW{{"this", null, "is", null, "a", null, "no duplicate", null, "string"},
-                     cudf::detail::make_counting_transform_iterator(
-                       0, [](auto i) { return i % 2 == 0; })},
-             STR_LCW{},
-             STR_LCW{"this", "is", "is", "a", "one duplicate", "string"}},
-            cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 1; })},
-    STR_LCW{
-      {STR_LCW{{"a", "is", "no duplicate", "string", "this", null},
-               cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i <= 4; })},
-       STR_LCW{},
-       STR_LCW{"a", "is", "one duplicate", "string", "this"}},
-      cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 1; })});
-}

From f285302877ca5ae94a0c8bf0a2c9ee34a1e4cd8b Mon Sep 17 00:00:00 2001
From: Michael Wang
Date: Wed, 31 Mar 2021 05:44:21 -0700
Subject: [PATCH 06/14] Adds `list.unique` API (#7664)

Closes #7414

This PR adds the `list.unique` API. Following `Series.unique` behavior, this
API treats null values as equal, and treats all nans as equal. This API does
not guarantee the order of list elements.

Example:
```python
>>> s = cudf.Series([[1, 1, 2, None, None], None, [np.nan, np.nan], []])
>>> s.list.unique() # Order of list elements is not guaranteed
0    [1.0, 2.0, nan]
1               None
2              [nan]
3                 []
dtype: list
```

Authors:
  - Michael Wang (@isVoid)

Approvers:
  - Keith Kraus (@kkraus14)
  - Nghia Truong (@ttnghia)

URL: https://github.com/rapidsai/cudf/pull/7664
---
 .../_lib/cpp/lists/drop_list_duplicates.pxd   | 15 +++++++
 python/cudf/cudf/_lib/cpp/types.pxd           |  4 ++
 python/cudf/cudf/_lib/lists.pyx               | 40 ++++++++++++++++++-
 python/cudf/cudf/core/column/lists.py         | 36 +++++++++++++++++
 python/cudf/cudf/tests/test_list.py           | 34 ++++++++++++++++
 5 files changed, 127 insertions(+), 2 deletions(-)
 create mode 100644 python/cudf/cudf/_lib/cpp/lists/drop_list_duplicates.pxd

diff --git a/python/cudf/cudf/_lib/cpp/lists/drop_list_duplicates.pxd b/python/cudf/cudf/_lib/cpp/lists/drop_list_duplicates.pxd
new file mode 100644
index 00000000000..40b1836f932
--- /dev/null
+++ b/python/cudf/cudf/_lib/cpp/lists/drop_list_duplicates.pxd
@@ -0,0 +1,15 @@
+# Copyright (c) 2021, NVIDIA CORPORATION.
+ +from libcpp.memory cimport unique_ptr + +from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view +from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.types cimport null_equality, nan_equality + +cdef extern from "cudf/lists/drop_list_duplicates.hpp" \ + namespace "cudf::lists" nogil: + cdef unique_ptr[column] drop_list_duplicates( + const lists_column_view lists_column, + null_equality nulls_equal, + nan_equality nans_equal + ) except + diff --git a/python/cudf/cudf/_lib/cpp/types.pxd b/python/cudf/cudf/_lib/cpp/types.pxd index bd1108b2cdf..1f2094b3958 100644 --- a/python/cudf/cudf/_lib/cpp/types.pxd +++ b/python/cudf/cudf/_lib/cpp/types.pxd @@ -46,6 +46,10 @@ cdef extern from "cudf/types.hpp" namespace "cudf" nogil: EQUAL "cudf::null_equality::EQUAL" UNEQUAL "cudf::null_equality::UNEQUAL" + ctypedef enum nan_equality "cudf::nan_equality": + ALL_EQUAL "cudf::nan_equality::ALL_EQUAL" + UNEQUAL "cudf::nan_equality::UNEQUAL" + ctypedef enum type_id "cudf::type_id": EMPTY "cudf::type_id::EMPTY" INT8 "cudf::type_id::INT8" diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx index 7f745e58c67..e93cba20f65 100644 --- a/python/cudf/cudf/_lib/lists.pyx +++ b/python/cudf/cudf/_lib/lists.pyx @@ -10,6 +10,9 @@ from cudf._lib.cpp.lists.count_elements cimport ( from cudf._lib.cpp.lists.explode cimport ( explode_outer as cpp_explode_outer ) +from cudf._lib.cpp.lists.drop_list_duplicates cimport ( + drop_list_duplicates as cpp_drop_list_duplicates +) from cudf._lib.cpp.lists.sorting cimport ( sort_lists as cpp_sort_lists ) @@ -22,7 +25,13 @@ from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view -from cudf._lib.cpp.types cimport size_type, order, null_order +from cudf._lib.cpp.types cimport ( + size_type, + null_equality, + order, + null_order, + nan_equality +) from cudf._lib.column cimport Column from cudf._lib.table cimport Table @@ -71,6 +80,34 @@ def explode_outer(Table tbl, int explode_column_idx, bool ignore_index=False): ) +def drop_list_duplicates(Column col, bool nulls_equal, bool nans_all_equal): + """ + nans_all_equal == True indicates that libcudf should treat any two elements + from {+nan, -nan} as equal, and as unequal otherwise. + nulls_equal == True indicates that libcudf should treat any two nulls as + equal, and as unequal otherwise. 
+ """ + cdef shared_ptr[lists_column_view] list_view = ( + make_shared[lists_column_view](col.view()) + ) + cdef null_equality c_nulls_equal = ( + null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL + ) + cdef nan_equality c_nans_equal = ( + nan_equality.ALL_EQUAL if nans_all_equal else nan_equality.UNEQUAL + ) + + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_drop_list_duplicates(list_view.get()[0], + c_nulls_equal, + c_nans_equal) + ) + return Column.from_unique_ptr(move(c_result)) + + def sort_lists(Column col, bool ascending, str na_position): cdef shared_ptr[lists_column_view] list_view = ( make_shared[lists_column_view](col.view()) @@ -121,6 +158,5 @@ def contains_scalar(Column col, DeviceScalar search_key): list_view.get()[0], search_key_value[0], )) - result = Column.from_unique_ptr(move(c_result)) return result diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index b7f34e8c007..364675cd035 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -10,6 +10,7 @@ from cudf._lib.lists import ( contains_scalar, count_elements, + drop_list_duplicates, extract_element, sort_lists, ) @@ -361,6 +362,41 @@ def take(self, lists_indices): else: return res + def unique(self): + """ + Returns unique element for each list in the column, order for each + unique element is not guaranteed. + + Returns + ------- + ListColumn + + Examples + -------- + >>> s = cudf.Series([[1, 1, 2, None, None], None, [4, 4], []]) + >>> s + 0 [1.0, 1.0, 2.0, nan, nan] + 1 None + 2 [4.0, 4.0] + 3 [] + dtype: list + >>> s.list.unique() # Order of list element is not guaranteed + 0 [1.0, 2.0, nan] + 1 None + 2 [4.0] + 3 [] + dtype: list + """ + + if is_list_dtype(self._column.children[1].dtype): + raise NotImplementedError("Nested lists unique is not supported.") + + return self._return_or_inplace( + drop_list_duplicates( + self._column, nulls_equal=True, nans_all_equal=True + ) + ) + def sort_values( self, ascending=True, diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index 5645ce60596..9906600304b 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -1,6 +1,7 @@ # Copyright (c) 2020-2021, NVIDIA CORPORATION. import functools +import numpy as np import pandas as pd import pyarrow as pa import pytest @@ -162,6 +163,39 @@ def test_take_invalid(invalid, exception): gs.list.take(invalid) +@pytest.mark.parametrize( + ("data", "expected"), + [ + ([[1, 1, 2, 2], [], None, [3, 4, 5]], [[1, 2], [], None, [3, 4, 5]]), + ( + [[1.233, np.nan, 1.234, 3.141, np.nan, 1.234]], + [[1.233, 1.234, np.nan, 3.141]], + ), # duplicate nans + ([[1, 1, 2, 2, None, None]], [[1, 2, None]]), # duplicate nulls + ( + [[1.233, np.nan, None, 1.234, 3.141, np.nan, 1.234, None]], + [[1.233, 1.234, np.nan, None, 3.141]], + ), # duplicate nans and nulls + ([[2, None, 1, None, 2]], [[1, 2, None]]), + ([[], []], [[], []]), + ([[], None], [[], None]), + ], +) +def test_unique(data, expected): + """ + Pandas de-duplicates nans and nulls respectively in Series.unique. 
+    `expected` is set up to mimic such behavior
+    """
+    gs = cudf.Series(data, nan_as_null=False)
+
+    got = gs.list.unique()
+    expected = cudf.Series(expected, nan_as_null=False).list.sort_values()
+
+    got = got.list.sort_values()
+
+    assert_eq(expected, got)
+
+
 def key_func_builder(x, na_position):
     if x is None:
         if na_position == "first":

From c99fcef41bea8f063953b53bd68b096ec501081c Mon Sep 17 00:00:00 2001
From: Jason Lowe
Date: Wed, 31 Mar 2021 08:32:28 -0500
Subject: [PATCH 07/14] Fix type dispatch for columnar replace_nulls (#7768)

Fixes #7766

Fixes a type dispatch problem where cudf::replace_nulls was not dispatching
on the appropriate type, causing a "No specialization exists for the given
type" error to be thrown when using its columnar form with fixed-point types.

Authors:
  - Jason Lowe (@jlowe)

Approvers:
  - Mike Wilson (@hyperbolic2346)
  - Jake Hemstad (@jrhemstad)

URL: https://github.com/rapidsai/cudf/pull/7768
---
 cpp/src/replace/nulls.cu                  |   2 +-
 cpp/tests/replace/replace_nulls_tests.cpp | 148 +++++++++++++++++++++-
 2 files changed, 148 insertions(+), 2 deletions(-)

diff --git a/cpp/src/replace/nulls.cu b/cpp/src/replace/nulls.cu
index afc2bbb37bd..65750deaa57 100644
--- a/cpp/src/replace/nulls.cu
+++ b/cpp/src/replace/nulls.cu
@@ -426,7 +426,7 @@ std::unique_ptr<cudf::column> replace_nulls(cudf::column_view const& input,
   if (input.is_empty()) { return cudf::empty_like(input); }
   if (!input.has_nulls()) { return std::make_unique<cudf::column>(input); }
 
-  return cudf::type_dispatcher(
+  return cudf::type_dispatcher<dispatch_storage_type>(
     input.type(), replace_nulls_column_kernel_forwarder{}, input, replacement, stream, mr);
 }
 
diff --git a/cpp/tests/replace/replace_nulls_tests.cpp b/cpp/tests/replace/replace_nulls_tests.cpp
index bd3bf7ddd03..e969f53609e 100644
--- a/cpp/tests/replace/replace_nulls_tests.cpp
+++ b/cpp/tests/replace/replace_nulls_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright 2019, NVIDIA CORPORATION.
+ * Copyright 2019-2021, NVIDIA CORPORATION.
 *
 * Copyright 2018 BlazingDB, Inc.
 * Copyright 2018 Alexander Ocsa
@@ -23,6 +23,7 @@
 #include
 #include
+#include <cudf/detail/iterator.cuh>
 #include
 #include
 #include
@@ -437,6 +438,151 @@ TYPED_TEST(ReplaceNullsPolicyTest, FollowingFillTrailingNulls)
                              cudf::replace_policy::FOLLOWING);
 }
 
+template <typename T>
+struct ReplaceNullsFixedPointTest : public cudf::test::BaseFixture {
+};
+
+TYPED_TEST_CASE(ReplaceNullsFixedPointTest, cudf::test::FixedPointTypes);
+
+TYPED_TEST(ReplaceNullsFixedPointTest, ReplaceColumn)
+{
+  auto const scale = numeric::scale_type{0};
+  auto const sz    = std::size_t{1000};
+  auto data_begin  = cudf::detail::make_counting_transform_iterator(0, [&](auto i) {
+    return TypeParam{i, scale};
+  });
+  auto valid_begin =
+    cudf::detail::make_counting_transform_iterator(0, [&](auto i) { return i % 3 ? 1 : 0; });
+  auto replace_begin = cudf::detail::make_counting_transform_iterator(0, [&](auto i) {
+    return TypeParam{-2, scale};
+  });
+  auto expected_begin = cudf::detail::make_counting_transform_iterator(0, [&](auto i) {
+    int val = i % 3 ?
static_cast(i) : -2; + return TypeParam{val, scale}; + }); + + ReplaceNullsColumn( + cudf::test::fixed_width_column_wrapper(data_begin, data_begin + sz, valid_begin), + cudf::test::fixed_width_column_wrapper(replace_begin, replace_begin + sz), + cudf::test::fixed_width_column_wrapper(expected_begin, expected_begin + sz)); +} + +TYPED_TEST(ReplaceNullsFixedPointTest, ReplaceColumn_Empty) +{ + ReplaceNullsColumn(cudf::test::fixed_width_column_wrapper{}, + cudf::test::fixed_width_column_wrapper{}, + cudf::test::fixed_width_column_wrapper{}); +} + +TYPED_TEST(ReplaceNullsFixedPointTest, ReplaceScalar) +{ + auto const scale = numeric::scale_type{0}; + auto const sz = std::size_t{1000}; + auto data_begin = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { + return TypeParam{i, scale}; + }); + auto valid_begin = + cudf::detail::make_counting_transform_iterator(0, [&](auto i) { return i % 3 ? 1 : 0; }); + auto expected_begin = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { + int val = i % 3 ? static_cast(i) : -2; + return TypeParam{val, scale}; + }); + + cudf::fixed_point_scalar replacement{-2, scale}; + + ReplaceNullsScalar( + cudf::test::fixed_width_column_wrapper(data_begin, data_begin + sz, valid_begin), + replacement, + cudf::test::fixed_width_column_wrapper(expected_begin, expected_begin + sz)); +} + +TYPED_TEST(ReplaceNullsFixedPointTest, ReplacementHasNulls) +{ + auto const scale = numeric::scale_type{0}; + auto const sz = std::size_t{1000}; + auto data_begin = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { + return TypeParam{i, scale}; + }); + auto data_valid_begin = + cudf::detail::make_counting_transform_iterator(0, [&](auto i) { return i % 3 ? 1 : 0; }); + auto replace_begin = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { + return TypeParam{-2, scale}; + }); + auto replace_valid_begin = + cudf::detail::make_counting_transform_iterator(0, [&](auto i) { return i % 2 ? 1 : 0; }); + auto expected_begin = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { + int val = i % 3 ? static_cast(i) : -2; + return TypeParam{val, scale}; + }); + auto expected_valid_begin = + cudf::detail::make_counting_transform_iterator(0, [&](auto i) { return i % 6 ? 
1 : 0; }); + + ReplaceNullsColumn(cudf::test::fixed_width_column_wrapper( + data_begin, data_begin + sz, data_valid_begin), + cudf::test::fixed_width_column_wrapper( + replace_begin, replace_begin + sz, replace_valid_begin), + cudf::test::fixed_width_column_wrapper( + expected_begin, expected_begin + sz, expected_valid_begin)); +} + +template +struct ReplaceNullsPolicyFixedPointTest : public cudf::test::BaseFixture { +}; + +TYPED_TEST_CASE(ReplaceNullsPolicyFixedPointTest, cudf::test::FixedPointTypes); + +TYPED_TEST(ReplaceNullsPolicyFixedPointTest, PrecedingFill) +{ + using fp = TypeParam; + auto const s = numeric::scale_type{0}; + auto col = cudf::test::fixed_width_column_wrapper( + {fp{42, s}, fp{2, s}, fp{1, s}, fp{-10, s}, fp{20, s}, fp{-30, s}}, {1, 0, 0, 1, 0, 1}); + auto expect_col = cudf::test::fixed_width_column_wrapper( + {fp{42, s}, fp{42, s}, fp{42, s}, fp{-10, s}, fp{-10, s}, fp{-30, s}}, {1, 1, 1, 1, 1, 1}); + + TestReplaceNullsWithPolicy( + std::move(col), std::move(expect_col), cudf::replace_policy::PRECEDING); +} + +TYPED_TEST(ReplaceNullsPolicyFixedPointTest, FollowingFill) +{ + using fp = TypeParam; + auto const s = numeric::scale_type{0}; + auto col = cudf::test::fixed_width_column_wrapper( + {fp{42, s}, fp{2, s}, fp{1, s}, fp{-10, s}, fp{20, s}, fp{-30, s}}, {1, 0, 0, 1, 0, 1}); + auto expect_col = cudf::test::fixed_width_column_wrapper( + {fp{42, s}, fp{-10, s}, fp{-10, s}, fp{-10, s}, fp{-30, s}, fp{-30, s}}, {1, 1, 1, 1, 1, 1}); + + TestReplaceNullsWithPolicy( + std::move(col), std::move(expect_col), cudf::replace_policy::FOLLOWING); +} + +TYPED_TEST(ReplaceNullsPolicyFixedPointTest, PrecedingFillLeadingNulls) +{ + using fp = TypeParam; + auto const s = numeric::scale_type{0}; + auto col = cudf::test::fixed_width_column_wrapper( + {fp{1, s}, fp{2, s}, fp{3, s}, fp{4, s}, fp{5, s}}, {0, 0, 1, 0, 1}); + auto expect_col = cudf::test::fixed_width_column_wrapper( + {fp{1, s}, fp{2, s}, fp{3, s}, fp{3, s}, fp{5, s}}, {0, 0, 1, 1, 1}); + + TestReplaceNullsWithPolicy( + std::move(col), std::move(expect_col), cudf::replace_policy::PRECEDING); +} + +TYPED_TEST(ReplaceNullsPolicyFixedPointTest, FollowingFillTrailingNulls) +{ + using fp = TypeParam; + auto const s = numeric::scale_type{0}; + auto col = cudf::test::fixed_width_column_wrapper( + {fp{1, s}, fp{2, s}, fp{3, s}, fp{4, s}, fp{5, s}}, {1, 0, 1, 0, 0}); + auto expect_col = cudf::test::fixed_width_column_wrapper( + {fp{1, s}, fp{3, s}, fp{3, s}, fp{4, s}, fp{5, s}}, {1, 1, 1, 0, 0}); + + TestReplaceNullsWithPolicy( + std::move(col), std::move(expect_col), cudf::replace_policy::FOLLOWING); +} + struct ReplaceDictionaryTest : public cudf::test::BaseFixture { }; From be2f0c000f2455a42d299f959e9e816b381ec315 Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Wed, 31 Mar 2021 09:19:28 -0500 Subject: [PATCH 08/14] Fix Java explode outer unit tests (#7782) After #7754 the Java explode outer unit tests were not updated to expect the nulls. 
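For context, a minimal C++ sketch of the behavior the updated tests now assert. It assumes the `cudf::explode_outer_position` entry point in `cudf/lists/explode.hpp` and the test wrapper patterns used elsewhere in this repo; the function name and data are illustrative, not part of this patch:

```cpp
#include <cudf/detail/iterator.cuh>
#include <cudf/lists/explode.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf_test/column_wrapper.hpp>

// Three list rows: [1, 2], [] (empty), and null.
void explode_outer_position_sketch()
{
  using LCW = cudf::test::lists_column_wrapper<int32_t>;
  auto valids =
    cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 2; });
  LCW lists{{LCW{1, 2}, LCW{}, LCW{0}}, valids};  // third row is masked out (null)

  cudf::table_view input({lists});

  // explode_outer keeps the empty and null rows, so the resulting position
  // column reads {0, 1, null, null}: the updated tests expect those trailing
  // null positions instead of zeros.
  auto exploded = cudf::explode_outer_position(input, 0);
}
```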
Authors: - Jason Lowe (@jlowe) Approvers: - Robert (Bobby) Evans (@revans2) URL: https://github.com/rapidsai/cudf/pull/7782 --- java/src/test/java/ai/rapids/cudf/TableTest.java | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 9c67966c16c..8b7ece5d60b 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -5067,7 +5067,7 @@ private Table[] buildExplodeTestTableWithPrimitiveTypes(boolean pos, boolean out .build()) { Table.TestBuilder expectedBuilder = new Table.TestBuilder(); if (pos) { - Integer[] posData = outer ? new Integer[]{0, 1, 2, 0, 1, 0, 0, 0} : new Integer[]{0, 1, 2, 0, 1, 0}; + Integer[] posData = outer ? new Integer[]{0, 1, 2, 0, 1, 0, null, null} : new Integer[]{0, 1, 2, 0, 1, 0}; expectedBuilder.column(posData); } List expectedData = new ArrayList(){{ @@ -5109,10 +5109,11 @@ private Table[] buildExplodeTestTableWithNestedTypes(boolean pos, boolean outer) .build()) { Table.TestBuilder expectedBuilder = new Table.TestBuilder(); if (pos) { - if (!outer) + if (outer) { + expectedBuilder.column(0, 1, 2, 0, 1, 0, null, null); + } else { expectedBuilder.column(0, 1, 2, 0, 1, 0, 0); - else - expectedBuilder.column(0, 1, 2, 0, 1, 0, 0, 0); + } } List expectedData = new ArrayList(){{ if (!outer) { From b9371122eacf8c1376f0185df409e906d7b3c4e5 Mon Sep 17 00:00:00 2001 From: nvdbaranec <56695930+nvdbaranec@users.noreply.github.com> Date: Wed, 31 Mar 2021 09:57:11 -0500 Subject: [PATCH 09/14] get_json_object() implementation (#7286) An implementation of get_json_object(). Reference: https://cwiki.apache.org/confluence/display/Hive/LanguageManual+UDF#LanguageManualUDF-get_json_object The fundamental functionality here is running a JSONPath query on each row in an input column of json strings. JSONPath spec: https://tools.ietf.org/id/draft-goessner-dispatch-jsonpath-00.html For review purposes, the key entry point is `parse_json_path()`. Each thread of the kernel processes 1 row via this function. The behavior is recursive in nature but we maintain our own context stack to do it in loop fashion. `parse_json_path` is just the high level controlling logic, with most of the heavy lifting happening in the `json_state` parser class. Though the "heavy lifting" is pretty much just traditional string parsing code. The path to optimization here (I'll open a separate cudf issue for this) is - Change `parse_json_path` to work on a warp basis. So each row in the column would be processed by one warp. - Make the `json_state` parser class thread/warp aware (the class would just store its `tid` and operate accordingly). I think this is reasonably straightforward to do as most of the cuIO decoding kernels behave like this. 
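As a quick usage sketch, mirroring the call pattern of the benchmark added in this PR (the json document and path below are illustrative):

```cpp
#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/json.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf_test/column_wrapper.hpp>

// Apply one JSONPath query to every row of a strings column of json documents.
void get_json_object_sketch()
{
  cudf::test::strings_column_wrapper input{
    R"({"store": {"book": [{"title": "Sayings of the Century"},
                           {"title": "Sword of Honour"}]}})"};
  cudf::strings_column_view scv(input);

  // The wildcard visits every book; each row's result is a json array of the
  // matching titles.
  cudf::string_scalar json_path("$.store.book[*].title");
  auto result = cudf::strings::get_json_object(scv, json_path);
}
```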
Authors: - @nvdbaranec - Raza Jafri (@razajafri) Approvers: - Ray Douglass (@raydouglass) - Jason Lowe (@jlowe) - Jake Hemstad (@jrhemstad) - David (@davidwendt) URL: https://github.com/rapidsai/cudf/pull/7286 --- conda/recipes/libcudf/meta.yaml | 2 + cpp/CMakeLists.txt | 1 + cpp/benchmarks/CMakeLists.txt | 5 + cpp/benchmarks/string/json_benchmark.cpp | 140 +++ cpp/include/cudf/strings/detail/json.hpp | 40 + cpp/include/cudf/strings/json.hpp | 50 + cpp/include/doxygen_groups.h | 1 + cpp/src/io/csv/csv_gpu.cu | 6 +- cpp/src/io/json/json_gpu.cu | 4 +- cpp/src/io/utilities/parsing_utils.cuh | 144 +-- cpp/src/strings/json/json_path.cu | 952 ++++++++++++++++++ cpp/tests/CMakeLists.txt | 1 + cpp/tests/strings/json_tests.cpp | 761 ++++++++++++++ cpp/tests/utilities/column_utilities.cu | 2 +- .../main/java/ai/rapids/cudf/ColumnView.java | 19 + java/src/main/native/src/ColumnViewJni.cpp | 23 + .../java/ai/rapids/cudf/ColumnVectorTest.java | 44 + 17 files changed, 2117 insertions(+), 78 deletions(-) create mode 100644 cpp/benchmarks/string/json_benchmark.cpp create mode 100644 cpp/include/cudf/strings/detail/json.hpp create mode 100644 cpp/include/cudf/strings/json.hpp create mode 100644 cpp/src/strings/json/json_path.cu create mode 100644 cpp/tests/strings/json_tests.cpp diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 39587b4bd05..75955428eab 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -178,12 +178,14 @@ test: - test -f $PREFIX/include/cudf/strings/detail/converters.hpp - test -f $PREFIX/include/cudf/strings/detail/copying.hpp - test -f $PREFIX/include/cudf/strings/detail/fill.hpp + - test -f $PREFIX/include/cudf/strings/detail/json.hpp - test -f $PREFIX/include/cudf/strings/detail/replace.hpp - test -f $PREFIX/include/cudf/strings/detail/utilities.hpp - test -f $PREFIX/include/cudf/strings/extract.hpp - test -f $PREFIX/include/cudf/strings/findall.hpp - test -f $PREFIX/include/cudf/strings/find.hpp - test -f $PREFIX/include/cudf/strings/find_multiple.hpp + - test -f $PREFIX/include/cudf/strings/json.hpp - test -f $PREFIX/include/cudf/strings/padding.hpp - test -f $PREFIX/include/cudf/strings/replace.hpp - test -f $PREFIX/include/cudf/strings/replace_re.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 5cd82e52180..61cb13d3445 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -346,6 +346,7 @@ add_library(cudf src/strings/find.cu src/strings/find_multiple.cu src/strings/padding.cu + src/strings/json/json_path.cu src/strings/regex/regcomp.cpp src/strings/regex/regexec.cu src/strings/replace/backref_re.cu diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 5aa7e0132f8..11af408f1c5 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -202,3 +202,8 @@ ConfigureBench(STRINGS_BENCH string/substring_benchmark.cpp string/translate_benchmark.cpp string/url_decode_benchmark.cpp) + +################################################################################################### +# - json benchmark ------------------------------------------------------------------- +ConfigureBench(JSON_BENCH + string/json_benchmark.cpp) diff --git a/cpp/benchmarks/string/json_benchmark.cpp b/cpp/benchmarks/string/json_benchmark.cpp new file mode 100644 index 00000000000..6fb6a07a8d0 --- /dev/null +++ b/cpp/benchmarks/string/json_benchmark.cpp @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include +#include + +#include +#include + +class JsonPath : public cudf::benchmark { +}; + +float frand() { return static_cast(rand()) / static_cast(RAND_MAX); } + +int rand_range(int min, int max) { return min + static_cast(frand() * (max - min)); } + +std::vector Books{ + "{\n\"category\": \"reference\",\n\"author\": \"Nigel Rees\",\n\"title\": \"Sayings of the " + "Century\",\n\"price\": 8.95\n}", + "{\n\"category\": \"fiction\",\n\"author\": \"Evelyn Waugh\",\n\"title\": \"Sword of " + "Honour\",\n\"price\": 12.99\n}", + "{\n\"category\": \"fiction\",\n\"author\": \"Herman Melville\",\n\"title\": \"Moby " + "Dick\",\n\"isbn\": \"0-553-21311-3\",\n\"price\": 8.99\n}", + "{\n\"category\": \"fiction\",\n\"author\": \"J. R. R. Tolkien\",\n\"title\": \"The Lord of the " + "Rings\",\n\"isbn\": \"0-395-19395-8\",\n\"price\": 22.99\n}"}; +constexpr int Approx_book_size = 110; +std::vector Bicycles{ + "{\"color\": \"red\", \"price\": 9.95}", + "{\"color\": \"green\", \"price\": 29.95}", + "{\"color\": \"blue\", \"price\": 399.95}", + "{\"color\": \"yellow\", \"price\": 99.95}", + "{\"color\": \"mauve\", \"price\": 199.95}", +}; +constexpr int Approx_bicycle_size = 33; +std::string Misc{"\n\"expensive\": 10\n"}; +std::string generate_field(std::vector const& values, int num_values) +{ + std::string res; + for (int idx = 0; idx < num_values; idx++) { + if (idx > 0) { res += std::string(",\n"); } + int vindex = std::min(static_cast(floor(frand() * values.size())), + static_cast(values.size() - 1)); + res += values[vindex]; + } + return res; +} + +std::string build_row(int desired_bytes) +{ + // always have at least 2 books and 2 bikes + int num_books = 2; + int num_bicycles = 2; + int remaining_bytes = + desired_bytes - ((num_books * Approx_book_size) + (num_bicycles * Approx_bicycle_size)); + + // divide up the remainder between books and bikes + float book_pct = frand(); + float bicycle_pct = 1.0f - book_pct; + num_books += (remaining_bytes * book_pct) / Approx_book_size; + num_bicycles += (remaining_bytes * bicycle_pct) / Approx_bicycle_size; + + std::string books = "\"book\": [\n" + generate_field(Books, num_books) + "]\n"; + std::string bicycles = "\"bicycle\": [\n" + generate_field(Bicycles, num_bicycles) + "]\n"; + + std::string store = "\"store\": {\n"; + if (frand() <= 0.5f) { + store += books + std::string(",\n") + bicycles; + } else { + store += bicycles + std::string(",\n") + books; + } + store += std::string("}\n"); + + std::string row = std::string("{\n"); + if (frand() <= 0.5f) { + row += store + std::string(",\n") + Misc; + } else { + row += Misc + std::string(",\n") + store; + } + row += std::string("}\n"); + return row; +} + +template +static void BM_case(benchmark::State& state, QueryArg&&... 
query_arg) +{ + srand(5236); + auto iter = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + [desired_bytes = state.range(1)](int index) { return build_row(desired_bytes); }); + int num_rows = state.range(0); + cudf::test::strings_column_wrapper input(iter, iter + num_rows); + cudf::strings_column_view scv(input); + size_t num_chars = scv.chars().size(); + + std::string json_path(query_arg...); + + for (auto _ : state) { + cuda_event_timer raii(state, true, 0); + auto result = cudf::strings::get_json_object(scv, json_path); + cudaStreamSynchronize(0); + } + + // this isn't strictly 100% accurate. a given query isn't necessarily + // going to visit every single incoming character. but in spirit it does. + state.SetBytesProcessed(state.iterations() * num_chars); +} + +#define JSON_BENCHMARK_DEFINE(name, query) \ + BENCHMARK_CAPTURE(BM_case, name, query) \ + ->ArgsProduct({{100, 1000, 100000, 400000}, {300, 600, 4096}}) \ + ->UseManualTime() \ + ->Unit(benchmark::kMillisecond); + +JSON_BENCHMARK_DEFINE(query0, "$"); +JSON_BENCHMARK_DEFINE(query1, "$.store"); +JSON_BENCHMARK_DEFINE(query2, "$.store.book"); +JSON_BENCHMARK_DEFINE(query3, "$.store.*"); +JSON_BENCHMARK_DEFINE(query4, "$.store.book[*]"); +JSON_BENCHMARK_DEFINE(query5, "$.store.book[*].category"); +JSON_BENCHMARK_DEFINE(query6, "$.store['bicycle']"); +JSON_BENCHMARK_DEFINE(query7, "$.store.book[*]['isbn']"); +JSON_BENCHMARK_DEFINE(query8, "$.store.bicycle[1]"); diff --git a/cpp/include/cudf/strings/detail/json.hpp b/cpp/include/cudf/strings/detail/json.hpp new file mode 100644 index 00000000000..e6a0b49f102 --- /dev/null +++ b/cpp/include/cudf/strings/detail/json.hpp @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include + +namespace cudf { +namespace strings { +namespace detail { + +/** + * @copydoc cudf::strings::get_json_object + * + * @param stream CUDA stream used for device memory operations and kernel launches + */ +std::unique_ptr get_json_object( + cudf::strings_column_view const& col, + cudf::string_scalar const& json_path, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +} // namespace detail +} // namespace strings +} // namespace cudf diff --git a/cpp/include/cudf/strings/json.hpp b/cpp/include/cudf/strings/json.hpp new file mode 100644 index 00000000000..b39e4a2027c --- /dev/null +++ b/cpp/include/cudf/strings/json.hpp @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +namespace cudf { +namespace strings { + +/** + * @addtogroup strings_json + * @{ + * @file + */ + +/** + * @brief Apply a JSONPath string to all rows in an input strings column. + * + * Applies a JSONPath string to an incoming strings column where each row in the column + * is a valid json string. The output is returned by row as a strings column. + * + * https://tools.ietf.org/id/draft-goessner-dispatch-jsonpath-00.html + * Implements only the operators: $ . [] * + * + * @param col The input strings column. Each row must contain a valid json string + * @param json_path The JSONPath string to be applied to each row + * @param mr Resource for allocating device memory. + * @return New strings column containing the retrieved json object strings + */ +std::unique_ptr get_json_object( + cudf::strings_column_view const& col, + cudf::string_scalar const& json_path, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** @} */ // end of doxygen group +} // namespace strings +} // namespace cudf diff --git a/cpp/include/doxygen_groups.h b/cpp/include/doxygen_groups.h index 65dd5c73475..f78ff98d49d 100644 --- a/cpp/include/doxygen_groups.h +++ b/cpp/include/doxygen_groups.h @@ -127,6 +127,7 @@ * @defgroup strings_modify Modifying * @defgroup strings_replace Replacing * @defgroup strings_split Splitting + * @defgroup strings_json JSON * @} * @defgroup dictionary_apis Dictionary * @{ diff --git a/cpp/src/io/csv/csv_gpu.cu b/cpp/src/io/csv/csv_gpu.cu index 86e5f1fdcae..44acc7fc55f 100644 --- a/cpp/src/io/csv/csv_gpu.cu +++ b/cpp/src/io/csv/csv_gpu.cu @@ -196,7 +196,7 @@ __global__ void __launch_bounds__(csvparse_block_dim) } else if (serialized_trie_contains(opts.trie_true, {field_start, field_len}) || serialized_trie_contains(opts.trie_false, {field_start, field_len})) { atomicAdd(&d_columnData[actual_col].bool_count, 1); - } else if (cudf::io::gpu::is_infinity(field_start, next_delimiter)) { + } else if (cudf::io::is_infinity(field_start, next_delimiter)) { atomicAdd(&d_columnData[actual_col].float_count, 1); } else { long countNumber = 0; @@ -277,7 +277,7 @@ __inline__ __device__ T decode_value(char const *begin, char const *end, parse_options_view const &opts) { - return cudf::io::gpu::parse_numeric(begin, end, opts); + return cudf::io::parse_numeric(begin, end, opts); } template @@ -285,7 +285,7 @@ __inline__ __device__ T decode_value(char const *begin, char const *end, parse_options_view const &opts) { - return cudf::io::gpu::parse_numeric(begin, end, opts); + return cudf::io::parse_numeric(begin, end, opts); } template <> diff --git a/cpp/src/io/json/json_gpu.cu b/cpp/src/io/json/json_gpu.cu index 5efb64fd4d5..75910ae6b5b 100644 --- a/cpp/src/io/json/json_gpu.cu +++ b/cpp/src/io/json/json_gpu.cu @@ -114,7 +114,7 @@ __inline__ __device__ T decode_value(const char *begin, uint64_t end, parse_options_view const &opts) { - return cudf::io::gpu::parse_numeric(begin, end, opts); + return cudf::io::parse_numeric(begin, end, opts); } /** @@ -131,7 +131,7 @@ __inline__ __device__ T decode_value(const char *begin, const 
char *end, parse_options_view const &opts) { - return cudf::io::gpu::parse_numeric(begin, end, opts); + return cudf::io::parse_numeric(begin, end, opts); } /** diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index 584d2c9a74a..b7719cba580 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -20,6 +20,8 @@ #include #include +#include + #include using cudf::device_span; @@ -82,67 +84,6 @@ struct parse_options { } }; -namespace gpu { -/** - * @brief CUDA kernel iterates over the data until the end of the current field - * - * Also iterates over (one or more) delimiter characters after the field. - * Function applies to formats with field delimiters and line terminators. - * - * @param begin Pointer to the first element of the string - * @param end Pointer to the first element after the string - * @param opts A set of parsing options - * @param escape_char A boolean value to signify whether to consider `\` as escape character or - * just a character. - * - * @return Pointer to the last character in the field, including the - * delimiter(s) following the field data - */ -__device__ __inline__ char const* seek_field_end(char const* begin, - char const* end, - parse_options_view const& opts, - bool escape_char = false) -{ - bool quotation = false; - auto current = begin; - bool escape_next = false; - while (true) { - // Use simple logic to ignore control chars between any quote seq - // Handles nominal cases including doublequotes within quotes, but - // may not output exact failures as PANDAS for malformed fields. - // Check for instances such as "a2\"bc" and "\\" if `escape_char` is true. - - if (*current == opts.quotechar and not escape_next) { - quotation = !quotation; - } else if (!quotation) { - if (*current == opts.delimiter) { - while (opts.multi_delimiter && current < end && *(current + 1) == opts.delimiter) { - ++current; - } - break; - } else if (*current == opts.terminator) { - break; - } else if (*current == '\r' && (current + 1 < end && *(current + 1) == '\n')) { - --end; - break; - } - } - - if (escape_char == true) { - // If a escape character is encountered, escape next character in next loop. - if (escape_next == false and *current == '\\') { - escape_next = true; - } else { - escape_next = false; - } - } - - if (current >= end) break; - current++; - } - return current; -} - /** * @brief Returns the numeric value of an ASCII/UTF-8 character. Specialization * for integral types. Handles hexadecimal digits, both uppercase and lowercase. @@ -155,7 +96,7 @@ __device__ __inline__ char const* seek_field_end(char const* begin, * @return uint8_t Numeric value of the character, or `0` */ template ::value>* = nullptr> -__device__ __forceinline__ uint8_t decode_digit(char c, bool* valid_flag) +constexpr uint8_t decode_digit(char c, bool* valid_flag) { if (c >= '0' && c <= '9') return c - '0'; if (c >= 'a' && c <= 'f') return c - 'a' + 10; @@ -176,7 +117,7 @@ __device__ __forceinline__ uint8_t decode_digit(char c, bool* valid_flag) * @return uint8_t Numeric value of the character, or `0` */ template ::value>* = nullptr> -__device__ __forceinline__ uint8_t decode_digit(char c, bool* valid_flag) +constexpr uint8_t decode_digit(char c, bool* valid_flag) { if (c >= '0' && c <= '9') return c - '0'; @@ -185,10 +126,7 @@ __device__ __forceinline__ uint8_t decode_digit(char c, bool* valid_flag) } // Converts character to lowercase. 
-__inline__ __device__ char to_lower(char const c) -{ - return c >= 'A' && c <= 'Z' ? c + ('a' - 'A') : c; -} +constexpr char to_lower(char const c) { return c >= 'A' && c <= 'Z' ? c + ('a' - 'A') : c; } /** * @brief Checks if string is infinity, case insensitive with/without sign @@ -199,7 +137,7 @@ __inline__ __device__ char to_lower(char const c) * @param end Pointer to the first element after the string * @return true if string is valid infinity, else false. */ -__inline__ __device__ bool is_infinity(char const* begin, char const* end) +constexpr bool is_infinity(char const* begin, char const* end) { if (*begin == '-' || *begin == '+') begin++; char const* cinf = "infinity"; @@ -223,9 +161,10 @@ __inline__ __device__ bool is_infinity(char const* begin, char const* end) * @return The parsed and converted value */ template -__inline__ __device__ T parse_numeric(const char* begin, - const char* end, - parse_options_view const& opts) +constexpr T parse_numeric(const char* begin, + const char* end, + parse_options_view const& opts, + T error_result = std::numeric_limits::quiet_NaN()) { T value{}; bool all_digits_valid = true; @@ -281,11 +220,72 @@ __inline__ __device__ T parse_numeric(const char* begin, if (exponent != 0) { value *= exp10(double(exponent * exponent_sign)); } } } - if (!all_digits_valid) { return std::numeric_limits::quiet_NaN(); } + if (!all_digits_valid) { return error_result; } return value * sign; } +namespace gpu { +/** + * @brief CUDA kernel iterates over the data until the end of the current field + * + * Also iterates over (one or more) delimiter characters after the field. + * Function applies to formats with field delimiters and line terminators. + * + * @param begin Pointer to the first element of the string + * @param end Pointer to the first element after the string + * @param opts A set of parsing options + * @param escape_char A boolean value to signify whether to consider `\` as escape character or + * just a character. + * + * @return Pointer to the last character in the field, including the + * delimiter(s) following the field data + */ +__device__ __inline__ char const* seek_field_end(char const* begin, + char const* end, + parse_options_view const& opts, + bool escape_char = false) +{ + bool quotation = false; + auto current = begin; + bool escape_next = false; + while (true) { + // Use simple logic to ignore control chars between any quote seq + // Handles nominal cases including doublequotes within quotes, but + // may not output exact failures as PANDAS for malformed fields. + // Check for instances such as "a2\"bc" and "\\" if `escape_char` is true. + + if (*current == opts.quotechar and not escape_next) { + quotation = !quotation; + } else if (!quotation) { + if (*current == opts.delimiter) { + while (opts.multi_delimiter && current < end && *(current + 1) == opts.delimiter) { + ++current; + } + break; + } else if (*current == opts.terminator) { + break; + } else if (*current == '\r' && (current + 1 < end && *(current + 1) == '\n')) { + --end; + break; + } + } + + if (escape_char == true) { + // If a escape character is encountered, escape next character in next loop. 
+ if (escape_next == false and *current == '\\') { + escape_next = true; + } else { + escape_next = false; + } + } + + if (current >= end) break; + current++; + } + return current; +} + /** * @brief Lexicographically compare digits in input against string * representing an integer diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu new file mode 100644 index 00000000000..cd8aae12070 --- /dev/null +++ b/cpp/src/strings/json/json_path.cu @@ -0,0 +1,952 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include + +namespace cudf { +namespace strings { +namespace detail { + +namespace { + +// debug accessibility + +// change to "\n" and 1 to make output more readable +#define DEBUG_NEWLINE +constexpr int DEBUG_NEWLINE_LEN = 0; + +/** + * @brief Result of calling a parse function. + * + * The primary use of this is to distinguish between "success" and + * "success but no data" return cases. For example, if you are reading the + * values of an array you might call a parse function in a while loop. You + * would want to continue doing this until you either encounter an error (parse_result::ERROR) + * or you get nothing back (parse_result::EMPTY) + */ +enum class parse_result { + ERROR, // failure + SUCCESS, // success + EMPTY, // success, but no data +}; + +/** + * @brief Base parser class inherited by the (device-side) json_state class and + * (host-side) path_state class. + * + * Contains a number of useful utility functions common to parsing json and + * JSONPath strings. + */ +class parser { + protected: + CUDA_HOST_DEVICE_CALLABLE parser() : input(nullptr), input_len(0), pos(nullptr) {} + CUDA_HOST_DEVICE_CALLABLE parser(const char* _input, int64_t _input_len) + : input(_input), input_len(_input_len), pos(_input) + { + parse_whitespace(); + } + + CUDA_HOST_DEVICE_CALLABLE parser(parser const& p) + : input(p.input), input_len(p.input_len), pos(p.pos) + { + } + + CUDA_HOST_DEVICE_CALLABLE bool eof(const char* p) { return p - input >= input_len; } + CUDA_HOST_DEVICE_CALLABLE bool eof() { return eof(pos); } + + CUDA_HOST_DEVICE_CALLABLE bool parse_whitespace() + { + while (!eof()) { + if (is_whitespace(*pos)) { + pos++; + } else { + return true; + } + } + return false; + } + + CUDA_HOST_DEVICE_CALLABLE parse_result parse_string(string_view& str, + bool can_be_empty, + char quote) + { + str = string_view(nullptr, 0); + + if (parse_whitespace() && *pos == quote) { + const char* start = ++pos; + while (!eof()) { + if (*pos == quote) { + str = string_view(start, pos - start); + pos++; + return parse_result::SUCCESS; + } + pos++; + } + } + + return can_be_empty ? 
parse_result::EMPTY : parse_result::ERROR; + } + + // a name means: + // - a string followed by a : + // - no string + CUDA_HOST_DEVICE_CALLABLE parse_result parse_name(string_view& name, + bool can_be_empty, + char quote) + { + if (parse_string(name, can_be_empty, quote) == parse_result::ERROR) { + return parse_result::ERROR; + } + + // if we got a real string, the next char must be a : + if (name.size_bytes() > 0) { + if (!parse_whitespace()) { return parse_result::ERROR; } + if (*pos == ':') { + pos++; + return parse_result::SUCCESS; + } + } + return parse_result::EMPTY; + } + + // numbers, true, false, null. + // this function is not particularly strong. badly formed values will get + // consumed without throwing any errors + CUDA_HOST_DEVICE_CALLABLE parse_result parse_non_string_value(string_view& val) + { + if (!parse_whitespace()) { return parse_result::ERROR; } + + // parse to the end of the value + char const* start = pos; + char const* end = start; + while (!eof(end)) { + char const c = *end; + if (c == ',' || c == '}' || c == ']' || is_whitespace(c)) { break; } + + // illegal chars + if (c == '[' || c == '{' || c == ':' || c == '\"') { return parse_result::ERROR; } + end++; + } + pos = end; + + val = string_view(start, end - start); + + return parse_result::SUCCESS; + } + + protected: + char const* input; + int64_t input_len; + char const* pos; + + private: + CUDA_HOST_DEVICE_CALLABLE bool is_whitespace(char c) { return c <= ' '; } +}; + +/** + * @brief Output buffer object. Used during the preprocess/size-computation step + * and the actual output step. + * + * There is an important distinction between two cases: + * + * - producing no output at all. that is, the query matched nothing in the input. + * - producing empty output. the query matched something in the input, but the + * value of the result is an empty string. + * + * The `has_output` field is the flag which indicates whether or not the output + * from the query should be considered empty or null. + * + */ +struct json_output { + size_t output_max_len; + char* output; + thrust::optional output_len; + + __device__ void add_output(const char* str, size_t len) + { + if (output != nullptr) { memcpy(output + output_len.value_or(0), str, len); } + output_len = output_len.value_or(0) + len; + } + + __device__ void add_output(string_view const& str) { add_output(str.data(), str.size_bytes()); } +}; + +enum json_element_type { NONE, OBJECT, ARRAY, VALUE }; + +/** + * @brief Parsing class that holds the current state of the json to be parse and provides + * functions for navigating through it. + */ +class json_state : private parser { + public: + __device__ json_state() + : parser(), + cur_el_start(nullptr), + cur_el_type(json_element_type::NONE), + parent_el_type(json_element_type::NONE) + { + } + __device__ json_state(const char* _input, int64_t _input_len) + : parser(_input, _input_len), + cur_el_start(nullptr), + cur_el_type(json_element_type::NONE), + parent_el_type(json_element_type::NONE) + { + } + + __device__ json_state(json_state const& j) + : parser(j), + cur_el_start(j.cur_el_start), + cur_el_type(j.cur_el_type), + parent_el_type(j.parent_el_type) + { + } + + // retrieve the entire current element into the output + __device__ parse_result extract_element(json_output* output, bool list_element) + { + char const* start = cur_el_start; + char const* end = start; + + // if we're a value type, do a simple value parse. 
+ if (cur_el_type == VALUE) { + pos = cur_el_start; + if (parse_value() != parse_result::SUCCESS) { return parse_result::ERROR; } + end = pos; + + // SPARK-specific behavior. if this is a non-list-element wrapped in quotes, + // strip them. we may need to make this behavior configurable in some way + // later on. + if (!list_element && *start == '\"' && *(end - 1) == '\"') { + start++; + end--; + } + } + // otherwise, march through everything inside + else { + int obj_count = 0; + int arr_count = 0; + + while (!eof(end)) { + // could do some additional checks here. we know our current + // element type, so we could be more strict on what kinds of + // characters we expect to see. + switch (*end++) { + case '{': obj_count++; break; + case '}': obj_count--; break; + case '[': arr_count++; break; + case ']': arr_count--; break; + default: break; + } + if (obj_count == 0 && arr_count == 0) { break; } + } + if (obj_count > 0 || arr_count > 0) { return parse_result::ERROR; } + pos = end; + } + + // parse trailing , + if (parse_whitespace()) { + if (*pos == ',') { pos++; } + } + + if (output != nullptr) { output->add_output({start, static_cast(end - start)}); } + return parse_result::SUCCESS; + } + + // skip the next element + __device__ parse_result skip_element() { return extract_element(nullptr, false); } + + // advance to the next element + __device__ parse_result next_element() { return next_element_internal(false); } + + // advance inside the current element + __device__ parse_result child_element(json_element_type expected_type) + { + if (expected_type != NONE && cur_el_type != expected_type) { return parse_result::ERROR; } + + // if we succeed, record our parent element type. + auto const prev_el_type = cur_el_type; + auto const result = next_element_internal(true); + if (result == parse_result::SUCCESS) { parent_el_type = prev_el_type; } + return result; + } + + // return the next element that matches the specified name. + __device__ parse_result next_matching_element(string_view const& name, bool inclusive) + { + // if we're not including the current element, skip it + if (!inclusive) { + parse_result result = next_element_internal(false); + if (result != parse_result::SUCCESS) { return result; } + } + // loop until we find a match or there's nothing left + do { + // wildcard matches anything + if (name.size_bytes() == 1 && name.data()[0] == '*') { + return parse_result::SUCCESS; + } else if (cur_el_name == name) { + return parse_result::SUCCESS; + } + + // next + parse_result result = next_element_internal(false); + if (result != parse_result::SUCCESS) { return result; } + } while (1); + + return parse_result::ERROR; + } + + private: + // parse a value - either a string or a number/null/bool + __device__ parse_result parse_value() + { + if (!parse_whitespace()) { return parse_result::ERROR; } + + // string or number? + string_view unused; + return *pos == '\"' ? parse_string(unused, false, '\"') : parse_non_string_value(unused); + } + + __device__ parse_result next_element_internal(bool child) + { + // if we're not getting a child element, skip the current element. + // this will leave pos as the first character -after- the close of + // the current element + if (!child && cur_el_start != nullptr) { + if (skip_element() == parse_result::ERROR) { return parse_result::ERROR; } + cur_el_start = nullptr; + } + // otherwise pos will be at the first character within the current element + + // can only get the child of an object or array. 
+ // this could theoretically be handled as an error, but the evaluators I've found + // seem to treat this as "it's nothing" + if (child && (cur_el_type == VALUE || cur_el_type == NONE)) { return parse_result::EMPTY; } + + // what's next + if (!parse_whitespace()) { return parse_result::EMPTY; } + // if we're closing off a parent element, we're done + char const c = *pos; + if (c == ']' || c == '}') { return parse_result::EMPTY; } + + // if we're not accessing elements of an array, check for name. + bool const array_access = + (cur_el_type == ARRAY && child) || (parent_el_type == ARRAY && !child); + if (!array_access && parse_name(cur_el_name, true, '\"') == parse_result::ERROR) { + return parse_result::ERROR; + } + + // element type + if (!parse_whitespace()) { return parse_result::EMPTY; } + switch (*pos++) { + case '[': cur_el_type = ARRAY; break; + case '{': cur_el_type = OBJECT; break; + + case ',': + case ':': + case '\'': return parse_result::ERROR; + + // value type + default: cur_el_type = VALUE; break; + } + + // the start of the current element is always at the value, not the name + cur_el_start = pos - 1; + return parse_result::SUCCESS; + } + + const char* cur_el_start; // pointer to the first character of the -value- of the current + // element - not the name + string_view cur_el_name; // name of the current element (if applicable) + json_element_type cur_el_type; // type of the current element + json_element_type parent_el_type; // parent element type +}; + +enum class path_operator_type { ROOT, CHILD, CHILD_WILDCARD, CHILD_INDEX, ERROR, END }; + +/** + * @brief A "command" operator used to query a json string. A full query is + * an array of these operators applied to the incoming json string, + */ +struct path_operator { + CUDA_HOST_DEVICE_CALLABLE path_operator() + : type(path_operator_type::ERROR), index(-1), expected_type{NONE} + { + } + CUDA_HOST_DEVICE_CALLABLE path_operator(path_operator_type _type, + json_element_type _expected_type = NONE) + : type(_type), index(-1), expected_type{_expected_type} + { + } + + path_operator_type type; // operator type + // the expected element type we're applying this operation to. + // for example: + // - you cannot retrieve a subscripted field (eg [5]) from an object. + // - you cannot retrieve a field by name (eg .book) from an array. + // - you -can- use .* for both arrays and objects + // a value of NONE imples any type accepted + json_element_type expected_type; // the expected type of the element we're working with + string_view name; // name to match against (if applicable) + int index; // index for subscript operator +}; + +/** + * @brief Parsing class that holds the current state of the JSONPath string to be parsed + * and provides functions for navigating through it. This is only called on the host + * during the preprocess step which builds a command buffer that the gpu uses. 
+ */ +class path_state : private parser { + public: + path_state(const char* _path, size_t _path_len) : parser(_path, _path_len) {} + + // get the next operator in the JSONPath string + path_operator get_next_operator() + { + if (eof()) { return {path_operator_type::END}; } + + switch (*pos++) { + case '$': return {path_operator_type::ROOT}; + + case '.': { + path_operator op; + string_view term{".[", 2}; + if (parse_path_name(op.name, term)) { + // this is another potential use case for __SPARK_BEHAVIORS / configurability + // Spark currently only handles the wildcard operator inside [*], it does + // not handle .* + if (op.name.size_bytes() == 1 && op.name.data()[0] == '*') { + op.type = path_operator_type::CHILD_WILDCARD; + op.expected_type = NONE; + } else { + op.type = path_operator_type::CHILD; + op.expected_type = OBJECT; + } + return op; + } + } break; + + // 3 ways this can be used + // indices: [0] + // name: ['book'] + // wildcard: [*] + case '[': { + path_operator op; + string_view term{"]", 1}; + bool const is_string = *pos == '\'' ? true : false; + if (parse_path_name(op.name, term)) { + pos++; + if (op.name.size_bytes() == 1 && op.name.data()[0] == '*') { + op.type = path_operator_type::CHILD_WILDCARD; + op.expected_type = NONE; + } else { + if (is_string) { + op.type = path_operator_type::CHILD; + op.expected_type = OBJECT; + } else { + op.type = path_operator_type::CHILD_INDEX; + op.index = cudf::io::parse_numeric( + op.name.data(), op.name.data() + op.name.size_bytes(), json_opts, -1); + CUDF_EXPECTS(op.index >= 0, "Invalid numeric index specified in JSONPath"); + op.expected_type = ARRAY; + } + } + return op; + } + } break; + + // wildcard operator + case '*': { + pos++; + return path_operator{path_operator_type::CHILD_WILDCARD}; + } break; + + default: CUDF_FAIL("Unrecognized JSONPath operator"); break; + } + return {path_operator_type::ERROR}; + } + + private: + cudf::io::parse_options_view json_opts{',', '\n', '\"', '.'}; + + bool parse_path_name(string_view& name, string_view const& terminators) + { + switch (*pos) { + case '*': + name = string_view(pos, 1); + pos++; + break; + + case '\'': + if (parse_string(name, false, '\'') != parse_result::SUCCESS) { return false; } + break; + + default: { + size_t const chars_left = input_len - (pos - input); + char const* end = std::find_first_of( + pos, pos + chars_left, terminators.data(), terminators.data() + terminators.size_bytes()); + if (end) { + name = string_view(pos, end - pos); + pos = end; + } else { + name = string_view(pos, chars_left); + pos = input + input_len; + } + break; + } + } + + // an empty name is not valid + CUDF_EXPECTS(name.size_bytes() > 0, "Invalid empty name in JSONPath query string"); + + return true; + } +}; + +/** + * @brief Preprocess the incoming JSONPath string on the host to generate a + * command buffer for use by the GPU. + * + * @param json_path The incoming json path + * @param stream Cuda stream to perform any gpu actions on + * @returns A pair containing the command buffer, and maximum stack depth required. 
+ */
+std::pair<thrust::optional<rmm::device_uvector<path_operator>>, int> build_command_buffer(
+  cudf::string_scalar const& json_path, rmm::cuda_stream_view stream)
+{
+  std::string h_json_path = json_path.to_string(stream);
+  path_state p_state(h_json_path.data(), static_cast<size_t>(h_json_path.size()));
+
+  std::vector<path_operator> h_operators;
+
+  path_operator op;
+  int max_stack_depth = 1;
+  do {
+    op = p_state.get_next_operator();
+    if (op.type == path_operator_type::ERROR) {
+      CUDF_FAIL("Encountered invalid JSONPath input string");
+    }
+    if (op.type == path_operator_type::CHILD_WILDCARD) { max_stack_depth++; }
+    // convert pointer to device pointer
+    if (op.name.size_bytes() > 0) {
+      op.name =
+        string_view(json_path.data() + (op.name.data() - h_json_path.data()), op.name.size_bytes());
+    }
+    if (op.type == path_operator_type::ROOT) {
+      CUDF_EXPECTS(h_operators.size() == 0, "Root operator ($) can only exist at the root");
+    }
+    // if we haven't gotten a root operator to start, and we're not empty, quietly push a
+    // root operator now.
+    if (h_operators.size() == 0 && op.type != path_operator_type::ROOT &&
+        op.type != path_operator_type::END) {
+      h_operators.push_back(path_operator{path_operator_type::ROOT});
+    }
+    h_operators.push_back(op);
+  } while (op.type != path_operator_type::END);
+
+  auto const is_empty = h_operators.size() == 1 && h_operators[0].type == path_operator_type::END;
+  return is_empty
+           ? std::make_pair(thrust::nullopt, 0)
+           : std::make_pair(
+               thrust::make_optional(cudf::detail::make_device_uvector_sync(h_operators, stream)),
+               max_stack_depth);
+}
+
+#define PARSE_TRY(_x)                                                       \
+  do {                                                                      \
+    last_result = _x;                                                       \
+    if (last_result == parse_result::ERROR) { return parse_result::ERROR; } \
+  } while (0)
+
+/**
+ * @brief Parse a single json string using the provided command buffer
+ *
+ * @param j_state The incoming json string and associated parser
+ * @param commands The command buffer to be applied to the string. Always ends with a
+ * path_operator_type::END
+ * @param output Buffer used to store the results of the query
+ * @returns A result code indicating success/fail/empty.
+ */
+template <int max_command_stack_depth>
+__device__ parse_result parse_json_path(json_state& j_state,
+                                        path_operator const* commands,
+                                        json_output& output)
+{
+  // manually maintained context stack in lieu of calling parse_json_path recursively.
+ struct context { + json_state j_state; + path_operator const* commands; + bool list_element; + bool state_flag; + }; + context stack[max_command_stack_depth]; + int stack_pos = 0; + auto push_context = [&stack, &stack_pos](json_state const& _j_state, + path_operator const* _commands, + bool _list_element = false, + bool _state_flag = false) { + if (stack_pos == max_command_stack_depth - 1) { return false; } + stack[stack_pos++] = context{_j_state, _commands, _list_element, _state_flag}; + return true; + }; + auto pop_context = [&stack, &stack_pos](context& c) { + if (stack_pos > 0) { + c = stack[--stack_pos]; + return true; + } + return false; + }; + push_context(j_state, commands, false); + + parse_result last_result = parse_result::SUCCESS; + context ctx; + int element_count = 0; + while (pop_context(ctx)) { + path_operator op = *ctx.commands; + + switch (op.type) { + // whatever the first object is + case path_operator_type::ROOT: + PARSE_TRY(ctx.j_state.next_element()); + push_context(ctx.j_state, ctx.commands + 1); + break; + + // .name + // ['name'] + // [1] + // will return a single thing + case path_operator_type::CHILD: { + PARSE_TRY(ctx.j_state.child_element(op.expected_type)); + if (last_result == parse_result::SUCCESS) { + PARSE_TRY(ctx.j_state.next_matching_element(op.name, true)); + if (last_result == parse_result::SUCCESS) { + push_context(ctx.j_state, ctx.commands + 1, ctx.list_element); + } + } + } break; + + // .* + // [*] + // will return an array of things + case path_operator_type::CHILD_WILDCARD: { + // if we're on the first element of this wildcard + if (!ctx.state_flag) { + // we will only ever be returning 1 array + if (!ctx.list_element) { output.add_output({"[" DEBUG_NEWLINE, 1 + DEBUG_NEWLINE_LEN}); } + + // step into the child element + PARSE_TRY(ctx.j_state.child_element(op.expected_type)); + if (last_result == parse_result::EMPTY) { + if (!ctx.list_element) { + output.add_output({"]" DEBUG_NEWLINE, 1 + DEBUG_NEWLINE_LEN}); + } + last_result = parse_result::SUCCESS; + break; + } + + // first element + PARSE_TRY(ctx.j_state.next_matching_element({"*", 1}, true)); + if (last_result == parse_result::EMPTY) { + if (!ctx.list_element) { + output.add_output({"]" DEBUG_NEWLINE, 1 + DEBUG_NEWLINE_LEN}); + } + last_result = parse_result::SUCCESS; + break; + } + + // re-push ourselves + push_context(ctx.j_state, ctx.commands, ctx.list_element, true); + // push the next command + push_context(ctx.j_state, ctx.commands + 1, true); + } else { + // next element + PARSE_TRY(ctx.j_state.next_matching_element({"*", 1}, false)); + if (last_result == parse_result::EMPTY) { + if (!ctx.list_element) { + output.add_output({"]" DEBUG_NEWLINE, 1 + DEBUG_NEWLINE_LEN}); + } + last_result = parse_result::SUCCESS; + break; + } + + // re-push ourselves + push_context(ctx.j_state, ctx.commands, ctx.list_element, true); + // push the next command + push_context(ctx.j_state, ctx.commands + 1, true); + } + } break; + + // [0] + // [1] + // etc + // returns a single thing + case path_operator_type::CHILD_INDEX: { + PARSE_TRY(ctx.j_state.child_element(op.expected_type)); + if (last_result == parse_result::SUCCESS) { + string_view const any{"*", 1}; + PARSE_TRY(ctx.j_state.next_matching_element(any, true)); + if (last_result == parse_result::SUCCESS) { + int idx; + for (idx = 1; idx <= op.index; idx++) { + PARSE_TRY(ctx.j_state.next_matching_element(any, false)); + if (last_result == parse_result::EMPTY) { break; } + } + // if we didn't end up at the index we requested, this is an invalid 
index
+          if (idx - 1 != op.index) { return parse_result::ERROR; }
+          push_context(ctx.j_state, ctx.commands + 1, ctx.list_element);
+        }
+      }
+    } break;
+
+    // some sort of error.
+    case path_operator_type::ERROR: return parse_result::ERROR; break;
+
+    // END case
+    default: {
+      if (ctx.list_element && element_count > 0) {
+        output.add_output({"," DEBUG_NEWLINE, 1 + DEBUG_NEWLINE_LEN});
+      }
+      PARSE_TRY(ctx.j_state.extract_element(&output, ctx.list_element));
+      if (ctx.list_element && last_result != parse_result::EMPTY) { element_count++; }
+    } break;
+    }
+  }
+
+  return parse_result::SUCCESS;
+}
+
+// hardcoding this for now. to reach a stack depth of 8 would require
+// a JSONPath containing 7 nested wildcards so this is probably reasonable.
+constexpr int max_command_stack_depth = 8;
+
+/**
+ * @brief Parse a single json string using the provided command buffer
+ *
+ * This function exists primarily as a shim for debugging purposes.
+ *
+ * @param input The incoming json string
+ * @param input_len Size of the incoming json string
+ * @param commands The command buffer to be applied to the string. Always ends with a
+ * path_operator_type::END
+ * @param out_buf Buffer used to store the results of the query (nullptr in the size computation
+ * step)
+ * @param out_buf_size Size of the output buffer
+ * @returns A pair containing the result code and the output buffer.
+ */
+__device__ thrust::pair<parse_result, json_output> get_json_object_single(
+  char const* input,
+  size_t input_len,
+  path_operator const* const commands,
+  char* out_buf,
+  size_t out_buf_size)
+{
+  json_state j_state(input, input_len);
+  json_output output{out_buf_size, out_buf};
+
+  auto const result = parse_json_path<max_command_stack_depth>(j_state, commands, output);
+
+  return {result, output};
+}
+
+/**
+ * @brief Kernel for running the JSONPath query.
+ *
+ * This kernel operates in a 2-pass way. On the first pass, it computes
+ * output sizes. On the second pass it fills in the provided output buffers
+ * (chars and validity).
+ *
+ * @param col Device view of the incoming strings column
+ * @param commands JSONPath command buffer
+ * @param output_offsets Buffer used to store the string offsets for the results of the query
+ * @param out_buf Buffer used to store the results of the query
+ * @param out_validity Output validity buffer
+ * @param out_valid_count Output count of valid bits
+ */
+template <int block_size>
+__launch_bounds__(block_size) __global__
+  void get_json_object_kernel(column_device_view col,
+                              path_operator const* const commands,
+                              offset_type* output_offsets,
+                              thrust::optional<char*> out_buf,
+                              thrust::optional<bitmask_type*> out_validity,
+                              thrust::optional<size_type*> out_valid_count)
+{
+  size_type tid    = threadIdx.x + (blockDim.x * blockIdx.x);
+  size_type stride = blockDim.x * gridDim.x;
+
+  if (out_valid_count.has_value()) { *(out_valid_count.value()) = 0; }
+  size_type warp_valid_count{0};
+
+  auto active_threads = __ballot_sync(0xffffffff, tid < col.size());
+  while (tid < col.size()) {
+    bool is_valid         = false;
+    string_view const str = col.element<string_view>(tid);
+    size_type output_size = 0;
+    if (str.size_bytes() > 0) {
+      char* dst = out_buf.has_value() ? out_buf.value() + output_offsets[tid] : nullptr;
+      size_t const dst_size =
+        out_buf.has_value() ? output_offsets[tid + 1] - output_offsets[tid] : 0;
+
+      parse_result result;
+      json_output out;
+      thrust::tie(result, out) =
+        get_json_object_single(str.data(), str.size_bytes(), commands, dst, dst_size);
+      output_size = out.output_len.value_or(0);
+      if (out.output_len.has_value() && result == parse_result::SUCCESS) { is_valid = true; }
+    }
+
+    // filled in only during the precompute step. during the compute step, the offsets
+    // are fed back in so we do -not- want to write them out
+    if (!out_buf.has_value()) { output_offsets[tid] = static_cast<offset_type>(output_size); }
+
+    // validity filled in only during the output step
+    if (out_validity.has_value()) {
+      uint32_t mask = __ballot_sync(active_threads, is_valid);
+      // 0th lane of the warp writes the validity
+      if (!(tid % cudf::detail::warp_size)) {
+        out_validity.value()[cudf::word_index(tid)] = mask;
+        warp_valid_count += __popc(mask);
+      }
+    }
+
+    tid += stride;
+    active_threads = __ballot_sync(active_threads, tid < col.size());
+  }
+
+  // sum the valid counts across the whole block
+  if (out_valid_count) {
+    size_type block_valid_count =
+      cudf::detail::single_lane_block_sum_reduce<block_size>(warp_valid_count);
+    if (threadIdx.x == 0) { atomicAdd(out_valid_count.value(), block_valid_count); }
+  }
+}
+
+/**
+ * @copydoc cudf::strings::detail::get_json_object
+ */
+std::unique_ptr<cudf::column> get_json_object(cudf::strings_column_view const& col,
+                                              cudf::string_scalar const& json_path,
+                                              rmm::cuda_stream_view stream,
+                                              rmm::mr::device_memory_resource* mr)
+{
+  // preprocess the json_path into a command buffer
+  auto preprocess = build_command_buffer(json_path, stream);
+  CUDF_EXPECTS(std::get<1>(preprocess) <= max_command_stack_depth,
+               "Encountered JSONPath string that is too complex");
+
+  // allocate output offsets buffer.
+  auto offsets = cudf::make_fixed_width_column(
+    data_type{type_id::INT32}, col.size() + 1, mask_state::UNALLOCATED, stream, mr);
+  cudf::mutable_column_view offsets_view(*offsets);
+
+  // if the query is empty, return a string column containing all nulls
+  if (!std::get<0>(preprocess).has_value()) {
+    return std::make_unique<column>(
+      data_type{type_id::STRING},
+      col.size(),
+      rmm::device_buffer{0, stream, mr},  // no data
+      cudf::detail::create_null_mask(col.size(), mask_state::ALL_NULL, stream, mr),
+      col.size());  // null count
+  }
+
+  constexpr int block_size = 512;
+  cudf::detail::grid_1d const grid{col.size(), block_size};
+
+  auto cdv = column_device_view::create(col.parent(), stream);
+
+  // preprocess sizes (returned in the offsets buffer)
+  get_json_object_kernel<block_size>
+    <<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
+      *cdv,
+      std::get<0>(preprocess).value().data(),
+      offsets_view.head<offset_type>(),
+      thrust::nullopt,
+      thrust::nullopt,
+      thrust::nullopt);
+
+  // convert sizes to offsets
+  thrust::exclusive_scan(rmm::exec_policy(stream),
+                         offsets_view.head<offset_type>(),
+                         offsets_view.head<offset_type>() + col.size() + 1,
+                         offsets_view.head<offset_type>(),
+                         0);
+  size_type const output_size =
+    cudf::detail::get_value<offset_type>(offsets_view, col.size(), stream);
+
+  // allocate output string column
+  auto chars = cudf::make_fixed_width_column(
+    data_type{type_id::INT8}, output_size, mask_state::UNALLOCATED, stream, mr);
+
+  // potential optimization : if we know that all outputs are valid, we could skip creating
+  // the validity mask altogether
+  rmm::device_buffer validity =
+    cudf::detail::create_null_mask(col.size(), mask_state::UNINITIALIZED, stream, mr);
+
+  // compute results
+  cudf::mutable_column_view chars_view(*chars);
+  rmm::device_scalar<size_type> d_valid_count{0, stream};
+  get_json_object_kernel<block_size>
+    <<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
+      *cdv,
+      std::get<0>(preprocess).value().data(),
+      offsets_view.head<offset_type>(),
+      chars_view.head<char>(),
+      static_cast<bitmask_type*>(validity.data()),
+      d_valid_count.data());
+
+  return make_strings_column(col.size(),
+                             std::move(offsets),
+                             std::move(chars),
+                             col.size() - d_valid_count.value(),
+                             std::move(validity),
+                             stream,
+                             mr);
+}
+
+}  // namespace
+}  // namespace detail
+
+/**
+ * @copydoc cudf::strings::get_json_object
+ */
+std::unique_ptr<cudf::column> get_json_object(cudf::strings_column_view const& col,
+                                              cudf::string_scalar const& json_path,
+                                              rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::get_json_object(col, json_path, 0, mr);
+}
+
+}  // namespace strings
+}  // namespace cudf
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 082f039054e..f9904dda49e 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -334,6 +334,7 @@ ConfigureTest(STRINGS_TEST
     strings/hash_string.cu
     strings/integers_tests.cu
     strings/ipv4_tests.cpp
+    strings/json_tests.cpp
     strings/pad_tests.cpp
     strings/replace_regex_tests.cpp
     strings/replace_tests.cpp
diff --git a/cpp/tests/strings/json_tests.cpp b/cpp/tests/strings/json_tests.cpp
new file mode 100644
index 00000000000..44eb35d4163
--- /dev/null
+++ b/cpp/tests/strings/json_tests.cpp
@@ -0,0 +1,761 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_utilities.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/table_utilities.hpp>
+
+#include <cudf/strings/json.hpp>
+#include <cudf/strings/replace.hpp>
+
+// reference: https://jsonpath.herokuapp.com/
+
+// clang-format off
+std::string json_string{
+  "{"
+    "\"store\": {""\"book\": ["
+      "{"
+        "\"category\": \"reference\","
+        "\"author\": \"Nigel Rees\","
+        "\"title\": \"Sayings of the Century\","
+        "\"price\": 8.95"
+      "},"
+      "{"
+        "\"category\": \"fiction\","
+        "\"author\": \"Evelyn Waugh\","
+        "\"title\": \"Sword of Honour\","
+        "\"price\": 12.99"
+      "},"
+      "{"
+        "\"category\": \"fiction\","
+        "\"author\": \"Herman Melville\","
+        "\"title\": \"Moby Dick\","
+        "\"isbn\": \"0-553-21311-3\","
+        "\"price\": 8.99"
+      "},"
+      "{"
+        "\"category\": \"fiction\","
+        "\"author\": \"J. R. R. 
Tolkien\"," + "\"title\": \"The Lord of the Rings\"," + "\"isbn\": \"0-395-19395-8\"," + "\"price\": 22.99" + "}" + "]," + "\"bicycle\": {" + "\"color\": \"red\"," + "\"price\": 19.95" + "}" + "}," + "\"expensive\": 10" + "}" +}; +// clang-format on + +std::unique_ptr drop_whitespace(cudf::column_view const& col) +{ + cudf::test::strings_column_wrapper whitespace{"\n", "\r", "\t"}; + cudf::test::strings_column_wrapper repl{"", "", ""}; + + cudf::strings_column_view strings(col); + cudf::strings_column_view targets(whitespace); + cudf::strings_column_view replacements(repl); + return cudf::strings::replace(strings, targets, replacements); +} + +struct JsonTests : public cudf::test::BaseFixture { +}; + +TEST_F(JsonTests, GetJsonObjectRootOp) +{ + // root + cudf::test::strings_column_wrapper input{json_string}; + std::string json_path("$"); + auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = drop_whitespace(*result_raw); + + auto expected = drop_whitespace(input); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); +} + +TEST_F(JsonTests, GetJsonObjectChildOp) +{ + { + cudf::test::strings_column_wrapper input{json_string}; + std::string json_path("$.store"); + auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = drop_whitespace(*result_raw); + + // clang-format off + cudf::test::strings_column_wrapper expected_raw{ + "{" + "\"book\": [" + "{" + "\"category\": \"reference\"," + "\"author\": \"Nigel Rees\"," + "\"title\": \"Sayings of the Century\"," + "\"price\": 8.95" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"Evelyn Waugh\"," + "\"title\": \"Sword of Honour\"," + "\"price\": 12.99" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"Herman Melville\"," + "\"title\": \"Moby Dick\"," + "\"isbn\": \"0-553-21311-3\"," + "\"price\": 8.99" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"J. R. R. Tolkien\"," + "\"title\": \"The Lord of the Rings\"," + "\"isbn\": \"0-395-19395-8\"," + "\"price\": 22.99" + "}" + "]," + "\"bicycle\": {" + "\"color\": \"red\"," + "\"price\": 19.95" + "}" + "}" + }; + // clang-format on + auto expected = drop_whitespace(expected_raw); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); + } + + { + cudf::test::strings_column_wrapper input{json_string}; + std::string json_path("$.store.book"); + auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = drop_whitespace(*result_raw); + + // clang-format off + cudf::test::strings_column_wrapper expected_raw{ + "[" + "{" + "\"category\": \"reference\"," + "\"author\": \"Nigel Rees\"," + "\"title\": \"Sayings of the Century\"," + "\"price\": 8.95" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"Evelyn Waugh\"," + "\"title\": \"Sword of Honour\"," + "\"price\": 12.99" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"Herman Melville\"," + "\"title\": \"Moby Dick\"," + "\"isbn\": \"0-553-21311-3\"," + "\"price\": 8.99" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"J. R. R. 
Tolkien\"," + "\"title\": \"The Lord of the Rings\"," + "\"isbn\": \"0-395-19395-8\"," + "\"price\": 22.99" + "}" + "]" + }; + // clang-format on + auto expected = drop_whitespace(expected_raw); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); + } +} + +TEST_F(JsonTests, GetJsonObjectWildcardOp) +{ + { + cudf::test::strings_column_wrapper input{json_string}; + std::string json_path("$.store.*"); + auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = drop_whitespace(*result_raw); + + // clang-format off + cudf::test::strings_column_wrapper expected_raw{ + "[" + "[" + "{" + "\"category\": \"reference\"," + "\"author\": \"Nigel Rees\"," + "\"title\": \"Sayings of the Century\"," + "\"price\": 8.95" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"Evelyn Waugh\"," + "\"title\": \"Sword of Honour\"," + "\"price\": 12.99" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"Herman Melville\"," + "\"title\": \"Moby Dick\"," + "\"isbn\": \"0-553-21311-3\"," + "\"price\": 8.99" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"J. R. R. Tolkien\"," + "\"title\": \"The Lord of the Rings\"," + "\"isbn\": \"0-395-19395-8\"," + "\"price\": 22.99" + "}" + "]," + "{" + "\"color\": \"red\"," + "\"price\": 19.95" + "}" + "]" + }; + // clang-format on + auto expected = drop_whitespace(expected_raw); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); + } + + { + cudf::test::strings_column_wrapper input{json_string}; + std::string json_path("*"); + auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = drop_whitespace(*result_raw); + + // clang-format off + cudf::test::strings_column_wrapper expected_raw{ + "[" + "{" + "\"book\": [" + "{" + "\"category\": \"reference\"," + "\"author\": \"Nigel Rees\"," + "\"title\": \"Sayings of the Century\"," + "\"price\": 8.95" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"Evelyn Waugh\"," + "\"title\": \"Sword of Honour\"," + "\"price\": 12.99" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"Herman Melville\"," + "\"title\": \"Moby Dick\"," + "\"isbn\": \"0-553-21311-3\"," + "\"price\": 8.99" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"J. R. R. 
Tolkien\"," + "\"title\": \"The Lord of the Rings\"," + "\"isbn\": \"0-395-19395-8\"," + "\"price\": 22.99" + "}" + "]," + "\"bicycle\": {" + "\"color\": \"red\"," + "\"price\": 19.95" + "}" + "}," + "10" + "]" + }; + // clang-format on + auto expected = drop_whitespace(expected_raw); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); + } +} + +TEST_F(JsonTests, GetJsonObjectSubscriptOp) +{ + { + cudf::test::strings_column_wrapper input{json_string}; + std::string json_path("$.store.book[2]"); + auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = drop_whitespace(*result_raw); + + // clang-format off + cudf::test::strings_column_wrapper expected_raw{ + "{" + "\"category\": \"fiction\"," + "\"author\": \"Herman Melville\"," + "\"title\": \"Moby Dick\"," + "\"isbn\": \"0-553-21311-3\"," + "\"price\": 8.99" + "}" + }; + // clang-format on + auto expected = drop_whitespace(expected_raw); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); + } + + { + cudf::test::strings_column_wrapper input{json_string}; + std::string json_path("$.store['bicycle']"); + auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = drop_whitespace(*result_raw); + + // clang-format off + cudf::test::strings_column_wrapper expected_raw{ + "{" + "\"color\": \"red\"," + "\"price\": 19.95" + "}" + }; + // clang-format on + auto expected = drop_whitespace(expected_raw); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); + } + + { + cudf::test::strings_column_wrapper input{json_string}; + std::string json_path("$.store.book[*]"); + auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = drop_whitespace(*result_raw); + + // clang-format off + cudf::test::strings_column_wrapper expected_raw{ + "[" + "{" + "\"category\": \"reference\"," + "\"author\": \"Nigel Rees\"," + "\"title\": \"Sayings of the Century\"," + "\"price\": 8.95" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"Evelyn Waugh\"," + "\"title\": \"Sword of Honour\"," + "\"price\": 12.99" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"Herman Melville\"," + "\"title\": \"Moby Dick\"," + "\"isbn\": \"0-553-21311-3\"," + "\"price\": 8.99" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"J. R. R. 
Tolkien\"," + "\"title\": \"The Lord of the Rings\"," + "\"isbn\": \"0-395-19395-8\"," + "\"price\": 22.99" + "}" + "]" + }; + // clang-format on + auto expected = drop_whitespace(expected_raw); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); + } +} + +TEST_F(JsonTests, GetJsonObjectFilter) +{ + // queries that result in filtering/collating results (mostly meaning - generates new + // json instead of just returning parts of the existing string + + { + cudf::test::strings_column_wrapper input{json_string}; + std::string json_path("$.store.book[*]['isbn']"); + auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = drop_whitespace(*result_raw); + + cudf::test::strings_column_wrapper expected_raw{"[\"0-553-21311-3\",\"0-395-19395-8\"]"}; + auto expected = drop_whitespace(expected_raw); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); + } + + { + cudf::test::strings_column_wrapper input{json_string}; + std::string json_path("$.store.book[*].category"); + auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = drop_whitespace(*result_raw); + + cudf::test::strings_column_wrapper expected_raw{ + "[\"reference\",\"fiction\",\"fiction\",\"fiction\"]"}; + auto expected = drop_whitespace(expected_raw); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); + } + + { + cudf::test::strings_column_wrapper input{json_string}; + std::string json_path("$.store.book[*].title"); + auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = drop_whitespace(*result_raw); + + cudf::test::strings_column_wrapper expected_raw{ + "[\"Sayings of the Century\",\"Sword of Honour\",\"Moby Dick\",\"The Lord of the Rings\"]"}; + auto expected = drop_whitespace(expected_raw); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); + } + + { + cudf::test::strings_column_wrapper input{json_string}; + std::string json_path("$.store.book.*.price"); + auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = drop_whitespace(*result_raw); + + cudf::test::strings_column_wrapper expected_raw{"[8.95,12.99,8.99,22.99]"}; + auto expected = drop_whitespace(expected_raw); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); + } + + { + // spark behavioral difference. 
+ // standard: "fiction" + // spark: fiction + cudf::test::strings_column_wrapper input{json_string}; + std::string json_path("$.store.book[2].category"); + auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = drop_whitespace(*result_raw); + + cudf::test::strings_column_wrapper expected_raw{"fiction"}; + auto expected = drop_whitespace(expected_raw); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); + } +} + +TEST_F(JsonTests, GetJsonObjectNullInputs) +{ + { + std::string str("{\"a\" : \"b\"}"); + cudf::test::strings_column_wrapper input({str, str, str, str}, {1, 0, 1, 0}); + + std::string json_path("$.a"); + auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = drop_whitespace(*result_raw); + + cudf::test::strings_column_wrapper expected_raw({"b", "", "b", ""}, {1, 0, 1, 0}); + auto expected = drop_whitespace(expected_raw); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); + } +} + +TEST_F(JsonTests, GetJsonObjectEmptyQuery) +{ + // empty query -> null + { + cudf::test::strings_column_wrapper input{"{\"a\" : \"b\"}"}; + std::string json_path(""); + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + + cudf::test::strings_column_wrapper expected({""}, {0}); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); + } +} + +TEST_F(JsonTests, GetJsonObjectEmptyInputsAndOutputs) +{ + // empty input -> null + { + cudf::test::strings_column_wrapper input{""}; + std::string json_path("$"); + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + + cudf::test::strings_column_wrapper expected({""}, {0}); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); + } + + // slightly different from "empty output". in this case, we're + // returning something, but it happens to be empty. 
so we expect + // a valid, but empty row + { + cudf::test::strings_column_wrapper input{"{\"store\": { \"bicycle\" : \"\" } }"}; + std::string json_path("$.store.bicycle"); + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + + cudf::test::strings_column_wrapper expected({""}, {1}); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); + } +} + +// badly formed JSONpath strings +TEST_F(JsonTests, GetJsonObjectIllegalQuery) +{ + // can't have more than one root operator, or a root operator anywhere other + // than the beginning + { + cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"}; + std::string json_path("$$"); + auto query = [&]() { + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + }; + EXPECT_THROW(query(), cudf::logic_error); + } + + // invalid index + { + cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"}; + std::string json_path("$[auh46h-]"); + auto query = [&]() { + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + }; + EXPECT_THROW(query(), cudf::logic_error); + } + + // invalid index + { + cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"}; + std::string json_path("$[[]]"); + auto query = [&]() { + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + }; + EXPECT_THROW(query(), cudf::logic_error); + } + + // negative index + { + cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"}; + std::string json_path("$[-1]"); + auto query = [&]() { + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + }; + EXPECT_THROW(query(), cudf::logic_error); + } + + // child operator with no name specified + { + cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"}; + std::string json_path("."); + auto query = [&]() { + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + }; + EXPECT_THROW(query(), cudf::logic_error); + } + + { + cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"}; + std::string json_path("]["); + auto query = [&]() { + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + }; + EXPECT_THROW(query(), cudf::logic_error); + } + + { + cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"}; + std::string json_path("6hw6,56i3"); + auto query = [&]() { + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + }; + EXPECT_THROW(query(), cudf::logic_error); + } +} + +// queries that are legal, but reference invalid parts of the input +TEST_F(JsonTests, GetJsonObjectInvalidQuery) +{ + // non-existent field + { + cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"}; + std::string json_path("$[*].c"); + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + + cudf::test::strings_column_wrapper expected({""}, {0}); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); + } + + // non-existent field + { + cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"}; + std::string json_path("$[*].c[2]"); + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + + cudf::test::strings_column_wrapper expected({""}, {0}); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); + } + + // non-existent field + { + cudf::test::strings_column_wrapper input{json_string}; + std::string json_path("$.store.book.price"); + auto result = 
cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + + cudf::test::strings_column_wrapper expected({""}, {0}); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); + } + + // out of bounds index + { + cudf::test::strings_column_wrapper input{json_string}; + std::string json_path("$.store.book[4]"); + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + + cudf::test::strings_column_wrapper expected({""}, {0}); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); + } +} + +TEST_F(JsonTests, MixedOutput) +{ + // various queries on: + // clang-format off + std::vector input_strings { + "{\"a\": {\"b\" : \"c\"}}", + + "{" + "\"a\": {\"b\" : \"c\"}," + "\"d\": [{\"e\":123}, {\"f\":-10}]" + "}", + + "{" + "\"b\": 123" + "}", + + "{" + "\"a\": [\"y\",500]" + "}", + + "{" + "\"a\": \"\"" + "}", + + "{" + "\"a\": {" + "\"z\": {\"i\": 10, \"j\": 100}," + "\"b\": [\"c\",null,true,-1]" + "}" + "}" + }; + // clang-format on + cudf::test::strings_column_wrapper input(input_strings.begin(), input_strings.end()); + { + std::string json_path("$.a"); + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + + // clang-format off + cudf::test::strings_column_wrapper expected({ + "{\"b\" : \"c\"}", + "{\"b\" : \"c\"}", + "", + "[\"y\",500]", + "", + "{" + "\"z\": {\"i\": 10, \"j\": 100}," + "\"b\": [\"c\",null,true,-1]" + "}" + }, + {1, 1, 0, 1, 1, 1}); + // clang-format on + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); + } + + { + std::string json_path("$.a[1]"); + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + + // clang-format off + cudf::test::strings_column_wrapper expected({ + "", + "", + "", + "500", + "", + "", + }, + {0, 0, 0, 1, 0, 0}); + // clang-format on + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); + } + + { + std::string json_path("$.a.b"); + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + + // clang-format off + cudf::test::strings_column_wrapper expected({ + "c", + "c", + "", + "", + "", + "[\"c\",null,true,-1]"}, + {1, 1, 0, 0, 0, 1}); + // clang-format on + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); + } + + { + std::string json_path("$.a[*]"); + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + + // clang-format off + cudf::test::strings_column_wrapper expected({ + "[\"c\"]", + "[\"c\"]", + "", + "[\"y\",500]", + "[]", + "[" + "{\"i\": 10, \"j\": 100}," + "[\"c\",null,true,-1]" + "]" }, + {1, 1, 0, 1, 1, 1}); + // clang-format on + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); + } + + { + std::string json_path("$.a.b[*]"); + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + + // clang-format off + cudf::test::strings_column_wrapper expected({ + "[]", + "[]", + "", + "", + "", + "[\"c\",null,true,-1]"}, + {1, 1, 0, 0, 0, 1}); + // clang-format on + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); + } +} diff --git a/cpp/tests/utilities/column_utilities.cu b/cpp/tests/utilities/column_utilities.cu index 78a67464654..a54c86405a5 100644 --- a/cpp/tests/utilities/column_utilities.cu +++ b/cpp/tests/utilities/column_utilities.cu @@ -71,7 +71,7 @@ struct column_property_comparator { // equivalent, but not exactly equal columns can have a different number of children if their // sizes are both 0. 
Specifically, empty string columns may or may not have children. - if (check_exact_equality || lhs.size() > 0) { + if (check_exact_equality || (lhs.size() > 0 && lhs.null_count() < lhs.size())) { EXPECT_EQ(lhs.num_children(), rhs.num_children()); } } diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index 5d869ab75fb..402c64dd83d 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -2083,6 +2083,23 @@ public final ColumnVector substring(ColumnView start, ColumnView end) { return new ColumnVector(substringColumn(getNativeView(), start.getNativeView(), end.getNativeView())); } + /** + * Apply a JSONPath string to all rows in an input strings column. + * + * Applies a JSONPath string to an incoming strings column where each row in the column + * is a valid json string. The output is returned by row as a strings column. + * + * For reference, https://tools.ietf.org/id/draft-goessner-dispatch-jsonpath-00.html + * Note: Only implements the operators: $ . [] * + * + * @param path The JSONPath string to be applied to each row + * @return new strings ColumnVector containing the retrieved json object strings + */ + public final ColumnVector getJSONObject(Scalar path) { + assert(type.equals(DType.STRING)) : "column type must be a String"; + return new ColumnVector(getJSONObject(getNativeView(), path.getScalarHandle())); + } + /** * Returns a new strings column where target string within each string is replaced with the specified * replacement string. @@ -2649,6 +2666,8 @@ static DeviceMemoryBufferView getOffsetsBuffer(long viewHandle) { */ private static native long stringTimestampToTimestamp(long viewHandle, int unit, String format); + private static native long getJSONObject(long viewHandle, long scalarHandle) throws CudfException; + /** * Native method to parse and convert a timestamp column vector to string column vector. 
A unix * timestamp is a long value representing how many units since 1970-01-01 00:00:00:000 in either diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index dc1acc50b5f..cec3a1a92a6 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -54,6 +54,7 @@ #include #include #include +#include #include #include #include @@ -65,6 +66,8 @@ #include "cudf_jni_apis.hpp" #include "dtype_utils.hpp" +#include "jni.h" +#include "jni_utils.hpp" namespace { @@ -1835,4 +1838,24 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_copyColumnViewToCV(JNIEnv } CATCH_STD(env, 0) } + +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getJSONObject(JNIEnv *env, jclass, + jlong j_view_handle, jlong j_scalar_handle) { + + JNI_NULL_CHECK(env, j_view_handle, "view cannot be null", 0); + JNI_NULL_CHECK(env, j_scalar_handle, "path cannot be null", 0); + + try { + cudf::jni::auto_set_device(env); + cudf::column_view* n_column_view = reinterpret_cast(j_view_handle); + cudf::strings_column_view n_strings_col_view(*n_column_view); + cudf::string_scalar *n_scalar_path = reinterpret_cast(j_scalar_handle); + + auto result = cudf::strings::get_json_object(n_strings_col_view, *n_scalar_path); + + return reinterpret_cast(result.release()); + } + CATCH_STD(env, 0) + +} } // extern "C" diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index fe1cba5ceb1..ce2c287a1c8 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -4132,6 +4132,50 @@ void testCopyToColumnVector() { } } + @Test + void testGetJSONObject() { + String jsonString = "{ \"store\": {\n" + + " \"book\": [\n" + + " { \"category\": \"reference\",\n" + + " \"author\": \"Nigel Rees\",\n" + + " \"title\": \"Sayings of the Century\",\n" + + " \"price\": 8.95\n" + + " },\n" + + " { \"category\": \"fiction\",\n" + + " \"author\": \"Evelyn Waugh\",\n" + + " \"title\": \"Sword of Honour\",\n" + + " \"price\": 12.99\n" + + " },\n" + + " { \"category\": \"fiction\",\n" + + " \"author\": \"Herman Melville\",\n" + + " \"title\": \"Moby Dick\",\n" + + " \"isbn\": \"0-553-21311-3\",\n" + + " \"price\": 8.99\n" + + " },\n" + + " { \"category\": \"fiction\",\n" + + " \"author\": \"J. R. R. Tolkien\",\n" + + " \"title\": \"The Lord of the Rings\",\n" + + " \"isbn\": \"0-395-19395-8\",\n" + + " \"price\": 22.99\n" + + " }\n" + + " ],\n" + + " \"bicycle\": {\n" + + " \"color\": \"red\",\n" + + " \"price\": 19.95\n" + + " }\n" + + " }\n" + + "}"; + + try (ColumnVector json = ColumnVector.fromStrings(jsonString, jsonString); + ColumnVector expectedAuthors = ColumnVector.fromStrings("[\"Nigel Rees\",\"Evelyn " + + "Waugh\",\"Herman Melville\",\"J. R. R. Tolkien\"]", "[\"Nigel Rees\",\"Evelyn " + + "Waugh\",\"Herman Melville\",\"J. R. R. Tolkien\"]"); + Scalar path = Scalar.fromString("$.store.book[*].author"); + ColumnVector gotAuthors = json.getJSONObject(path)) { + assertColumnsAreEqual(expectedAuthors, gotAuthors); + } + } + @Test void testMakeStructEmpty() { final int numRows = 10; From c05dbed52fdd15757e40463a64ce757d6cd21b46 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 31 Mar 2021 12:32:09 -0500 Subject: [PATCH 10/14] Add column names validation in parquet writer (#7786) Fixes: #7738 Parquet writer requires all column names to be of string types, added a validation similar to that of pandas. 
Authors: - GALI PREM SAGAR (@galipremsagar) Approvers: - Michael Wang (@isVoid) - Keith Kraus (@kkraus14) URL: https://github.com/rapidsai/cudf/pull/7786 --- python/cudf/cudf/_lib/parquet.pyx | 3 +++ python/cudf/cudf/tests/test_parquet.py | 14 +++++++++++++- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index d8b4fbbbe4b..4ea2adec23a 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -312,6 +312,9 @@ cpdef write_parquet( num_index_cols_meta = 0 for i, name in enumerate(table._column_names, num_index_cols_meta): + if not isinstance(name, str): + raise ValueError("parquet must have string column names") + tbl_meta.get().column_metadata[i].set_name(name.encode()) _set_col_metadata( table[name]._column, tbl_meta.get().column_metadata[i] diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index fe418d1ade1..4781ff995b0 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -19,7 +19,7 @@ import cudf from cudf.io.parquet import ParquetWriter, merge_parquet_filemetadata from cudf.tests import dataset_generator as dg -from cudf.tests.utils import assert_eq +from cudf.tests.utils import assert_eq, assert_exceptions_equal @pytest.fixture(scope="module") @@ -1937,3 +1937,15 @@ def test_parquet_writer_decimal(tmpdir): got = pd.read_parquet(fname) assert_eq(gdf, got) + + +def test_parquet_writer_column_validation(): + df = cudf.DataFrame({1: [1, 2, 3], "1": ["a", "b", "c"]}) + pdf = df.to_pandas() + + assert_exceptions_equal( + lfunc=df.to_parquet, + rfunc=pdf.to_parquet, + lfunc_args_and_kwargs=(["cudf.parquet"],), + rfunc_args_and_kwargs=(["pandas.parquet"],), + ) From acb69858808ff50ec2b57bde6fc5b4920732e31a Mon Sep 17 00:00:00 2001 From: Thomas Graves Date: Wed, 31 Mar 2021 13:39:31 -0500 Subject: [PATCH 11/14] Turn on NVTX by default in java build (#7761) Investigation was done under https://github.com/NVIDIA/spark-rapids/issues/1721 and it showed no significant performance difference with NVTX on. It would make it a lot easier if this was on by default because it allows customers and developers to get trace with the same jar without having to go off and build a new CUDF version. So this PR turns it on by default and adds in reading from environment variable if we need to change in the future from build scripts. Authors: - Thomas Graves (@tgravescs) Approvers: - Jason Lowe (@jlowe) - Robert (Bobby) Evans (@revans2) URL: https://github.com/rapidsai/cudf/pull/7761 --- java/ci/build-in-docker.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/java/ci/build-in-docker.sh b/java/ci/build-in-docker.sh index eee943cde38..b2d0b066ce7 100755 --- a/java/ci/build-in-docker.sh +++ b/java/ci/build-in-docker.sh @@ -24,6 +24,7 @@ SKIP_JAVA_TESTS=${SKIP_JAVA_TESTS:-true} BUILD_CPP_TESTS=${BUILD_CPP_TESTS:-OFF} ENABLE_PTDS=${ENABLE_PTDS:-ON} RMM_LOGGING_LEVEL=${RMM_LOGGING_LEVEL:-OFF} +ENABLE_NVTX=${ENABLE_NVTX:-ON} OUT=${OUT:-out} SIGN_FILE=$1 @@ -35,6 +36,7 @@ echo "SIGN_FILE: $SIGN_FILE,\ SKIP_JAVA_TESTS: $SKIP_JAVA_TESTS,\ BUILD_CPP_TESTS: $BUILD_CPP_TESTS,\ ENABLED_PTDS: $ENABLE_PTDS,\ + ENABLE_NVTX: $ENABLE_NVTX,\ RMM_LOGGING_LEVEL: $RMM_LOGGING_LEVEL,\ OUT_PATH: $OUT_PATH" @@ -51,7 +53,7 @@ export PATH=/usr/local/cmake-3.19.0-Linux-x86_64/bin:$PATH rm -rf $WORKSPACE/cpp/build mkdir -p $WORKSPACE/cpp/build cd $WORKSPACE/cpp/build -cmake .. 
-DUSE_NVTX=OFF -DCUDF_USE_ARROW_STATIC=ON -DBoost_USE_STATIC_LIBS=ON -DBUILD_TESTS=$SKIP_CPP_TESTS -DPER_THREAD_DEFAULT_STREAM=$ENABLE_PTDS -DRMM_LOGGING_LEVEL=$RMM_LOGGING_LEVEL +cmake .. -DUSE_NVTX=$ENABLE_NVTX -DCUDF_USE_ARROW_STATIC=ON -DBoost_USE_STATIC_LIBS=ON -DBUILD_TESTS=$SKIP_CPP_TESTS -DPER_THREAD_DEFAULT_STREAM=$ENABLE_PTDS -DRMM_LOGGING_LEVEL=$RMM_LOGGING_LEVEL make -j$PARALLEL_LEVEL make install DESTDIR=$INSTALL_PREFIX From 4d6ea76be7b98926b09b93d5f4a309258c8843e4 Mon Sep 17 00:00:00 2001 From: Rong Ou Date: Wed, 31 Mar 2021 13:36:33 -0700 Subject: [PATCH 12/14] add copy methods in Java memory buffer (#7791) This should simplify the code in the rapids shuffle manager (see https://github.com/NVIDIA/spark-rapids/pull/2050). @jlowe @abellina @revans2 Authors: - Rong Ou (https://github.com/rongou) Approvers: - Jason Lowe (https://github.com/jlowe) URL: https://github.com/rapidsai/cudf/pull/7791 --- .../java/ai/rapids/cudf/MemoryBuffer.java | 33 ++++ .../java/ai/rapids/cudf/MemoryBufferTest.java | 171 ++++++++++++++++++ 2 files changed, 204 insertions(+) create mode 100644 java/src/test/java/ai/rapids/cudf/MemoryBufferTest.java diff --git a/java/src/main/java/ai/rapids/cudf/MemoryBuffer.java b/java/src/main/java/ai/rapids/cudf/MemoryBuffer.java index a1be9b561a0..9f0d9a451c0 100644 --- a/java/src/main/java/ai/rapids/cudf/MemoryBuffer.java +++ b/java/src/main/java/ai/rapids/cudf/MemoryBuffer.java @@ -146,6 +146,39 @@ public final long getAddress() { return address; } + /** + * Copy a subset of src to this buffer starting at destOffset using the specified CUDA stream. + * The copy has completed when this returns, but the memory copy could overlap with + * operations occurring on other streams. + * @param destOffset the offset in this to start copying from. + * @param src what to copy from + * @param srcOffset offset into src to start out + * @param length how many bytes to copy + * @param stream CUDA stream to use + */ + public final void copyFromMemoryBuffer( + long destOffset, MemoryBuffer src, long srcOffset, long length, Cuda.Stream stream) { + addressOutOfBoundsCheck(address + destOffset, length, "copy range dest"); + src.addressOutOfBoundsCheck(src.address + srcOffset, length, "copy range src"); + Cuda.memcpy(address + destOffset, src.address + srcOffset, length, CudaMemcpyKind.DEFAULT, stream); + } + + /** + * Copy a subset of src to this buffer starting at destOffset using the specified CUDA stream. + * The copy is async and may not have completed when this returns. + * @param destOffset the offset in this to start copying from. + * @param src what to copy from + * @param srcOffset offset into src to start out + * @param length how many bytes to copy + * @param stream CUDA stream to use + */ + public final void copyFromMemoryBufferAsync( + long destOffset, MemoryBuffer src, long srcOffset, long length, Cuda.Stream stream) { + addressOutOfBoundsCheck(address + destOffset, length, "copy range dest"); + src.addressOutOfBoundsCheck(src.address + srcOffset, length, "copy range src"); + Cuda.asyncMemcpy(address + destOffset, src.address + srcOffset, length, CudaMemcpyKind.DEFAULT, stream); + } + /** * Slice off a part of the buffer. Note that this is a zero copy operation and all * slices must be closed along with the original buffer before the memory is released. 
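A minimal usage sketch of the new methods (buffer sizes, offsets, and the stream choice are
illustrative; the tests below exercise the full host/device matrix):

    try (HostMemoryBuffer src = HostMemoryBuffer.allocate(16);
         DeviceMemoryBuffer dst = DeviceMemoryBuffer.allocate(16)) {
      dst.copyFromMemoryBuffer(0, src, 0, 16, Cuda.DEFAULT_STREAM);      // copy complete on return
      dst.copyFromMemoryBufferAsync(1, src, 2, 3, Cuda.DEFAULT_STREAM);  // may still be in flight
      Cuda.DEFAULT_STREAM.sync();                                        // wait before reading dst
    }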
diff --git a/java/src/test/java/ai/rapids/cudf/MemoryBufferTest.java b/java/src/test/java/ai/rapids/cudf/MemoryBufferTest.java new file mode 100644 index 00000000000..df710c71f63 --- /dev/null +++ b/java/src/test/java/ai/rapids/cudf/MemoryBufferTest.java @@ -0,0 +1,171 @@ +/* + * + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package ai.rapids.cudf; + +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.*; + +public class MemoryBufferTest extends CudfTestBase { + private static final byte[] BYTES = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + private static final byte[] EXPECTED = {0, 2, 3, 4, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + + @Test + public void testAddressOutOfBoundsExceptionWhenCopying() { + try (HostMemoryBuffer from = HostMemoryBuffer.allocate(16); + HostMemoryBuffer to = HostMemoryBuffer.allocate(16)) { + assertThrows(AssertionError.class, () -> to.copyFromMemoryBuffer(-1, from, 0, 16, Cuda.DEFAULT_STREAM)); + assertThrows(AssertionError.class, () -> to.copyFromMemoryBuffer(16, from, 0, 16, Cuda.DEFAULT_STREAM)); + assertThrows(AssertionError.class, () -> to.copyFromMemoryBuffer(0, from, -1, 16, Cuda.DEFAULT_STREAM)); + assertThrows(AssertionError.class, () -> to.copyFromMemoryBuffer(0, from, 16, 16, Cuda.DEFAULT_STREAM)); + assertThrows(AssertionError.class, () -> to.copyFromMemoryBuffer(0, from, 0, -1, Cuda.DEFAULT_STREAM)); + assertThrows(AssertionError.class, () -> to.copyFromMemoryBuffer(0, from, 0, 17, Cuda.DEFAULT_STREAM)); + assertThrows(AssertionError.class, () -> to.copyFromMemoryBuffer(1, from, 0, 16, Cuda.DEFAULT_STREAM)); + assertThrows(AssertionError.class, () -> to.copyFromMemoryBuffer(0, from, 1, 16, Cuda.DEFAULT_STREAM)); + } + } + + @Test + public void testAddressOutOfBoundsExceptionWhenCopyingAsync() { + try (HostMemoryBuffer from = HostMemoryBuffer.allocate(16); + HostMemoryBuffer to = HostMemoryBuffer.allocate(16)) { + assertThrows(AssertionError.class, () -> to.copyFromMemoryBufferAsync(-1, from, 0, 16, Cuda.DEFAULT_STREAM)); + assertThrows(AssertionError.class, () -> to.copyFromMemoryBufferAsync(16, from, 0, 16, Cuda.DEFAULT_STREAM)); + assertThrows(AssertionError.class, () -> to.copyFromMemoryBufferAsync(0, from, -1, 16, Cuda.DEFAULT_STREAM)); + assertThrows(AssertionError.class, () -> to.copyFromMemoryBufferAsync(0, from, 16, 16, Cuda.DEFAULT_STREAM)); + assertThrows(AssertionError.class, () -> to.copyFromMemoryBufferAsync(0, from, 0, -1, Cuda.DEFAULT_STREAM)); + assertThrows(AssertionError.class, () -> to.copyFromMemoryBufferAsync(0, from, 0, 17, Cuda.DEFAULT_STREAM)); + assertThrows(AssertionError.class, () -> to.copyFromMemoryBufferAsync(1, from, 0, 16, Cuda.DEFAULT_STREAM)); + assertThrows(AssertionError.class, () -> to.copyFromMemoryBufferAsync(0, from, 1, 16, Cuda.DEFAULT_STREAM)); + } + } + + @Test + public void testCopyingFromDeviceToDevice() { + try (HostMemoryBuffer in = HostMemoryBuffer.allocate(16); + DeviceMemoryBuffer from 
= DeviceMemoryBuffer.allocate(16); + DeviceMemoryBuffer to = DeviceMemoryBuffer.allocate(16); + HostMemoryBuffer out = HostMemoryBuffer.allocate(16)) { + in.setBytes(0, BYTES, 0, 16); + from.copyFromHostBuffer(in); + to.copyFromMemoryBuffer(0, from, 0, 16, Cuda.DEFAULT_STREAM); + to.copyFromMemoryBuffer(1, from, 2, 3, Cuda.DEFAULT_STREAM); + out.copyFromDeviceBuffer(to); + verifyOutput(out); + } + } + + @Test + public void testCopyingFromDeviceToDeviceAsync() { + try (HostMemoryBuffer in = HostMemoryBuffer.allocate(16); + DeviceMemoryBuffer from = DeviceMemoryBuffer.allocate(16); + DeviceMemoryBuffer to = DeviceMemoryBuffer.allocate(16); + HostMemoryBuffer out = HostMemoryBuffer.allocate(16)) { + in.setBytes(0, BYTES, 0, 16); + from.copyFromHostBuffer(in); + to.copyFromMemoryBufferAsync(0, from, 0, 16, Cuda.DEFAULT_STREAM); + to.copyFromMemoryBufferAsync(1, from, 2, 3, Cuda.DEFAULT_STREAM); + out.copyFromDeviceBufferAsync(to, Cuda.DEFAULT_STREAM); + Cuda.DEFAULT_STREAM.sync(); + verifyOutput(out); + } + } + + @Test + public void testCopyingFromHostToHost() { + try (HostMemoryBuffer from = HostMemoryBuffer.allocate(16); + HostMemoryBuffer to = HostMemoryBuffer.allocate(16)) { + from.setBytes(0, BYTES, 0, 16); + to.setBytes(0, BYTES, 0, 16); + to.copyFromMemoryBuffer(1, from, 2, 3, Cuda.DEFAULT_STREAM); + verifyOutput(to); + } + } + + @Test + public void testCopyingFromHostToHostAsync() { + try (HostMemoryBuffer from = HostMemoryBuffer.allocate(16); + HostMemoryBuffer to = HostMemoryBuffer.allocate(16)) { + from.setBytes(0, BYTES, 0, 16); + to.setBytes(0, BYTES, 0, 16); + to.copyFromMemoryBufferAsync(1, from, 2, 3, Cuda.DEFAULT_STREAM); + verifyOutput(to); + } + } + + @Test + public void testCopyingFromHostToDevice() { + try (HostMemoryBuffer from = HostMemoryBuffer.allocate(16); + DeviceMemoryBuffer to = DeviceMemoryBuffer.allocate(16); + HostMemoryBuffer out = HostMemoryBuffer.allocate(16)) { + from.setBytes(0, BYTES, 0, 16); + to.copyFromMemoryBuffer(0, from, 0, 16, Cuda.DEFAULT_STREAM); + to.copyFromMemoryBufferAsync(1, from, 2, 3, Cuda.DEFAULT_STREAM); + out.copyFromDeviceBuffer(to); + verifyOutput(out); + } + } + + @Test + public void testCopyingFromHostToDeviceAsync() { + try (HostMemoryBuffer from = HostMemoryBuffer.allocate(16); + DeviceMemoryBuffer to = DeviceMemoryBuffer.allocate(16); + HostMemoryBuffer out = HostMemoryBuffer.allocate(16)) { + from.setBytes(0, BYTES, 0, 16); + to.copyFromMemoryBufferAsync(0, from, 0, 16, Cuda.DEFAULT_STREAM); + to.copyFromMemoryBufferAsync(1, from, 2, 3, Cuda.DEFAULT_STREAM); + out.copyFromDeviceBufferAsync(to, Cuda.DEFAULT_STREAM); + Cuda.DEFAULT_STREAM.sync(); + verifyOutput(out); + } + } + + @Test + public void testCopyingFromDeviceToHost() { + try (HostMemoryBuffer in = HostMemoryBuffer.allocate(16); + DeviceMemoryBuffer from = DeviceMemoryBuffer.allocate(16); + HostMemoryBuffer to = HostMemoryBuffer.allocate(16)) { + in.setBytes(0, BYTES, 0, 16); + from.copyFromHostBuffer(in); + to.setBytes(0, BYTES, 0, 16); + to.copyFromMemoryBuffer(1, from, 2, 3, Cuda.DEFAULT_STREAM); + verifyOutput(to); + } + } + + @Test + public void testCopyingFromDeviceToHostAsync() { + try (HostMemoryBuffer in = HostMemoryBuffer.allocate(16); + DeviceMemoryBuffer from = DeviceMemoryBuffer.allocate(16); + HostMemoryBuffer to = HostMemoryBuffer.allocate(16)) { + in.setBytes(0, BYTES, 0, 16); + from.copyFromHostBuffer(in); + to.setBytes(0, BYTES, 0, 16); + to.copyFromMemoryBufferAsync(1, from, 2, 3, Cuda.DEFAULT_STREAM); + Cuda.DEFAULT_STREAM.sync(); + verifyOutput(to); + 
} + } + + private void verifyOutput(HostMemoryBuffer out) { + byte[] bytes = new byte[16]; + out.getBytes(bytes, 0, 0, 16); + assertArrayEquals(EXPECTED, bytes); + } +} From 9970f1df40c848615e30b06a2b3d95bd413f8532 Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Wed, 31 Mar 2021 16:23:10 -0500 Subject: [PATCH 13/14] Struct hashing support for SerialMurmur3 and SparkMurmur3 (#7714) Adding struct column support for serial Murmur3 and Spark-compatible Murmur3 hashing. This explodes the struct column into the leaf columns before passing it to the existing hash support. The validity of the parent struct columns can be ignored because hashing a null ends up as a no-op that returns the hash seed, so only the leaf columns within the struct column need to be considered for the hash computation. Authors: - Jason Lowe (https://github.com/jlowe) Approvers: - Robert (Bobby) Evans (https://github.com/revans2) - Conor Hoekstra (https://github.com/codereport) - Mark Harris (https://github.com/harrism) URL: https://github.com/rapidsai/cudf/pull/7714 --- .../cudf/detail/utilities/hash_functions.cuh | 32 +++++ cpp/src/hash/hashing.cu | 25 +++- cpp/tests/hashing/hash_test.cpp | 125 ++++++++++++------ .../java/ai/rapids/cudf/ColumnVector.java | 5 +- .../java/ai/rapids/cudf/ColumnVectorTest.java | 48 ++++++- 5 files changed, 186 insertions(+), 49 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/hash_functions.cuh b/cpp/include/cudf/detail/utilities/hash_functions.cuh index 31533a69487..e79107e32cf 100644 --- a/cpp/include/cudf/detail/utilities/hash_functions.cuh +++ b/cpp/include/cudf/detail/utilities/hash_functions.cuh @@ -542,6 +542,22 @@ hash_value_type CUDA_DEVICE_CALLABLE MurmurHash3_32::operator()(double c return this->compute_floating_point(key); } +template <> +hash_value_type CUDA_DEVICE_CALLABLE +MurmurHash3_32::operator()(cudf::list_view const& key) const +{ + cudf_assert(false && "List column hashing is not supported"); + return 0; +} + +template <> +hash_value_type CUDA_DEVICE_CALLABLE +MurmurHash3_32::operator()(cudf::struct_view const& key) const +{ + cudf_assert(false && "Direct hashing of struct_view is not supported"); + return 0; +} + template struct SparkMurmurHash3_32 { using argument_type = Key; @@ -671,6 +687,22 @@ SparkMurmurHash3_32::operator()(numeric::decimal64 const& ke return this->compute(key.value()); } +template <> +hash_value_type CUDA_DEVICE_CALLABLE +SparkMurmurHash3_32::operator()(cudf::list_view const& key) const +{ + cudf_assert(false && "List column hashing is not supported"); + return 0; +} + +template <> +hash_value_type CUDA_DEVICE_CALLABLE +SparkMurmurHash3_32::operator()(cudf::struct_view const& key) const +{ + cudf_assert(false && "Direct hashing of struct_view is not supported"); + return 0; +} + /** * @brief Specialization of MurmurHash3_32 operator for strings. */ diff --git a/cpp/src/hash/hashing.cu b/cpp/src/hash/hashing.cu index 16efb666b3e..53be019f73b 100644 --- a/cpp/src/hash/hashing.cu +++ b/cpp/src/hash/hashing.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -29,6 +29,8 @@
 
 #include 
 
+#include <algorithm>
+
 namespace cudf {
 namespace {
@@ -38,6 +40,22 @@ bool md5_type_check(data_type dt)
 {
   return !is_chrono(dt) && (is_fixed_width(dt) || (dt.id() == type_id::STRING));
 }
 
+template <typename IterType>
+std::vector<column_view> to_leaf_columns(IterType iter_begin, IterType iter_end)
+{
+  std::vector<column_view> leaf_columns;
+  std::for_each(iter_begin, iter_end, [&leaf_columns](column_view const& col) {
+    if (is_nested(col.type())) {
+      CUDF_EXPECTS(col.type().id() == type_id::STRUCT, "unsupported nested type");
+      auto child_columns = to_leaf_columns(col.child_begin(), col.child_end());
+      leaf_columns.insert(leaf_columns.end(), child_columns.begin(), child_columns.end());
+    } else {
+      leaf_columns.emplace_back(col);
+    }
+  });
+  return leaf_columns;
+}
+
 }  // namespace
 
 namespace detail {
@@ -133,10 +151,11 @@ std::unique_ptr<column> serial_murmur_hash3_32(table_view const& input,
 
   if (input.num_columns() == 0 || input.num_rows() == 0) { return output; }
 
-  auto const device_input = table_device_view::create(input, stream);
+  table_view const leaf_table(to_leaf_columns(input.begin(), input.end()));
+  auto const device_input = table_device_view::create(leaf_table, stream);
   auto output_view = output->mutable_view();
 
-  if (has_nulls(input)) {
+  if (has_nulls(leaf_table)) {
     thrust::tabulate(rmm::exec_policy(stream),
                      output_view.begin<int32_t>(),
                      output_view.end<int32_t>(),
diff --git a/cpp/tests/hashing/hash_test.cpp b/cpp/tests/hashing/hash_test.cpp
index 5641d445ff3..d928a17b3d1 100644
--- a/cpp/tests/hashing/hash_test.cpp
+++ b/cpp/tests/hashing/hash_test.cpp
@@ -257,20 +257,35 @@ TEST_F(SerialMurmurHash3Test, MultiValueWithSeeds)
   fixed_width_column_wrapper<bool> const bools_col1({0, 1, 1, 1, 0});
   fixed_width_column_wrapper<uint8_t> const bools_col2({0, 1, 2, 255, 0});
 
-  auto const input1 = cudf::table_view({strings_col});
-  auto const input2 = cudf::table_view({ints_col});
-  auto const input3 = cudf::table_view({strings_col, ints_col, bools_col1});
-  auto const input4 = cudf::table_view({strings_col, ints_col, bools_col2});
-
-  auto const hashed_output1 = cudf::hash(input1, cudf::hash_id::HASH_SERIAL_MURMUR3, {}, 314);
-  auto const hashed_output2 = cudf::hash(input2, cudf::hash_id::HASH_SERIAL_MURMUR3, {}, 42);
-  auto const hashed_output3 = cudf::hash(input3, cudf::hash_id::HASH_SERIAL_MURMUR3, {});
-  auto const hashed_output4 = cudf::hash(input4, cudf::hash_id::HASH_SERIAL_MURMUR3, {});
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(hashed_output1->view(), strings_col_result, true);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(hashed_output2->view(), ints_col_result, true);
-  EXPECT_EQ(input3.num_rows(), hashed_output3->size());
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(hashed_output3->view(), hashed_output4->view(), true);
+  std::vector<std::unique_ptr<cudf::column>> struct_field_cols;
+  struct_field_cols.emplace_back(std::make_unique<cudf::column>(strings_col));
+  struct_field_cols.emplace_back(std::make_unique<cudf::column>(ints_col));
+  struct_field_cols.emplace_back(std::make_unique<cudf::column>(bools_col1));
+  structs_column_wrapper structs_col(std::move(struct_field_cols));
+
+  auto const combo1 = cudf::table_view({strings_col, ints_col, bools_col1});
+  auto const combo2 = cudf::table_view({strings_col, ints_col, bools_col2});
+
+  constexpr auto hasher   = cudf::hash_id::HASH_SERIAL_MURMUR3;
+  auto const strings_hash = cudf::hash(cudf::table_view({strings_col}), hasher, {}, 314);
+  auto const ints_hash    = cudf::hash(cudf::table_view({ints_col}), hasher, {}, 42);
+  auto const combo1_hash  = cudf::hash(combo1, hasher, {});
+  auto const combo2_hash  = cudf::hash(combo2, hasher, {});
+  auto const structs_hash = 
cudf::hash(cudf::table_view({structs_col}), hasher, {}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*strings_hash, strings_col_result, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*ints_hash, ints_col_result, true); + EXPECT_EQ(combo1.num_rows(), combo1_hash->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*combo1_hash, *combo2_hash, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*structs_hash, *combo1_hash, true); +} + +TEST_F(SerialMurmurHash3Test, ListThrows) +{ + lists_column_wrapper strings_list_col({{""}, {"abc"}, {"123"}}); + EXPECT_THROW( + cudf::hash(cudf::table_view({strings_list_col}), cudf::hash_id::HASH_SERIAL_MURMUR3, {}), + cudf::logic_error); } class SparkMurmurHash3Test : public cudf::test::BaseFixture { @@ -280,31 +295,38 @@ TEST_F(SparkMurmurHash3Test, MultiValueWithSeeds) { // The hash values were determined by running the following Scala code in Apache Spark: // import org.apache.spark.sql.catalyst.util.DateTimeUtils - // val schema = new StructType().add("strings",StringType).add("doubles",DoubleType) - // .add("timestamps",TimestampType).add("decimal64", DecimalType(18,7)).add("longs",LongType) - // .add("floats",FloatType).add("dates",DateType).add("decimal32", DecimalType(9,3)) - // .add("ints",IntegerType).add("shorts",ShortType).add("bytes",ByteType) - // .add("bools",BooleanType) + // val schema = new StructType().add("structs", new StructType().add("a",IntegerType) + // .add("b",StringType).add("c",new StructType().add("x",FloatType).add("y",LongType))) + // .add("strings",StringType).add("doubles",DoubleType).add("timestamps",TimestampType) + // .add("decimal64", DecimalType(18,7)).add("longs",LongType).add("floats",FloatType) + // .add("dates",DateType).add("decimal32", DecimalType(9,3)).add("ints",IntegerType) + // .add("shorts",ShortType).add("bytes",ByteType).add("bools",BooleanType) // val data = Seq( - // Row("", 0.toDouble, DateTimeUtils.toJavaTimestamp(0), BigDecimal(0), 0.toLong, 0.toFloat, - // DateTimeUtils.toJavaDate(0), BigDecimal(0), 0, 0.toShort, 0.toByte, false), - // Row("The quick brown fox", -(0.toDouble), DateTimeUtils.toJavaTimestamp(100), - // BigDecimal("0.00001"), 100.toLong, -(0.toFloat), DateTimeUtils.toJavaDate(100), - // BigDecimal("0.1"), 100, 100.toShort, 100.toByte, true), - // Row("jumps over the lazy dog.", -Double.NaN, DateTimeUtils.toJavaTimestamp(-100), - // BigDecimal("-0.00001"), -100.toLong, -Float.NaN, DateTimeUtils.toJavaDate(-100), - // BigDecimal("-0.1"), -100, -100.toShort, -100.toByte, true), - // Row("All work and no play makes Jack a dull boy", Double.MinValue, - // DateTimeUtils.toJavaTimestamp(Long.MinValue/1000000), BigDecimal("-99999999999.9999999"), - // Long.MinValue, Float.MinValue, DateTimeUtils.toJavaDate(Int.MinValue/100), - // BigDecimal("-999999.999"), Int.MinValue, Short.MinValue, Byte.MinValue, true), - // Row("!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\ud720\ud721", Double.MaxValue, - // DateTimeUtils.toJavaTimestamp(Long.MaxValue/1000000), BigDecimal("99999999999.9999999"), - // Long.MaxValue, Float.MaxValue, DateTimeUtils.toJavaDate(Int.MaxValue/100), - // BigDecimal("999999.999"), Int.MaxValue, Short.MaxValue, Byte.MaxValue, false)) + // Row(Row(0, "a", Row(0f, 0L)), "", 0.toDouble, DateTimeUtils.toJavaTimestamp(0), BigDecimal(0), + // 0.toLong, 0.toFloat, DateTimeUtils.toJavaDate(0), BigDecimal(0), 0, 0.toShort, 0.toByte, + // false), + // Row(Row(100, "bc", Row(100f, 100L)), "The quick brown fox", -(0.toDouble), + // DateTimeUtils.toJavaTimestamp(100), BigDecimal("0.00001"), 100.toLong, -(0.toFloat), + // 
DateTimeUtils.toJavaDate(100), BigDecimal("0.1"), 100, 100.toShort, 100.toByte, true), + // Row(Row(-100, "def", Row(-100f, -100L)), "jumps over the lazy dog.", -Double.NaN, + // DateTimeUtils.toJavaTimestamp(-100), BigDecimal("-0.00001"), -100.toLong, -Float.NaN, + // DateTimeUtils.toJavaDate(-100), BigDecimal("-0.1"), -100, -100.toShort, -100.toByte, + // true), + // Row(Row(0x12345678, "ghij", Row(Float.PositiveInfinity, 0x123456789abcdefL)), + // "All work and no play makes Jack a dull boy", Double.MinValue, + // DateTimeUtils.toJavaTimestamp(Long.MinValue/1000000), BigDecimal("-99999999999.9999999"), + // Long.MinValue, Float.MinValue, DateTimeUtils.toJavaDate(Int.MinValue/100), + // BigDecimal("-999999.999"), Int.MinValue, Short.MinValue, Byte.MinValue, true), + // Row(Row(-0x76543210, "klmno", Row(Float.NegativeInfinity, -0x123456789abcdefL)), + // "!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\ud720\ud721", Double.MaxValue, + // DateTimeUtils.toJavaTimestamp(Long.MaxValue/1000000), BigDecimal("99999999999.9999999"), + // Long.MaxValue, Float.MaxValue, DateTimeUtils.toJavaDate(Int.MaxValue/100), + // BigDecimal("999999.999"), Int.MaxValue, Short.MaxValue, Byte.MaxValue, false)) // val df = spark.createDataFrame(sc.parallelize(data), schema) // df.columns.foreach(c => println(s"$c => ${df.select(hash(col(c))).collect.mkString(",")}")) // df.select(hash(col("*"))).collect + fixed_width_column_wrapper const hash_structs_expected( + {-105406170, 90479889, -678041645, 1667387937, 301478567}); fixed_width_column_wrapper const hash_strings_expected( {1467149710, 723257560, -1620282500, -2001858707, 1588473657}); fixed_width_column_wrapper const hash_doubles_expected( @@ -330,18 +352,26 @@ TEST_F(SparkMurmurHash3Test, MultiValueWithSeeds) fixed_width_column_wrapper const hash_bools_expected( {933211791, -559580957, -559580957, -559580957, 933211791}); fixed_width_column_wrapper const hash_combined_expected( - {-1947042614, -1731440908, 807283935, 725489209, 822276819}); + {-1172364561, -442972638, 1213234395, 796626751, 214075225}); + + using double_limits = std::numeric_limits; + using long_limits = std::numeric_limits; + using float_limits = std::numeric_limits; + using int_limits = std::numeric_limits; + fixed_width_column_wrapper a_col{0, 100, -100, 0x12345678, -0x76543210}; + strings_column_wrapper b_col{"a", "bc", "def", "ghij", "klmno"}; + fixed_width_column_wrapper x_col{ + 0.f, 100.f, -100.f, float_limits::infinity(), -float_limits::infinity()}; + fixed_width_column_wrapper y_col{ + 0L, 100L, -100L, 0x123456789abcdefL, -0x123456789abcdefL}; + structs_column_wrapper c_col{{x_col, y_col}}; + structs_column_wrapper const structs_col{{a_col, b_col, c_col}}; strings_column_wrapper const strings_col({"", "The quick brown fox", "jumps over the lazy dog.", "All work and no play makes Jack a dull boy", "!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\ud720\ud721"}); - - using double_limits = std::numeric_limits; - using long_limits = std::numeric_limits; - using float_limits = std::numeric_limits; - using int_limits = std::numeric_limits; fixed_width_column_wrapper const doubles_col( {0., -0., -double_limits::quiet_NaN(), double_limits::lowest(), double_limits::max()}); fixed_width_column_wrapper const timestamps_col( @@ -364,6 +394,7 @@ TEST_F(SparkMurmurHash3Test, MultiValueWithSeeds) fixed_width_column_wrapper const bools_col2({0, 1, 2, 255, 0}); constexpr auto hasher = cudf::hash_id::HASH_SPARK_MURMUR3; + auto const hash_structs = cudf::hash(cudf::table_view({structs_col}), hasher, {}, 42); auto const 
hash_strings = cudf::hash(cudf::table_view({strings_col}), hasher, {}, 314); auto const hash_doubles = cudf::hash(cudf::table_view({doubles_col}), hasher, {}, 42); auto const hash_timestamps = cudf::hash(cudf::table_view({timestamps_col}), hasher, {}, 42); @@ -378,6 +409,7 @@ TEST_F(SparkMurmurHash3Test, MultiValueWithSeeds) auto const hash_bools1 = cudf::hash(cudf::table_view({bools_col1}), hasher, {}, 42); auto const hash_bools2 = cudf::hash(cudf::table_view({bools_col2}), hasher, {}, 42); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_structs, hash_structs_expected, true); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_strings, hash_strings_expected, true); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_doubles, hash_doubles_expected, true); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_timestamps, hash_timestamps_expected, true); @@ -392,7 +424,8 @@ TEST_F(SparkMurmurHash3Test, MultiValueWithSeeds) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_bools1, hash_bools_expected, true); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_bools2, hash_bools_expected, true); - auto const combined_table = cudf::table_view({strings_col, + auto const combined_table = cudf::table_view({structs_col, + strings_col, doubles_col, timestamps_col, decimal64_col, @@ -408,6 +441,14 @@ TEST_F(SparkMurmurHash3Test, MultiValueWithSeeds) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_combined, hash_combined_expected, true); } +TEST_F(SparkMurmurHash3Test, ListThrows) +{ + lists_column_wrapper strings_list_col({{""}, {"abc"}, {"123"}}); + EXPECT_THROW( + cudf::hash(cudf::table_view({strings_list_col}), cudf::hash_id::HASH_SPARK_MURMUR3, {}), + cudf::logic_error); +} + class MD5HashTest : public cudf::test::BaseFixture { }; diff --git a/java/src/main/java/ai/rapids/cudf/ColumnVector.java b/java/src/main/java/ai/rapids/cudf/ColumnVector.java index e6675591164..fcdb5d44ad3 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnVector.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnVector.java @@ -570,8 +570,7 @@ public static ColumnVector serial32BitMurmurHash3(int seed, ColumnView columns[] assert columns[i] != null : "Column vectors passed may not be null"; assert columns[i].getRowCount() == size : "Row count mismatch, all columns must be the same size"; assert !columns[i].getType().isDurationType() : "Unsupported column type Duration"; - assert !columns[i].getType().isTimestampType() : "Unsupported column type Timestamp"; - assert !columns[i].getType().isNestedType() : "Unsupported column of nested type"; + assert !columns[i].getType().equals(DType.LIST) : "List columns are not supported"; columnViews[i] = columns[i].getNativeView(); } return new ColumnVector(hash(columnViews, HashType.HASH_SERIAL_MURMUR3.getNativeId(), new int[0], seed)); @@ -606,7 +605,7 @@ public static ColumnVector spark32BitMurmurHash3(int seed, ColumnView columns[]) assert columns[i] != null : "Column vectors passed may not be null"; assert columns[i].getRowCount() == size : "Row count mismatch, all columns must be the same size"; assert !columns[i].getType().isDurationType() : "Unsupported column type Duration"; - assert !columns[i].getType().isNestedType() : "Unsupported column of nested type"; + assert !columns[i].getType().equals(DType.LIST) : "List columns are not supported"; columnViews[i] = columns[i].getNativeView(); } return new ColumnVector(hash(columnViews, HashType.HASH_SPARK_MURMUR3.getNativeId(), new int[0], seed)); diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index ce2c287a1c8..36123704ae6 100644 --- 
a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -490,6 +490,25 @@ void testSerial32BitMurmur3HashMixed() { } } + @Test + void testSerial32BitMurmur3HashStruct() { + try (ColumnVector strings = ColumnVector.fromStrings( + "a", "B\n", "dE\"\u0100\t\u0101 \ud720\ud721", + "A very long (greater than 128 bytes/char string) to test a multi hash-step data point " + + "in the MD5 hash function. This string needed to be longer.", + null, null); + ColumnVector integers = ColumnVector.fromBoxedInts(0, 100, -100, Integer.MIN_VALUE, Integer.MAX_VALUE, null); + ColumnVector doubles = ColumnVector.fromBoxedDoubles( + 0.0, 100.0, -100.0, POSITIVE_DOUBLE_NAN_LOWER_RANGE, POSITIVE_DOUBLE_NAN_UPPER_RANGE, null); + ColumnVector floats = ColumnVector.fromBoxedFloats( + 0f, 100f, -100f, NEGATIVE_FLOAT_NAN_LOWER_RANGE, NEGATIVE_FLOAT_NAN_UPPER_RANGE, null); + ColumnVector bools = ColumnVector.fromBoxedBooleans(true, false, null, false, true, null); + ColumnVector result = ColumnVector.serial32BitMurmurHash3(1868, new ColumnVector[]{strings, integers, doubles, floats, bools}); + ColumnVector expected = ColumnVector.fromBoxedInts(387200465, 1988790727, 774895031, 814731646, -1073686048, 1868)) { + assertColumnsAreEqual(expected, result); + } + } + @Test void testSpark32BitMurmur3HashStrings() { try (ColumnVector v0 = ColumnVector.fromStrings( @@ -529,6 +548,8 @@ void testSpark32BitMurmur3HashDoubles() { @Test void testSpark32BitMurmur3HashTimestamps() { + // The hash values were derived from Apache Spark in a manner similar to the one documented at + // https://github.com/rapidsai/cudf/blob/aa7ca46dcd9e/cpp/tests/hashing/hash_test.cpp#L281-L307 try (ColumnVector v = ColumnVector.timestampMicroSecondsFromBoxedLongs( 0L, null, 100L, -100L, 0x123456789abcdefL, null, -0x123456789abcdefL); ColumnVector result = ColumnVector.spark32BitMurmurHash3(42, new ColumnVector[]{v}); @@ -539,6 +560,8 @@ void testSpark32BitMurmur3HashTimestamps() { @Test void testSpark32BitMurmur3HashDecimal64() { + // The hash values were derived from Apache Spark in a manner similar to the one documented at + // https://github.com/rapidsai/cudf/blob/aa7ca46dcd9e/cpp/tests/hashing/hash_test.cpp#L281-L307 try (ColumnVector v = ColumnVector.decimalFromLongs(-7, 0L, 100L, -100L, 0x123456789abcdefL, -0x123456789abcdefL); ColumnVector result = ColumnVector.spark32BitMurmurHash3(42, new ColumnVector[]{v}); @@ -549,6 +572,8 @@ void testSpark32BitMurmur3HashDecimal64() { @Test void testSpark32BitMurmur3HashDecimal32() { + // The hash values were derived from Apache Spark in a manner similar to the one documented at + // https://github.com/rapidsai/cudf/blob/aa7ca46dcd9e/cpp/tests/hashing/hash_test.cpp#L281-L307 try (ColumnVector v = ColumnVector.decimalFromInts(-3, 0, 100, -100, 0x12345678, -0x12345678); ColumnVector result = ColumnVector.spark32BitMurmurHash3(42, new ColumnVector[]{v}); @@ -559,6 +584,8 @@ void testSpark32BitMurmur3HashDecimal32() { @Test void testSpark32BitMurmur3HashDates() { + // The hash values were derived from Apache Spark in a manner similar to the one documented at + // https://github.com/rapidsai/cudf/blob/aa7ca46dcd9e/cpp/tests/hashing/hash_test.cpp#L281-L307 try (ColumnVector v = ColumnVector.timestampDaysFromBoxedInts( 0, null, 100, -100, 0x12345678, null, -0x12345678); ColumnVector result = ColumnVector.spark32BitMurmurHash3(42, new ColumnVector[]{v}); @@ -587,7 +614,6 @@ void testSpark32BitMurmur3HashBools() { 
ColumnVector result = ColumnVector.spark32BitMurmurHash3(0, new ColumnVector[]{v0, v1}); ColumnVector expected = ColumnVector.fromBoxedInts(0, -1589400010, -239939054, -68075478, 593689054, -1194558265)) { assertColumnsAreEqual(expected, result); - } } @@ -610,6 +636,26 @@ void testSpark32BitMurmur3HashMixed() { } } + @Test + void testSpark32BitMurmur3HashStruct() { + try (ColumnVector strings = ColumnVector.fromStrings( + "a", "B\n", "dE\"\u0100\t\u0101 \ud720\ud721", + "A very long (greater than 128 bytes/char string) to test a multi hash-step data point " + + "in the MD5 hash function. This string needed to be longer.", + null, null); + ColumnVector integers = ColumnVector.fromBoxedInts(0, 100, -100, Integer.MIN_VALUE, Integer.MAX_VALUE, null); + ColumnVector doubles = ColumnVector.fromBoxedDoubles( + 0.0, 100.0, -100.0, POSITIVE_DOUBLE_NAN_LOWER_RANGE, POSITIVE_DOUBLE_NAN_UPPER_RANGE, null); + ColumnVector floats = ColumnVector.fromBoxedFloats( + 0f, 100f, -100f, NEGATIVE_FLOAT_NAN_LOWER_RANGE, NEGATIVE_FLOAT_NAN_UPPER_RANGE, null); + ColumnVector bools = ColumnVector.fromBoxedBooleans(true, false, null, false, true, null); + ColumnView structs = ColumnView.makeStructView(strings, integers, doubles, floats, bools); + ColumnVector result = ColumnVector.spark32BitMurmurHash3(1868, new ColumnView[]{structs}); + ColumnVector expected = ColumnVector.spark32BitMurmurHash3(1868, new ColumnVector[]{strings, integers, doubles, floats, bools})) { + assertColumnsAreEqual(expected, result); + } + } + @Test void testAndNullReconfigureNulls() { try (ColumnVector v0 = ColumnVector.fromBoxedInts(0, 100, null, null, Integer.MIN_VALUE, null); From 684bb146b918bdbaaf3d7ce47d00b51245fe12e7 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 31 Mar 2021 16:23:37 -0500 Subject: [PATCH 14/14] Fix inplace update of data and add Series.update (#7201) Fixes: #7187 This PR: - [x] Fixes inplace manipulation of columns. - [x] Introduces `Series.update` - [x] Fixes incorrect dtype handling in `Frame.where` Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Ashwin Srinath (https://github.com/shwina) - Keith Kraus (https://github.com/kkraus14) URL: https://github.com/rapidsai/cudf/pull/7201 --- python/cudf/cudf/_lib/copying.pyx | 6 +- python/cudf/cudf/core/__init__.py | 4 +- python/cudf/cudf/core/_internals/__init__.py | 3 + python/cudf/cudf/core/_internals/where.py | 383 +++++++++++++++++++ python/cudf/cudf/core/dataframe.py | 8 +- python/cudf/cudf/core/frame.py | 232 +---------- python/cudf/cudf/core/series.py | 104 +++++ python/cudf/cudf/tests/test_dataframe.py | 13 +- python/cudf/cudf/tests/test_replace.py | 27 +- python/cudf/cudf/tests/test_series.py | 49 +++ python/cudf/cudf/utils/dtypes.py | 66 +++- 11 files changed, 646 insertions(+), 249 deletions(-) create mode 100644 python/cudf/cudf/core/_internals/__init__.py create mode 100644 python/cudf/cudf/core/_internals/where.py diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index 4c72ba2e055..8f93866612e 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. 
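(Editor's note: a hedged preview, not part of this hunk, of the headline API the patch adds in series.py further below. Series.update aligns on index and copies only non-NA values:

    import numpy as np
    import cudf

    s = cudf.Series([1, 2, 3])
    s.update(cudf.Series([4, np.nan, 6], nan_as_null=False))
    # s is now [4, 2, 6]; the NaN position leaves the original value intact.
)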
import pandas as pd @@ -564,11 +564,11 @@ def copy_if_else(object lhs, object rhs, Column boolean_mask): return _copy_if_else_column_column(lhs, rhs, boolean_mask) else: return _copy_if_else_column_scalar( - lhs, as_device_scalar(rhs, lhs.dtype), boolean_mask) + lhs, as_device_scalar(rhs), boolean_mask) else: if isinstance(rhs, Column): return _copy_if_else_scalar_column( - as_device_scalar(lhs, rhs.dtype), rhs, boolean_mask) + as_device_scalar(lhs), rhs, boolean_mask) else: if lhs is None and rhs is None: return lhs diff --git a/python/cudf/cudf/core/__init__.py b/python/cudf/cudf/core/__init__.py index 91a369c31f8..59173cc0247 100644 --- a/python/cudf/cudf/core/__init__.py +++ b/python/cudf/cudf/core/__init__.py @@ -1,6 +1,6 @@ -# Copyright (c) 2018-2020, NVIDIA CORPORATION. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. -from cudf.core import buffer, column, column_accessor, common +from cudf.core import _internals, buffer, column, column_accessor, common from cudf.core.buffer import Buffer from cudf.core.dataframe import DataFrame, from_pandas, merge from cudf.core.index import ( diff --git a/python/cudf/cudf/core/_internals/__init__.py b/python/cudf/cudf/core/_internals/__init__.py new file mode 100644 index 00000000000..53d186def85 --- /dev/null +++ b/python/cudf/cudf/core/_internals/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. + +from cudf.core._internals.where import where diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py new file mode 100644 index 00000000000..1fdc907875e --- /dev/null +++ b/python/cudf/cudf/core/_internals/where.py @@ -0,0 +1,383 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. + +import warnings +from typing import Any, Optional, Tuple, Union, cast + +import numpy as np +import pandas as pd + +import cudf +from cudf._typing import ColumnLike, ScalarLike +from cudf.core.column import ColumnBase +from cudf.core.dataframe import DataFrame +from cudf.core.frame import Frame +from cudf.core.index import Index +from cudf.core.series import Series + + +def _normalize_scalars(col: ColumnBase, other: ScalarLike) -> ScalarLike: + """ + Try to normalize scalar values as per col dtype + """ + if (isinstance(other, float) and not np.isnan(other)) and ( + col.dtype.type(other) != other + ): + raise TypeError( + f"Cannot safely cast non-equivalent " + f"{type(other).__name__} to {col.dtype.name}" + ) + + return cudf.Scalar(other, dtype=col.dtype if other is None else None) + + +def _check_and_cast_columns_with_other( + source_col: ColumnBase, + other: Union[ScalarLike, ColumnBase], + inplace: bool, +) -> Tuple[ColumnBase, Union[ScalarLike, ColumnBase]]: + """ + Returns type-casted column `source_col` & scalar `other_scalar` + based on `inplace` parameter. + """ + if cudf.utils.dtypes.is_categorical_dtype(source_col.dtype): + return source_col, other + + if cudf.utils.dtypes.is_scalar(other): + device_obj = _normalize_scalars(source_col, other) + else: + device_obj = other + + if other is None: + return source_col, device_obj + elif cudf.utils.dtypes.is_mixed_with_object_dtype(device_obj, source_col): + raise TypeError( + "cudf does not support mixed types, please type-cast " + "the column of dataframe/series and other " + "to same dtypes." 
+ ) + if inplace: + if not cudf.utils.dtypes._can_cast(device_obj.dtype, source_col.dtype): + warnings.warn( + f"Type-casting from {device_obj.dtype} " + f"to {source_col.dtype}, there could be potential data loss" + ) + return source_col, device_obj.astype(source_col.dtype) + else: + if ( + cudf.utils.dtypes.is_scalar(other) + and cudf.utils.dtypes.is_numerical_dtype(source_col.dtype) + and cudf.utils.dtypes._can_cast(other, source_col.dtype) + ): + common_dtype = source_col.dtype + return ( + source_col.astype(common_dtype), + cudf.Scalar(other, dtype=common_dtype), + ) + else: + common_dtype = cudf.utils.dtypes.find_common_type( + [ + source_col.dtype, + np.min_scalar_type(other) + if cudf.utils.dtypes.is_scalar(other) + else other.dtype, + ] + ) + if cudf.utils.dtypes.is_scalar(device_obj): + device_obj = cudf.Scalar(other, dtype=common_dtype) + else: + device_obj = device_obj.astype(common_dtype) + return source_col.astype(common_dtype), device_obj + + +def _normalize_columns_and_scalars_type( + frame: Union[Series, Index, DataFrame], other: Any, inplace: bool = False, +) -> Tuple[ + Union[Series, Index, DataFrame, ColumnLike], Any, +]: + """ + Try to normalize the other's dtypes as per frame. + + Parameters + ---------- + + frame : Can be a DataFrame or Series or Index + other : Can be a DataFrame, Series, Index, Array + like object or a scalar value + + if frame is DataFrame, other can be only a + scalar or array like with size of number of columns + in DataFrame or a DataFrame with same dimension + + if frame is Series, other can be only a scalar or + a series like with same length as frame + + Returns: + -------- + A dataframe/series/list/scalar form of normalized other + """ + if isinstance(frame, DataFrame) and isinstance(other, DataFrame): + source_df = frame.copy(deep=False) + other_df = other.copy(deep=False) + for self_col in source_df._column_names: + source_col, other_col = _check_and_cast_columns_with_other( + source_col=source_df._data[self_col], + other=other_df._data[self_col], + inplace=inplace, + ) + source_df._data[self_col] = source_col + other_df._data[self_col] = other_col + return source_df, other_df + + elif isinstance( + frame, (Series, Index) + ) and not cudf.utils.dtypes.is_scalar(other): + other = cudf.core.column.as_column(other) + input_col = frame._data[frame.name] + return _check_and_cast_columns_with_other( + source_col=input_col, other=other, inplace=inplace + ) + else: + # Handles scalar or list/array like scalars + if isinstance(frame, (Series, Index)) and cudf.utils.dtypes.is_scalar( + other + ): + input_col = frame._data[frame.name] + return _check_and_cast_columns_with_other( + source_col=frame._data[frame.name], + other=other, + inplace=inplace, + ) + + elif isinstance(frame, DataFrame): + if cudf.utils.dtypes.is_scalar(other): + other = [other for i in range(len(frame._column_names))] + + source_df = frame.copy(deep=False) + others = [] + for col_name, other_sclr in zip(frame._column_names, other): + + ( + source_col, + other_scalar, + ) = _check_and_cast_columns_with_other( + source_col=source_df._data[col_name], + other=other_sclr, + inplace=inplace, + ) + source_df._data[col_name] = source_col + others.append(other_scalar) + return source_df, others + else: + raise ValueError( + f"Inappropriate input {type(frame)} " + f"and other {type(other)} combination" + ) + + +def where( + frame: Union[Series, Index, DataFrame], + cond: Any, + other: Any = None, + inplace: bool = False, +) -> Optional[Union[Frame]]: + """ + Replace values where the 
condition is False. + + Parameters + ---------- + cond : bool Series/DataFrame, array-like + Where cond is True, keep the original value. + Where False, replace with corresponding value from other. + Callables are not supported. + other: scalar, list of scalars, Series/DataFrame + Entries where cond is False are replaced with + corresponding value from other. Callables are not + supported. Default is None. + + DataFrame expects only Scalar or array like with scalars or + dataframe with same dimension as frame. + + Series expects only scalar or series like with same length + inplace : bool, default False + Whether to perform the operation in place on the data. + + Returns + ------- + Same type as caller + + Examples + -------- + >>> import cudf + >>> df = DataFrame({"A":[1, 4, 5], "B":[3, 5, 8]}) + >>> df.where(df % 2 == 0, [-1, -1]) + A B + 0 -1 -1 + 1 4 -1 + 2 -1 8 + + >>> ser = Series([4, 3, 2, 1, 0]) + >>> ser.where(ser > 2, 10) + 0 4 + 1 3 + 2 10 + 3 10 + 4 10 + dtype: int64 + >>> ser.where(ser > 2) + 0 4 + 1 3 + 2 + 3 + 4 + dtype: int64 + """ + + if isinstance(frame, DataFrame): + if hasattr(cond, "__cuda_array_interface__"): + cond = DataFrame( + cond, columns=frame._column_names, index=frame.index + ) + elif ( + hasattr(cond, "__array_interface__") + and cond.__array_interface__["shape"] != frame.shape + ): + raise ValueError("conditional must be same shape as self") + elif not isinstance(cond, DataFrame): + cond = frame.from_pandas(pd.DataFrame(cond)) + + common_cols = set(frame._column_names).intersection( + set(cond._column_names) + ) + if len(common_cols) > 0: + # If `frame` and `cond` are having unequal index, + # then re-index `cond`. + if not frame.index.equals(cond.index): + cond = cond.reindex(frame.index) + else: + if cond.shape != frame.shape: + raise ValueError( + """Array conditional must be same shape as self""" + ) + # Setting `frame` column names to `cond` + # as `cond` has no column names. + cond.columns = frame.columns + + (source_df, others,) = _normalize_columns_and_scalars_type( + frame, other + ) + if isinstance(other, Frame): + others = others._data.columns + + out_df = DataFrame(index=frame.index) + if len(frame._columns) != len(others): + raise ValueError( + """Replacement list length or number of dataframe columns + should be equal to Number of columns of dataframe""" + ) + for i, column_name in enumerate(frame._column_names): + input_col = source_df._data[column_name] + other_column = others[i] + if column_name in cond._data: + if isinstance(input_col, cudf.core.column.CategoricalColumn): + if cudf.utils.dtypes.is_scalar(other_column): + try: + other_column = input_col._encode(other_column) + except ValueError: + # When other is not present in categories, + # fill with Null. 
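+ # (Editor's note: _encode returns the integer category code for a value;
+ # a value absent from the categories cannot be encoded, so the
+ # replacement becomes null here rather than raising.)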
+ other_column = None + other_column = cudf.Scalar( + other_column, dtype=input_col.codes.dtype + ) + elif isinstance( + other_column, cudf.core.column.CategoricalColumn + ): + other_column = other_column.codes + input_col = input_col.codes + + result = cudf._lib.copying.copy_if_else( + input_col, other_column, cond._data[column_name] + ) + + if isinstance( + frame._data[column_name], + cudf.core.column.CategoricalColumn, + ): + result = cudf.core.column.build_categorical_column( + categories=frame._data[column_name].categories, + codes=cudf.core.column.as_column( + result.base_data, dtype=result.dtype + ), + mask=result.base_mask, + size=result.size, + offset=result.offset, + ordered=frame._data[column_name].ordered, + ) + else: + out_mask = cudf._lib.null_mask.create_null_mask( + len(input_col), + state=cudf._lib.null_mask.MaskState.ALL_NULL, + ) + result = input_col.set_mask(out_mask) + out_df[column_name] = frame[column_name].__class__(result) + + return frame._mimic_inplace(out_df, inplace=inplace) + + else: + if isinstance(other, DataFrame): + raise NotImplementedError( + "cannot align with a higher dimensional Frame" + ) + input_col = frame._data[frame.name] + cond = cudf.core.column.as_column(cond) + if len(cond) != len(frame): + raise ValueError( + """Array conditional must be same shape as self""" + ) + + (input_col, other,) = _normalize_columns_and_scalars_type( + frame, other, inplace + ) + + if isinstance(input_col, cudf.core.column.CategoricalColumn): + if cudf.utils.dtypes.is_scalar(other): + try: + other = input_col._encode(other) + except ValueError: + # When other is not present in categories, + # fill with Null. + other = None + other = cudf.Scalar(other, dtype=input_col.codes.dtype) + elif isinstance(other, cudf.core.column.CategoricalColumn): + other = other.codes + + input_col = input_col.codes + + result = cudf._lib.copying.copy_if_else(input_col, other, cond) + + if isinstance( + frame._data[frame.name], cudf.core.column.CategoricalColumn + ): + result = cudf.core.column.build_categorical_column( + categories=cast( + cudf.core.column.CategoricalColumn, + frame._data[frame.name], + ).categories, + codes=cudf.core.column.as_column( + result.base_data, dtype=result.dtype + ), + mask=result.base_mask, + size=result.size, + offset=result.offset, + ordered=cast( + cudf.core.column.CategoricalColumn, + frame._data[frame.name], + ).ordered, + ) + + if isinstance(frame, Index): + result = Index(result, name=frame.name) + else: + result = frame._copy_construct(data=result) + + return frame._mimic_inplace(result, inplace=inplace) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 01b96151485..6639fc7c25c 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1658,8 +1658,9 @@ def update( if not self.index.equals(other.index): other = other.reindex(self.index, axis=0) - for col in self.columns: - this = self[col] + source_df = self.copy(deep=False) + for col in source_df._column_names: + this = source_df[col] that = other[col] if errors == "raise": @@ -1676,8 +1677,9 @@ def update( # don't overwrite columns unnecessarily if mask.all(): continue + source_df[col] = source_df[col].where(mask, that) - self[col] = this.where(mask, that) + self._mimic_inplace(source_df, inplace=True) def __add__(self, other): return self._apply_op("__add__", other) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index fb746d6c794..bc43c367833 100644 --- a/python/cudf/cudf/core/frame.py +++ 
b/python/cudf/cudf/core/frame.py @@ -6,7 +6,7 @@ import functools import warnings from collections import OrderedDict, abc as abc -from typing import TYPE_CHECKING, Any, Dict, Tuple, TypeVar, Union, overload +from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, TypeVar, Union import cupy import numpy as np @@ -14,7 +14,6 @@ import pyarrow as pa from nvtx import annotate from pandas.api.types import is_dict_like, is_dtype_equal -from typing_extensions import Literal import cudf from cudf import _lib as libcudf @@ -53,19 +52,9 @@ class Frame(libcudf.table.Table): def _from_table(cls, table: Frame): return cls(table._data, index=table._index) - @overload - def _mimic_inplace(self, result: Frame) -> Frame: - ... - - @overload - def _mimic_inplace(self, result: Frame, inplace: Literal[True]): - ... - - @overload - def _mimic_inplace(self, result: Frame, inplace: Literal[False]) -> Frame: - ... - - def _mimic_inplace(self, result, inplace=False): + def _mimic_inplace( + self: T, result: Frame, inplace: bool = False + ) -> Optional[Frame]: if inplace: for col in self._data: if col in result._data: @@ -74,6 +63,7 @@ def _mimic_inplace(self, result, inplace=False): ) self._data = result._data self._index = result._index + return None else: return result @@ -796,87 +786,6 @@ def clip(self, lower=None, upper=None, inplace=False, axis=1): return self._mimic_inplace(output, inplace=inplace) - def _normalize_scalars(self, other): - """ - Try to normalizes scalar values as per self dtype - """ - if ( - other is not None - and (isinstance(other, float) and not np.isnan(other)) - ) and (self.dtype.type(other) != other): - raise TypeError( - f"Cannot safely cast non-equivalent " - f"{type(other).__name__} to {self.dtype.name}" - ) - - return ( - self.dtype.type(other) - if ( - other is not None - and (isinstance(other, float) and not np.isnan(other)) - ) - else other - ) - - def _normalize_columns_and_scalars_type(self, other): - """ - Try to normalize the other's dtypes as per self. - - Parameters - ---------- - - self : Can be a DataFrame or Series or Index - other : Can be a DataFrame, Series, Index, Array - like object or a scalar value - - if self is DataFrame, other can be only a - scalar or array like with size of number of columns - in DataFrame or a DataFrame with same dimension - - if self is Series, other can be only a scalar or - a series like with same length as self - - Returns: - -------- - A dataframe/series/list/scalar form of normalized other - """ - if isinstance(self, cudf.DataFrame) and isinstance( - other, cudf.DataFrame - ): - return [ - other[self_col].astype(self._data[self_col].dtype)._column - for self_col in self._data.names - ] - - elif isinstance(self, (cudf.Series, cudf.Index)) and not is_scalar( - other - ): - other = as_column(other) - return other.astype(self.dtype) - - else: - # Handles scalar or list/array like scalars - if isinstance(self, (cudf.Series, cudf.Index)) and is_scalar( - other - ): - return self._normalize_scalars(other) - - elif isinstance(self, cudf.DataFrame): - out = [] - if is_scalar(other): - other = [other for i in range(len(self._data.names))] - out = [ - self[in_col_name]._normalize_scalars(sclr) - for in_col_name, sclr in zip(self._data.names, other) - ] - - return out - else: - raise ValueError( - f"Inappropriate input {type(self)} " - f"and other {type(other)} combination" - ) - def where(self, cond, other=None, inplace=False): """ Replace values where the condition is False. 
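(Editor's note: the next hunk replaces the roughly 130-line inline implementation with a single delegation to the new cudf.core._internals.where module added earlier in this patch. A hedged sketch of the preserved behavior, taken from the docstring and the tests in this patch:

    import cudf

    ser = cudf.Series([4, 3, 2, 1, 0])
    ser.where(ser > 2, 10)  # [4, 3, 10, 10, 10]; the int64 dtype is kept
    ser.where(ser > 2)      # [4, 3, <NA>, <NA>, <NA>]

    # Mixing incompatible dtypes now raises instead of coercing:
    cudf.Series(["a", "b", "c"]).where([True, False, True], [1, 2, 3])
    # TypeError: cudf does not support mixed types, ...
)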
@@ -930,133 +839,9 @@ def where(self, cond, other=None, inplace=False): dtype: int64 """ - if isinstance(self, cudf.DataFrame): - if hasattr(cond, "__cuda_array_interface__"): - cond = cudf.DataFrame( - cond, columns=self._data.names, index=self.index - ) - elif not isinstance(cond, cudf.DataFrame): - cond = self.from_pandas(pd.DataFrame(cond)) - - common_cols = set(self._data.names).intersection( - set(cond._data.names) - ) - if len(common_cols) > 0: - # If `self` and `cond` are having unequal index, - # then re-index `cond`. - if not self.index.equals(cond.index): - cond = cond.reindex(self.index) - else: - if cond.shape != self.shape: - raise ValueError( - """Array conditional must be same shape as self""" - ) - # Setting `self` column names to `cond` - # as `cond` has no column names. - cond.columns = self.columns - - other = self._normalize_columns_and_scalars_type(other) - out_df = cudf.DataFrame(index=self.index) - if len(self._columns) != len(other): - raise ValueError( - """Replacement list length or number of dataframe columns - should be equal to Number of columns of dataframe""" - ) - - for column_name, other_column in zip(self._data.names, other): - input_col = self._data[column_name] - if column_name in cond._data: - if isinstance( - input_col, cudf.core.column.CategoricalColumn - ): - if np.isscalar(other_column): - try: - other_column = input_col._encode(other_column) - except ValueError: - # When other is not present in categories, - # fill with Null. - other_column = None - elif hasattr(other_column, "codes"): - other_column = other_column.codes - input_col = input_col.codes - - result = libcudf.copying.copy_if_else( - input_col, other_column, cond._data[column_name] - ) - - if isinstance( - self._data[column_name], - cudf.core.column.CategoricalColumn, - ): - result = build_categorical_column( - categories=self._data[column_name].categories, - codes=as_column( - result.base_data, dtype=result.dtype - ), - mask=result.base_mask, - size=result.size, - offset=result.offset, - ordered=self._data[column_name].ordered, - ) - else: - from cudf._lib.null_mask import MaskState, create_null_mask - - out_mask = create_null_mask( - len(input_col), state=MaskState.ALL_NULL - ) - result = input_col.set_mask(out_mask) - out_df[column_name] = self[column_name].__class__(result) - - return self._mimic_inplace(out_df, inplace=inplace) - - else: - - if isinstance(other, cudf.DataFrame): - raise NotImplementedError( - "cannot align with a higher dimensional Frame" - ) - - other = self._normalize_columns_and_scalars_type(other) - - cond = as_column(cond) - if len(cond) != len(self): - raise ValueError( - """Array conditional must be same shape as self""" - ) - input_col = self._data[self.name] - if isinstance(input_col, cudf.core.column.CategoricalColumn): - if np.isscalar(other): - try: - other = input_col._encode(other) - except ValueError: - # When other is not present in categories, - # fill with Null. 
- other = None - elif hasattr(other, "codes"): - other = other.codes - - input_col = input_col.codes - - result = libcudf.copying.copy_if_else(input_col, other, cond) - - if is_categorical_dtype(self.dtype): - result = build_categorical_column( - categories=self._data[self.name].categories, - codes=as_column(result.base_data, dtype=result.dtype), - mask=result.base_mask, - size=result.size, - offset=result.offset, - ordered=self._data[self.name].ordered, - ) - - if isinstance(self, cudf.Index): - from cudf.core.index import as_index - - result = as_index(result, name=self.name) - else: - result = self._copy_construct(data=result) - - return self._mimic_inplace(result, inplace=inplace) + return cudf.core._internals.where( + frame=self, cond=cond, other=other, inplace=inplace + ) def mask(self, cond, other=None, inplace=False): """ @@ -2735,7 +2520,6 @@ def searchsorted( array([4, 4, 4, 0], dtype=int32) """ # Call libcudf++ search_sorted primitive - from cudf.utils.dtypes import is_scalar scalar_flag = None if is_scalar(values): diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 71a4a48a07a..955519d0b57 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -3923,6 +3923,110 @@ def replace( return self._mimic_inplace(result, inplace=inplace) + def update(self, other): + """ + Modify Series in place using values from passed Series. + Uses non-NA values from passed Series to make updates. Aligns + on index. + + Parameters + ---------- + other : Series, or object coercible into Series + + Examples + -------- + >>> import cudf + >>> s = cudf.Series([1, 2, 3]) + >>> s + 0 1 + 1 2 + 2 3 + dtype: int64 + >>> s.update(cudf.Series([4, 5, 6])) + >>> s + 0 4 + 1 5 + 2 6 + dtype: int64 + >>> s = cudf.Series(['a', 'b', 'c']) + >>> s + 0 a + 1 b + 2 c + dtype: object + >>> s.update(cudf.Series(['d', 'e'], index=[0, 2])) + >>> s + 0 d + 1 b + 2 e + dtype: object + >>> s = cudf.Series([1, 2, 3]) + >>> s + 0 1 + 1 2 + 2 3 + dtype: int64 + >>> s.update(cudf.Series([4, 5, 6, 7, 8])) + >>> s + 0 4 + 1 5 + 2 6 + dtype: int64 + + If ``other`` contains NaNs the corresponding values are not updated + in the original Series. 
+ + >>> s = cudf.Series([1, 2, 3]) + >>> s + 0 1 + 1 2 + 2 3 + dtype: int64 + >>> s.update(cudf.Series([4, np.nan, 6], nan_as_null=False)) + >>> s + 0 4 + 1 2 + 2 6 + dtype: int64 + + ``other`` can also be a non-Series object type + that is coercible into a Series + + >>> s = cudf.Series([1, 2, 3]) + >>> s + 0 1 + 1 2 + 2 3 + dtype: int64 + >>> s.update([4, np.nan, 6]) + >>> s + 0 4 + 1 2 + 2 6 + dtype: int64 + >>> s = cudf.Series([1, 2, 3]) + >>> s + 0 1 + 1 2 + 2 3 + dtype: int64 + >>> s.update({1: 9}) + >>> s + 0 1 + 1 9 + 2 3 + dtype: int64 + """ + + if not isinstance(other, cudf.Series): + other = cudf.Series(other) + + if not self.index.equals(other.index): + other = other.reindex(index=self.index) + mask = other.notna() + + self.mask(mask, other, inplace=True) + def reverse(self): """ Reverse the Series diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index d72b88f1713..f068d02d575 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -8215,9 +8215,6 @@ def test_agg_for_dataframe_with_string_columns(aggs): @pytest.mark.parametrize( "overwrite", [True, False], ) -@pytest.mark.parametrize( - "filter_func", [None], -) @pytest.mark.parametrize( "errors", ["ignore"], ) @@ -8262,19 +8259,17 @@ def test_agg_for_dataframe_with_string_columns(aggs): }, ], ) -def test_update_for_dataframes( - data, data2, join, overwrite, filter_func, errors -): +def test_update_for_dataframes(data, data2, join, overwrite, errors): pdf = pd.DataFrame(data) gdf = cudf.DataFrame(data) other_pd = pd.DataFrame(data2) other_gd = cudf.DataFrame(data2) - expect = pdf.update(other_pd, join, overwrite, filter_func, errors) - got = gdf.update(other_gd, join, overwrite, filter_func, errors) + pdf.update(other=other_pd, join=join, overwrite=overwrite, errors=errors) + gdf.update(other=other_gd, join=join, overwrite=overwrite, errors=errors) - assert_eq(expect, got) + assert_eq(pdf, gdf, check_dtype=False) @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py index e7baa4ee926..65ce2a79992 100644 --- a/python/cudf/cudf/tests/test_replace.py +++ b/python/cudf/cudf/tests/test_replace.py @@ -709,25 +709,40 @@ def test_series_where(data_dtype, fill_value): sr.where(sr > 0, fill_value) else: # Cast back to original dtype as pandas automatically upcasts - expect = psr.where(psr > 0, fill_value).astype(psr.dtype) + expect = psr.where(psr > 0, fill_value) got = sr.where(sr > 0, fill_value) - assert_eq(expect, got) + # pandas returns 'float16' dtype, which is not supported in cudf + assert_eq( + expect, + got, + check_dtype=False if expect.dtype.kind in ("f") else True, + ) if sr.dtype.type(fill_value) != fill_value: with pytest.raises(TypeError): sr.where(sr < 0, fill_value) else: - expect = psr.where(psr < 0, fill_value).astype(psr.dtype) + expect = psr.where(psr < 0, fill_value) got = sr.where(sr < 0, fill_value) - assert_eq(expect, got) + # pandas returns 'float16' dtype, which is not supported in cudf + assert_eq( + expect, + got, + check_dtype=False if expect.dtype.kind in ("f") else True, + ) if sr.dtype.type(fill_value) != fill_value: with pytest.raises(TypeError): sr.where(sr == 0, fill_value) else: - expect = psr.where(psr == 0, fill_value).astype(psr.dtype) + expect = psr.where(psr == 0, fill_value) got = sr.where(sr == 0, fill_value) - assert_eq(expect, got) + # pandas returns 'float16' dtype, which is not supported in cudf + assert_eq( + expect, + got, + 
check_dtype=False if expect.dtype.kind in ("f") else True, + ) @pytest.mark.parametrize("fill_value", [100, 100.0, 100.5]) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index beda14934ca..0dc53fa29e9 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -921,6 +921,42 @@ def custom_add_func(sr, val): ) +@pytest.mark.parametrize( + "data", + [cudf.Series([1, 2, 3]), cudf.Series([10, 11, 12], index=[1, 2, 3])], +) +@pytest.mark.parametrize( + "other", + [ + cudf.Series([4, 5, 6]), + cudf.Series([4, 5, 6, 7, 8]), + cudf.Series([4, np.nan, 6], nan_as_null=False), + [4, np.nan, 6], + {1: 9}, + ], +) +def test_series_update(data, other): + gs = data.copy(deep=True) + if isinstance(other, cudf.Series): + g_other = other.copy(deep=True) + p_other = g_other.to_pandas() + else: + g_other = other + p_other = other + + ps = gs.to_pandas() + + gs_column_before = gs._column + gs.update(g_other) + gs_column_after = gs._column + + assert_eq(gs_column_before.to_array(), gs_column_after.to_array()) + + ps.update(p_other) + + assert_eq(gs, ps) + + @pytest.mark.parametrize( "data", [ @@ -942,6 +978,19 @@ def test_fillna_with_nan(data, nan_as_null, fill_value): assert_eq(expected, actual) +def test_series_mask_mixed_dtypes_error(): + s = cudf.Series(["a", "b", "c"]) + with pytest.raises( + TypeError, + match=re.escape( + "cudf does not support mixed types, please type-cast " + "the column of dataframe/series and other " + "to same dtypes." + ), + ): + s.where([True, False, True], [1, 2, 3]) + + @pytest.mark.parametrize( "ps", [ diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 8af225ecb58..be2b1bca2e0 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -345,7 +345,7 @@ def to_cudf_compatible_scalar(val, dtype=None): if not is_scalar(val): raise ValueError( f"Cannot convert value of type {type(val).__name__} " - " to cudf scalar" + "to cudf scalar" ) if isinstance(val, (np.ndarray, cp.ndarray)) and val.ndim == 0: @@ -637,6 +637,11 @@ def find_common_type(dtypes): # Aggregate same types dtypes = set(dtypes) + if any(is_decimal_dtype(dtype) for dtype in dtypes): + raise NotImplementedError( + "DecimalDtype is not yet supported in find_common_type" + ) + # Corner case 1: # Resort to np.result_type to handle "M" and "m" types separately dt_dtypes = set(filter(lambda t: is_datetime_dtype(t), dtypes)) @@ -651,7 +656,64 @@ def find_common_type(dtypes): dtypes = dtypes - td_dtypes dtypes.add(np.result_type(*td_dtypes)) - return np.find_common_type(list(dtypes), []) + common_dtype = np.find_common_type(list(dtypes), []) + if common_dtype == np.dtype("float16"): + # cuDF does not support float16 dtype + return np.dtype("float32") + else: + return common_dtype + + +def _can_cast(from_dtype, to_dtype): + """ + Utility function to determine if we can cast + from `from_dtype` to `to_dtype`. This function primarily calls + `np.can_cast` but with some special handling around + cudf specific dtypes. 
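+
+ Note: the decimal and list branches are structural checks only;
+ decimal precision/scale and list nesting levels are not yet
+ validated (see the TODO comments below).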
+ """ + if isinstance(from_dtype, type): + from_dtype = np.dtype(from_dtype) + if isinstance(to_dtype, type): + to_dtype = np.dtype(to_dtype) + + # TODO : Add precision & scale checking for + # decimal types in future + if isinstance(from_dtype, cudf.core.dtypes.Decimal64Dtype): + if isinstance(to_dtype, cudf.core.dtypes.Decimal64Dtype): + return True + elif isinstance(to_dtype, np.dtype): + if to_dtype.kind in {"i", "f", "u", "U", "O"}: + return True + else: + return False + elif isinstance(from_dtype, np.dtype): + if isinstance(to_dtype, np.dtype): + return np.can_cast(from_dtype, to_dtype) + elif isinstance(to_dtype, cudf.core.dtypes.Decimal64Dtype): + if from_dtype.kind in {"i", "f", "u", "U", "O"}: + return True + else: + return False + elif isinstance(to_dtype, cudf.core.types.CategoricalDtype): + return True + else: + return False + elif isinstance(from_dtype, cudf.core.dtypes.ListDtype): + # TODO: Add level based checks too once casting of + # list columns is supported + if isinstance(to_dtype, cudf.core.dtypes.ListDtype): + return np.can_cast(from_dtype.leaf_type, to_dtype.leaf_type) + else: + return False + elif isinstance(from_dtype, cudf.core.dtypes.CategoricalDtype): + if isinstance(to_dtype, cudf.core.dtypes.CategoricalDtype): + return True + elif isinstance(to_dtype, np.dtype): + return np.can_cast(from_dtype._categories.dtype, to_dtype) + else: + return False + else: + return np.can_cast(from_dtype, to_dtype) # Type dispatch loops similar to what are found in `np.add.types`